blob: f363dd378f46e706b87d550c0ad227dea3c34d1d [file] [log] [blame]
# @file analyze_filter.py
#
# Filters results in a SARIF file.
#
# Apache License
# Version 2.0, January 2004
# http://www.apache.org/licenses/
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This file has been altered from its original form. Based on code in:
# https://github.com/advanced-security/filter-sarif
#
# It primarily contains modifications made to integrate with the CodeQL plugin.
#
# Specifically:
# https://github.com/advanced-security/filter-sarif/blob/main/filter_sarif.py
#
# View the full and complete license as provided by that repository here:
# https://github.com/advanced-security/filter-sarif/blob/main/LICENSE
#
# SPDX-License-Identifier: Apache-2.0
##
import json
import logging
import re
from os import PathLike
from typing import Iterable, List, Tuple
from analyze.globber import match
def _match_path_and_rule(
        path: str,
        rule: str,
        patterns: Iterable[Tuple[bool, str, str]]) -> bool:
    """Returns whether a (path, rule) pair is selected by the patterns.

    Every pattern is evaluated in order. Whenever both the rule pattern
    matches `rule` and the file pattern matches `path`, that pattern's sign
    becomes the current answer, so the last matching pattern wins. When no
    pattern matches at all, the pair is selected (True).

    Args:
        path (str): A file path string.
        rule (str): A rule identifier string.
        patterns (Iterable[Tuple[bool, str, str]]): Parsed patterns as
            (sign, file pattern, rule pattern) tuples, in the shape returned
            by _parse_pattern().

    Returns:
        bool: The sign of the last pattern matching both the path and the
              rule, or True if no pattern matches.
    """
    # Default to "selected" so that results not mentioned by any pattern
    # are preserved.
    selected = True

    for sign, file_pattern, rule_pattern in patterns:
        # Pattern matching itself is delegated to analyze.globber.match().
        if match(rule_pattern, rule) and match(file_pattern, path):
            # Deliberately no early exit: later patterns override earlier
            # ones.
            selected = sign

    return selected
def _parse_pattern(line: str) -> Tuple[bool, str, str]:
    """Parses a given pattern line.

    A line has the form ``[+|-]<file pattern>[:<rule pattern>]``:

      - A leading '-' makes the pattern negative (sign False). A leading '+'
        or no prefix makes it positive (sign True).
      - The first unescaped ':' separates the file pattern from the rule
        pattern. A second unescaped ':' is an error.
      - A backslash followed by '+', '-', a backslash, or ':' produces that
        character literally. A backslash in front of any other character
        (or at the end of the line) is kept as-is.
      - If the rule pattern is missing or empty, it defaults to '**'.

    Args:
        line (str): The line string that contains the rule.

    Returns:
        Tuple[bool, str, str]: The parsed sign, file pattern, and rule
                               pattern from the line.

    Raises:
        ValueError: The line contains more than one unescaped separator.
    """
    sep_char = ':'
    esc_char = '\\'
    escapable = ('+', '-', esc_char, sep_char)

    # Inclusion or exclusion pattern? Only the very first character is
    # treated as a sign.
    sign = True
    body = line
    if line.startswith('-'):
        sign = False
        body = line[1:]
    elif line.startswith('+'):
        body = line[1:]

    # Characters are collected in lists and joined once at the end instead
    # of repeatedly concatenating strings.
    file_chars = []
    rule_chars = []
    target = file_chars
    seen_separator = False

    i = 0
    while i < len(body):
        c = body[i]
        i += 1

        if c == sep_char:
            if seen_separator:
                raise ValueError(
                    'Invalid pattern: "' + line + '" Contains more than one '
                    'separator!')
            seen_separator = True
            # Everything after the separator belongs to the rule pattern.
            target = rule_chars
            continue

        if c == esc_char and i < len(body) and body[i] in escapable:
            # Consume the escaped character and emit it literally.
            c = body[i]
            i += 1

        target.append(c)

    file_pattern = ''.join(file_chars)
    # No rule pattern given means "all rules".
    rule_pattern = ''.join(rule_chars) or '**'

    return sign, file_pattern, rule_pattern
def filter_sarif(input_sarif: PathLike,
                 output_sarif: PathLike,
                 patterns: List[str],
                 split_lines: bool) -> None:
    """Filters a SARIF file with a given set of filter patterns.

    Each result location whose file URI and rule id are rejected by the
    patterns is dropped. A result that had locations and loses all of them
    is dropped from its run. Locations without a URI, results without a
    determinable rule id, and results without any locations cannot be
    matched against the patterns and are therefore kept unchanged.

    Args:
        input_sarif (PathLike): Input SARIF file path.
        output_sarif (PathLike): Output SARIF file path.
        patterns (List[str]): List of filter pattern strings.
        split_lines (bool): Whether each entry in `patterns` may contain
            several newline-separated patterns that need to be split into
            individual patterns.

    Raises:
        ValueError: A pattern contains more than one unescaped separator
            (raised by the pattern parser; older revisions of the parser
            raise a plain Exception instead).
    """
    if split_lines:
        patterns = [line
                    for pattern in patterns
                    for line in re.split('\r?\n', pattern)]

    # Empty strings (e.g. blank lines) are not patterns.
    parsed_patterns = [_parse_pattern(p) for p in patterns if p]

    logging.debug('Given patterns:')
    for sign, file_pattern, rule_pattern in parsed_patterns:
        logging.debug(
            'files: {file_pattern} rules: {rule_pattern} ({sign})'.format(
                file_pattern=file_pattern,
                rule_pattern=rule_pattern,
                sign='positive' if sign else 'negative'))

    # SARIF files are JSON encoded as UTF-8. Do not depend on the locale's
    # default encoding, which is not UTF-8 on every platform.
    with open(input_sarif, 'r', encoding='utf-8') as f:
        sarif = json.load(f)

    for run in sarif.get('runs', []):
        if not run.get('results', []):
            continue

        kept_results = []
        for result in run['results']:
            if not result.get('locations', []):
                # locations array doesn't exist or is empty, so we can't
                # match on anything. Therefore, we include the result in
                # the output.
                kept_results.append(result)
                continue

            # The ruleId field is optional. Fall back to the id of the
            # "rule" reference object if one is given.
            # TODO: If neither is present, the rule might still be
            #       resolvable from the rule metadata via the ruleIndex
            #       field.
            #       (see https://github.com/microsoft/sarif-tutorials/blob/main/docs/2-Basics.md#rule-metadata)
            rule_id = result.get('ruleId')
            if rule_id is None:
                rule_id = result.get('rule', {}).get('id')

            kept_locations = []
            for location in result['locations']:
                # TODO: The uri field is optional. We might have to
                #       fetch the actual uri from "artifacts" via
                #       "index"
                #       (see https://github.com/microsoft/sarif-tutorials/blob/main/docs/2-Basics.md#-linking-results-to-artifacts)
                uri = location.get(
                    'physicalLocation', {}).get(
                        'artifactLocation', {}).get(
                            'uri', None)

                # Without both a uri and a rule id there is nothing to
                # match against, so the location is kept.
                if (uri is None or
                        rule_id is None or
                        _match_path_and_rule(uri, rule_id, parsed_patterns)):
                    kept_locations.append(location)

            result['locations'] = kept_locations
            if kept_locations:
                kept_results.append(result)

        run['results'] = kept_results

    with open(output_sarif, 'w', encoding='utf-8') as f:
        json.dump(sarif, f, indent=2)