# @file analyze_filter.py | |
# | |
# Filters results in a SARIF file. | |
# | |
# Apache License | |
# Version 2.0, January 2004 | |
# http://www.apache.org/licenses/ | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
# | |
# This file has been altered from its original form. Based on code in: | |
# https://github.com/advanced-security/filter-sarif | |
# | |
# It primarily contains modifications made to integrate with the CodeQL plugin. | |
# | |
# Specifically: | |
# https://github.com/advanced-security/filter-sarif/blob/main/filter_sarif.py | |
# | |
# View the full and complete license as provided by that repository here: | |
# https://github.com/advanced-security/filter-sarif/blob/main/LICENSE | |
# | |
# SPDX-License-Identifier: Apache-2.0 | |
## | |
import json | |
import logging | |
import re | |
from os import PathLike | |
from typing import Iterable, List, Tuple | |
from analyze.globber import match | |
def _match_path_and_rule( | |
path: str, rule: str, patterns: Iterable[str]) -> bool: | |
"""Returns whether a given path matches a given rule. | |
Args: | |
path (str): A file path string. | |
rule (str): A rule file path string. | |
patterns (Iterable[str]): An iterable of pattern strings. | |
Returns: | |
bool: True if the path matches a rule. Otherwise, False. | |
""" | |
result = True | |
for s, fp, rp in patterns: | |
if match(rp, rule) and match(fp, path): | |
result = s | |
return result | |
def _parse_pattern(line: str) -> Tuple[str]: | |
"""Parses a given pattern line. | |
Args: | |
line (str): The line string that contains the rule. | |
Returns: | |
Tuple[str]: The parsed sign, file pattern, and rule pattern from the | |
line. | |
""" | |
sep_char = ':' | |
esc_char = '\\' | |
file_pattern = '' | |
rule_pattern = '' | |
seen_separator = False | |
sign = True | |
# inclusion or exclusion pattern? | |
u_line = line | |
if line: | |
if line[0] == '-': | |
sign = False | |
u_line = line[1:] | |
elif line[0] == '+': | |
u_line = line[1:] | |
i = 0 | |
while i < len(u_line): | |
c = u_line[i] | |
i = i + 1 | |
if c == sep_char: | |
if seen_separator: | |
raise Exception( | |
'Invalid pattern: "' + line + '" Contains more than one ' | |
'separator!') | |
seen_separator = True | |
continue | |
elif c == esc_char: | |
next_c = u_line[i] if (i < len(u_line)) else None | |
if next_c in ['+' , '-', esc_char, sep_char]: | |
i = i + 1 | |
c = next_c | |
if seen_separator: | |
rule_pattern = rule_pattern + c | |
else: | |
file_pattern = file_pattern + c | |
if not rule_pattern: | |
rule_pattern = '**' | |
return sign, file_pattern, rule_pattern | |
def filter_sarif(input_sarif: PathLike,
                 output_sarif: PathLike,
                 patterns: List[str],
                 split_lines: bool) -> None:
    """Filters a SARIF file with a given set of filter patterns.

    Locations whose uri/ruleId pair is excluded by the patterns are
    removed from each result; a result whose locations all get filtered
    out is dropped. Results without any locations (or whose locations
    carry no uri) are always kept.

    Args:
        input_sarif (PathLike): Input SARIF file path.
        output_sarif (PathLike): Output SARIF file path.
        patterns (List[str]): List of filter pattern strings.
        split_lines (bool): Whether to split lines in individual patterns.
    """
    if split_lines:
        # Each entry may itself contain several newline-separated patterns.
        patterns = [p for entry in patterns for p in re.split(r'\r?\n', entry)]
    parsed_patterns = [_parse_pattern(p) for p in patterns if p]

    logging.debug('Given patterns:')
    for sign, file_pattern, rule_pattern in parsed_patterns:
        logging.debug(
            'files: {file_pattern} rules: {rule_pattern} ({sign})'.format(
                file_pattern=file_pattern,
                rule_pattern=rule_pattern,
                sign='positive' if sign else 'negative'))

    with open(input_sarif, 'r') as f:
        sarif = json.load(f)

    for run in sarif.get('runs', []):
        if not run.get('results', []):
            continue
        new_results = []
        for result in run['results']:
            if result.get('locations', []):
                # TODO: The ruleId field is optional and potentially
                #       ambiguous. We might have to fetch the actual
                #       ruleId from the rule metadata via the ruleIndex
                #       field.
                #       (see https://github.com/microsoft/sarif-tutorials/blob/main/docs/2-Basics.md#rule-metadata)
                # ruleId is per-result, so look it up once outside the
                # location loop.
                rule_id = result['ruleId']
                new_locations = []
                for location in result['locations']:
                    # TODO: The uri field is optional. We might have to
                    #       fetch the actual uri from "artifacts" via
                    #       "index"
                    #       (see https://github.com/microsoft/sarif-tutorials/blob/main/docs/2-Basics.md#-linking-results-to-artifacts)
                    uri = location.get(
                        'physicalLocation', {}).get(
                            'artifactLocation', {}).get(
                                'uri', None)
                    # A location without a uri cannot be matched, so it is
                    # kept unconditionally.
                    if (uri is None or
                            _match_path_and_rule(uri, rule_id, parsed_patterns)):
                        new_locations.append(location)
                result['locations'] = new_locations
                if new_locations:
                    new_results.append(result)
            else:
                # locations array doesn't exist or is empty, so we can't
                # match on anything. Therefore, we include the result in
                # the output.
                new_results.append(result)
        run['results'] = new_results

    with open(output_sarif, 'w') as f:
        json.dump(sarif, f, indent=2)