Add a SARIF output formatter

This commit adds a formatter that outputs JSON in the SARIF format, following the spec at [1]. The code is largely adapted from an existing implementation found at [2]. SARIF is very useful for integrating with ecosystems such as GitHub Actions.

[1] https://docs.oasis-open.org/sarif/sarif/v2.1.0/cs01/sarif-v2.1.0-cs01.html
[2] https://github.com/microsoft/bandit-sarif-formatter

Closes #646

Signed-off-by: Eric Brown <eric_wade_brown@yahoo.com>
PyCQA · Mar 5, 2024 · e98d3c2 · e98d3c2
1 parent a682a18
commit e98d3c2
Show file tree

Hide file tree

Showing 7 changed files with 384 additions and 1 deletion.
diff --git a/bandit/__init__.py b/bandit/__init__.py
@@ -16,4 +16,5 @@
 from bandit.core.issue import *  # noqa
 from bandit.core.test_properties import *  # noqa
 
+__author__ = metadata.metadata("bandit")["Author"]
 __version__ = metadata.version("bandit")
diff --git a/bandit/formatters/sarif.py b/bandit/formatters/sarif.py
@@ -0,0 +1,365 @@
+# Copyright (c) Microsoft.  All Rights Reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Note: this code mostly incorporated from
+# https://github.com/microsoft/bandit-sarif-formatter
+#
+r"""
+===============
+SARIF formatter
+===============
+
+This formatter outputs the issues in SARIF formatted JSON.
+
+:Example:
+
+.. code-block:: javascript
+
+    {
+      "runs": [
+        {
+          "tool": {
+            "driver": {
+              "name": "Bandit",
+              "organization": "PyCQA",
+              "rules": [
+                {
+                  "id": "B101",
+                  "name": "assert_used",
+                  "properties": {
+                    "tags": [
+                      "security",
+                      "external/cwe/cwe-703"
+                    ],
+                    "precision": "high"
+                  },
+                  "helpUri": "https://bandit.readthedocs.io/en/1.7.8/plugins/b101_assert_used.html"
+                }
+              ],
+              "version": "1.7.8",
+              "semanticVersion": "1.7.8"
+            }
+          },
+          "invocations": [
+            {
+              "executionSuccessful": true,
+              "endTimeUtc": "2024-03-05T03:28:48Z"
+            }
+          ],
+          "properties": {
+            "metrics": {
+              "_totals": {
+                "loc": 1,
+                "nosec": 0,
+                "skipped_tests": 0,
+                "SEVERITY.UNDEFINED": 0,
+                "CONFIDENCE.UNDEFINED": 0,
+                "SEVERITY.LOW": 1,
+                "CONFIDENCE.LOW": 0,
+                "SEVERITY.MEDIUM": 0,
+                "CONFIDENCE.MEDIUM": 0,
+                "SEVERITY.HIGH": 0,
+                "CONFIDENCE.HIGH": 1
+              },
+              "./examples/assert.py": {
+                "loc": 1,
+                "nosec": 0,
+                "skipped_tests": 0,
+                "SEVERITY.UNDEFINED": 0,
+                "SEVERITY.LOW": 1,
+                "SEVERITY.MEDIUM": 0,
+                "SEVERITY.HIGH": 0,
+                "CONFIDENCE.UNDEFINED": 0,
+                "CONFIDENCE.LOW": 0,
+                "CONFIDENCE.MEDIUM": 0,
+                "CONFIDENCE.HIGH": 1
+              }
+            }
+          },
+          "results": [
+            {
+              "message": {
+                "text": "Use of assert detected. The enclosed code will be removed when compiling to optimised byte code."
+              },
+              "level": "note",
+              "locations": [
+                {
+                  "physicalLocation": {
+                    "region": {
+                      "snippet": {
+                        "text": "assert True\n"
+                      },
+                      "endColumn": 12,
+                      "endLine": 1,
+                      "startColumn": 1,
+                      "startLine": 1
+                    },
+                    "artifactLocation": {
+                      "uri": "examples/assert.py"
+                    },
+                    "contextRegion": {
+                      "snippet": {
+                        "text": "assert True\n"
+                      },
+                      "endLine": 1,
+                      "startLine": 1
+                    }
+                  }
+                }
+              ],
+              "properties": {
+                "issue_confidence": "HIGH",
+                "issue_severity": "LOW"
+              },
+              "ruleId": "B101",
+              "ruleIndex": 0
+            }
+          ]
+        }
+      ],
+      "version": "2.1.0",
+      "$schema": "https://json.schemastore.org/sarif-2.1.0.json"
+    }
+
+.. versionadded:: 1.7.8
+
+"""  # noqa: E501
+import logging
+import pathlib
+import sys
+import urllib.parse as urlparse
+from datetime import datetime
+
+import sarif_om as om
+from jschema_to_python.to_json import to_json
+
+import bandit
+from bandit.core import docs_utils
+
+LOG = logging.getLogger(__name__)
+SCHEMA_URI = "https://json.schemastore.org/sarif-2.1.0.json"
+TS_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
+
+
+def report(manager, fileobj, sev_level, conf_level, lines=-1):
+    """Prints issues in SARIF format
+
+    Builds a single-run SARIF 2.1.0 log holding the tool description, one
+    invocation record and the manager's metrics, adds a notification per
+    skipped file and the results for the filtered issues, then writes the
+    log to ``fileobj`` as JSON.
+
+    :param manager: the bandit manager object
+    :param fileobj: The output file object, which may be sys.stdout
+    :param sev_level: Filtering severity level
+    :param conf_level: Filtering confidence level
+    :param lines: Number of lines to report, -1 for all (currently not
+        used by this formatter)
+    """
+    # Imported locally so the module-level import block stays untouched.
+    from datetime import timezone
+
+    # datetime.utcnow() is deprecated as of Python 3.12; an aware UTC "now"
+    # renders identically through TS_FORMAT.
+    end_time_utc = datetime.now(timezone.utc).strftime(TS_FORMAT)
+
+    log = om.SarifLog(
+        schema_uri=SCHEMA_URI,
+        version="2.1.0",
+        runs=[
+            om.Run(
+                tool=om.Tool(
+                    driver=om.ToolComponent(
+                        name="Bandit",
+                        organization=bandit.__author__,
+                        semantic_version=bandit.__version__,
+                        version=bandit.__version__,
+                    )
+                ),
+                invocations=[
+                    om.Invocation(
+                        end_time_utc=end_time_utc,
+                        execution_successful=True,
+                    )
+                ],
+                properties={"metrics": manager.metrics.data},
+            )
+        ],
+    )
+
+    run = log.runs[0]
+    invocation = run.invocations[0]
+
+    # Files that could not be scanned are reported as notifications on the
+    # invocation rather than as results.
+    skips = manager.get_skipped()
+    add_skipped_file_notifications(skips, invocation)
+
+    issues = manager.get_issue_list(sev_level=sev_level, conf_level=conf_level)
+
+    add_results(issues, run)
+
+    serialized_log = to_json(log)
+
+    with fileobj:
+        fileobj.write(serialized_log)
+
+    # Only mention the destination when it is not standard output.
+    if fileobj.name != sys.stdout.name:
+        LOG.info("SARIF output written to file: %s", fileobj.name)
+
+
+def add_skipped_file_notifications(skips, invocation):
+    """Record every skipped file as a tool configuration notification.
+
+    Does nothing when ``skips`` is None or empty.
+
+    :param skips: (file name, reason) pairs for the skipped files, or None
+    :param invocation: SARIF invocation object whose
+        ``tool_configuration_notifications`` list receives one "error"
+        level notification per skipped file
+    """
+    if not skips:
+        return
+
+    if invocation.tool_configuration_notifications is None:
+        invocation.tool_configuration_notifications = []
+    notifications = invocation.tool_configuration_notifications
+
+    for file_name, reason in skips:
+        artifact = om.ArtifactLocation(uri=to_uri(file_name))
+        where = om.Location(
+            physical_location=om.PhysicalLocation(artifact_location=artifact)
+        )
+        notifications.append(
+            om.Notification(
+                level="error",
+                message=om.Message(text=reason),
+                locations=[where],
+            )
+        )
+
+
+def add_results(issues, run):
+    """Append one SARIF result per issue and publish the rules they use.
+
+    :param issues: iterable of bandit issues to convert
+    :param run: SARIF run object; ``run.results`` is created if missing and
+        ``run.tool.driver.rules`` is set only when at least one rule was
+        collected
+    """
+    if run.results is None:
+        run.results = []
+
+    # Both mappings are keyed by rule id and filled in by create_result.
+    rules_by_id = {}
+    index_by_id = {}
+    for issue in issues:
+        run.results.append(create_result(issue, rules_by_id, index_by_id))
+
+    if rules_by_id:
+        run.tool.driver.rules = list(rules_by_id.values())
+
+
+def create_result(issue, rules, rule_indices):
+    """Convert a single bandit issue into a SARIF result.
+
+    :param issue: issue object exposing ``as_dict()``
+    :param rules: mapping of rule id to rule object, extended when the
+        issue's rule has not been seen before
+    :param rule_indices: mapping of rule id to the rule's index
+    :returns: the SARIF result describing the issue
+    """
+    details = issue.as_dict()
+    rule, rule_index = create_or_find_rule(details, rules, rule_indices)
+
+    artifact = om.ArtifactLocation(uri=to_uri(details["filename"]))
+    location = om.PhysicalLocation(artifact_location=artifact)
+
+    # Fills in location.region and location.context_region in place.
+    add_region_and_context_region(
+        location,
+        details["line_range"],
+        details["col_offset"],
+        details["end_col_offset"],
+        details["code"],
+    )
+
+    return om.Result(
+        rule_id=rule.id,
+        rule_index=rule_index,
+        message=om.Message(text=details["issue_text"]),
+        level=level_from_severity(details["issue_severity"]),
+        locations=[om.Location(physical_location=location)],
+        properties={
+            "issue_confidence": details["issue_confidence"],
+            "issue_severity": details["issue_severity"],
+        },
+    )
+
+
+def level_from_severity(severity):
+    """Map a bandit severity name to a SARIF result level.
+
+    :param severity: severity name such as "HIGH", "MEDIUM" or "LOW"
+    :returns: "error" for HIGH, "note" for LOW and "warning" for MEDIUM
+        or any other value
+    """
+    if severity == "HIGH":
+        return "error"
+    if severity == "LOW":
+        return "note"
+    # MEDIUM and every unrecognised severity share the same level.
+    return "warning"
+
+
+def add_region_and_context_region(
+    physical_location, line_range, col_offset, end_col_offset, code
+):
+    """Attach the issue region and its surrounding context region.
+
+    :param physical_location: SARIF physical location; its ``region`` and
+        ``context_region`` attributes are assigned here
+    :param line_range: non-empty sequence of the line numbers covered by
+        the issue (assumed ordered, so the first and last entries are the
+        start and end lines)
+    :param col_offset: start column offset; 1 is added to form the SARIF
+        start column
+    :param end_col_offset: end column offset; 1 is added to form the SARIF
+        end column
+    :param code: numbered code excerpt, as accepted by ``parse_code``
+    """
+    first_line_number, snippet_lines = parse_code(code)
+    start_line = line_range[0]
+
+    # The excerpt may start before the issue, so index relative to the
+    # excerpt's first line to find the issue's own first line.
+    snippet_line = snippet_lines[start_line - first_line_number]
+
+    physical_location.region = om.Region(
+        start_line=start_line,
+        # The region ends on the last covered line. Indexing [1] would
+        # report the second line for ranges longer than two lines.
+        end_line=line_range[-1],
+        start_column=col_offset + 1,
+        end_column=end_col_offset + 1,
+        snippet=om.ArtifactContent(text=snippet_line),
+    )
+
+    # The context region spans the whole excerpt.
+    physical_location.context_region = om.Region(
+        start_line=first_line_number,
+        end_line=first_line_number + len(snippet_lines) - 1,
+        snippet=om.ArtifactContent(text="".join(snippet_lines)),
+    )
+
+
+def parse_code(code):
+    """Split a numbered code excerpt into its start line and source lines.
+
+    Every line of ``code`` is expected to look like ``"<number> <text>"``:
+    a line number, a single space, then the source text.
+
+    :param code: the numbered excerpt, with or without a trailing newline
+    :returns: tuple ``(first_line_number, snippet_lines)`` where
+        ``snippet_lines`` holds the source text of each line with the
+        number prefix removed; every line ends in a newline, except the
+        last one when ``code`` itself did not end in a newline
+    :raises ValueError: if ``code`` contains no lines, or the first line
+        does not start with an integer
+    :raises IndexError: if a line has no space after its number
+    """
+    code_lines = code.split("\n")
+
+    # A trailing newline leaves one empty element at the end of the split;
+    # it is not a real line, so drop it but remember that it was there.
+    ends_in_newline = not code_lines[-1]
+    if ends_in_newline:
+        code_lines.pop()
+
+    if not code_lines:
+        raise ValueError("cannot parse an empty code excerpt")
+
+    first_line_number = int(code_lines[0].split(" ", 1)[0])
+    snippet_lines = [line.split(" ", 1)[1] + "\n" for line in code_lines]
+
+    if not ends_in_newline:
+        # Undo the newline added above to the final, unterminated line.
+        snippet_lines[-1] = snippet_lines[-1][:-1]
+
+    return first_line_number, snippet_lines
+
+
+def create_or_find_rule(issue_dict, rules, rule_indices):
+    """Return the SARIF rule for an issue's test, creating it on first use.
+
+    :param issue_dict: dictionary form of the issue; reads ``test_id``,
+        ``test_name``, ``issue_cwe`` and ``issue_confidence``
+    :param rules: mapping of rule id to rule object; a newly created rule
+        is added to it
+    :param rule_indices: mapping of rule id to the rule's index; updated
+        together with ``rules``
+    :returns: tuple ``(rule, index)``
+    """
+    rule_id = issue_dict["test_id"]
+    if rule_id in rules:
+        return rules[rule_id], rule_indices[rule_id]
+
+    tags = ["security"]
+    # Only emit a CWE tag when an id is present; formatting a missing id
+    # would produce the meaningless tag "external/cwe/cwe-None".
+    cwe_id = issue_dict["issue_cwe"].get("id")
+    if cwe_id is not None:
+        tags.append(f"external/cwe/cwe-{cwe_id}")
+
+    rule = om.ReportingDescriptor(
+        id=rule_id,
+        name=issue_dict["test_name"],
+        help_uri=docs_utils.get_url(rule_id),
+        properties={
+            "tags": tags,
+            "precision": issue_dict["issue_confidence"].lower(),
+        },
+    )
+
+    # A new rule is indexed by the number of rules collected before it.
+    index = len(rules)
+    rules[rule_id] = rule
+    rule_indices[rule_id] = index
+    return rule, index
+
+
+def to_uri(file_path):
+    """Convert a file path into a URI string.
+
+    :param file_path: absolute or relative file path
+    :returns: the ``PurePath.as_uri()`` form for an absolute path;
+        otherwise the path with forward slashes and special characters
+        %-encoded
+    """
+    path = pathlib.PurePath(file_path)
+    if path.is_absolute():
+        return path.as_uri()
+
+    # as_posix() swaps backslashes for slashes before the %-encoding.
+    return urlparse.quote(path.as_posix())
diff --git a/doc/source/formatters/sarif.rst b/doc/source/formatters/sarif.rst
@@ -0,0 +1,5 @@
+-----
+sarif
+-----
+
+.. automodule:: bandit.formatters.sarif
diff --git a/doc/source/man/bandit.rst b/doc/source/man/bandit.rst
@@ -44,7 +44,7 @@ OPTIONS
                         (-l for LOW, -ll for MEDIUM, -lll for HIGH)
   -i, --confidence      report only issues of a given confidence level or
                         higher (-i for LOW, -ii for MEDIUM, -iii for HIGH)
-  -f {csv,custom,html,json,screen,txt,xml,yaml}, --format {csv,custom,html,json,screen,txt,xml,yaml}
+  -f {csv,custom,html,json,sarif,screen,txt,xml,yaml}, --format {csv,custom,html,json,sarif,screen,txt,xml,yaml}
                         specify output format
   --msg-template MSG_TEMPLATE
                         specify output message template (only usable with