sarif-parsing by trailofbits/skills
npx skills add https://github.com/trailofbits/skills --skill sarif-parsing

您是一位 SARIF 解析专家。您的职责是帮助用户有效地读取、分析和处理来自静态分析工具的 SARIF 文件。
在以下情况时使用此技能:
请勿在以下情况使用此技能:
SARIF 2.1.0 是当前的 OASIS 标准。每个 SARIF 文件都具有以下层次结构:
sarifLog
├── version: "2.1.0"
├── $schema: (optional, enables IDE validation)
└── runs[] (array of analysis runs)
├── tool
│ ├── driver
│ │ ├── name (required)
│ │ ├── version
│ │ └── rules[] (rule definitions)
│ └── extensions[] (plugins)
├── results[] (findings)
│ ├── ruleId
│ ├── level (error/warning/note)
│ ├── message.text
│ ├── locations[]
│ │ └── physicalLocation
│ │ ├── artifactLocation.uri
│ │ └── region (startLine, startColumn, etc.)
│ ├── fingerprints{}
│ └── partialFingerprints{}
└── artifacts[] (scanned files metadata)
广告位招租
在这里展示您的产品或服务
触达数万 AI 开发者,精准高效
没有稳定的指纹,就无法跨运行跟踪发现结果:
工具报告不同的路径(/path/to/project/ 与 /github/workspace/),因此基于路径的匹配会失败。指纹对内容(代码片段、规则 ID、相对位置)进行哈希处理,以创建独立于环境的稳定标识符。
| 使用场景 | 工具 | 安装 |
|---|---|---|
| 快速 CLI 查询 | jq | brew install jq / apt install jq |
| Python 脚本(简单) | pysarif | pip install pysarif |
| Python 脚本(高级) | sarif-tools | pip install sarif-tools |
| .NET 应用程序 | SARIF SDK | NuGet 包 |
| JavaScript/Node.js | sarif-js | npm 包 |
| Go 应用程序 | garif | go get github.com/chavacava/garif |
| 验证 | SARIF Validator | sarifweb.azurewebsites.net |
用于快速探索和一次性查询:
# 美化打印文件
jq '.' results.sarif
# 统计总发现数
jq '[.runs[].results[]] | length' results.sarif
# 列出所有触发的规则 ID
jq '[.runs[].results[].ruleId] | unique' results.sarif
# 仅提取错误
jq '.runs[].results[] | select(.level == "error")' results.sarif
# 获取带有文件位置的发现结果
jq '.runs[].results[] | {
rule: .ruleId,
message: .message.text,
file: .locations[0].physicalLocation.artifactLocation.uri,
line: .locations[0].physicalLocation.region.startLine
}' results.sarif
# 按严重性过滤并按规则获取计数
jq '[.runs[].results[] | select(.level == "error")] | group_by(.ruleId) | map({rule: .[0].ruleId, count: length})' results.sarif
# 提取特定文件的发现结果
jq --arg file "src/auth.py" '.runs[].results[] | select(.locations[].physicalLocation.artifactLocation.uri | contains($file))' results.sarif
用于具有完整对象模型的程序化访问:
from pysarif import load_from_file, save_to_file
# 加载 SARIF 文件
sarif = load_from_file("results.sarif")
# 遍历运行和结果
for run in sarif.runs:
tool_name = run.tool.driver.name
print(f"Tool: {tool_name}")
for result in run.results:
print(f" [{result.level}] {result.rule_id}: {result.message.text}")
if result.locations:
loc = result.locations[0].physical_location
if loc and loc.artifact_location:
print(f" File: {loc.artifact_location.uri}")
if loc.region:
print(f" Line: {loc.region.start_line}")
# 保存修改后的 SARIF
save_to_file(sarif, "modified.sarif")
用于聚合、报告和 CI/CD 集成:
from sarif import loader
# 加载单个文件
sarif_data = loader.load_sarif_file("results.sarif")
# 或加载多个文件
sarif_set = loader.load_sarif_files(["tool1.sarif", "tool2.sarif"])
# 获取摘要报告
report = sarif_data.get_report()
# 按严重性获取直方图
errors = report.get_issue_type_histogram_for_severity("error")
warnings = report.get_issue_type_histogram_for_severity("warning")
# 过滤结果
high_severity = [r for r in sarif_data.get_results()
if r.get("level") == "error"]
sarif-tools CLI 命令:
# 发现结果摘要
sarif summary results.sarif
# 列出所有结果及其详细信息
sarif ls results.sarif
# 按严重性获取结果
sarif ls --level error results.sarif
# 比较两个 SARIF 文件(查找新的/已修复的问题)
sarif diff baseline.sarif current.sarif
# 转换为其他格式
sarif csv results.sarif > results.csv
sarif html results.sarif > report.html
当合并来自多个工具的结果时:
import json
from pathlib import Path
def aggregate_sarif_files(sarif_paths: list[str]) -> dict:
    """Merge several SARIF files into a single SARIF 2.1.0 log.

    The ``runs`` arrays of every input file are concatenated, in input
    order, under one top-level document. Files lacking a ``runs`` key
    contribute nothing.
    """
    merged_runs: list = []
    for sarif_path in sarif_paths:
        with open(sarif_path) as handle:
            document = json.load(handle)
        merged_runs.extend(document.get("runs", []))
    return {
        "version": "2.1.0",
        "$schema": "https://json.schemastore.org/sarif-2.1.0.json",
        "runs": merged_runs,
    }
def deduplicate_results(sarif: dict) -> dict:
    """Remove duplicate findings, keeping each result's first occurrence.

    A result's identity is, in order of preference: its
    ``partialFingerprints``, its ``fingerprints``, or a fallback key of
    (ruleId, artifact URI, start line). Identity is tracked across *all*
    runs, so a finding reported by two runs survives only once. The
    document is modified in place and also returned.
    """

    def identity(result: dict):
        # Prefer tool-provided fingerprints; they survive path changes.
        for field in ("partialFingerprints", "fingerprints"):
            if result.get(field):
                return tuple(sorted(result[field].items()))
        # Fallback: synthesize a key from the rule and physical location.
        first_loc = result.get("locations", [{}])[0]
        physical = first_loc.get("physicalLocation", {})
        return (
            result.get("ruleId"),
            physical.get("artifactLocation", {}).get("uri"),
            physical.get("region", {}).get("startLine"),
        )

    seen = set()
    for run in sarif["runs"]:
        kept = []
        for result in run.get("results", []):
            key = identity(result)
            if key not in seen:
                seen.add(key)
                kept.append(result)
        run["results"] = kept
    return sarif
import json
from dataclasses import dataclass
from typing import Optional
@dataclass
class Finding:
    """One static-analysis finding flattened out of a SARIF result."""

    rule_id: str                # result.ruleId ("unknown" when absent)
    level: str                  # severity: error / warning / note / none
    message: str                # result.message.text
    file_path: Optional[str]    # physicalLocation.artifactLocation.uri, if present
    start_line: Optional[int]   # physicalLocation.region.startLine, if present
    end_line: Optional[int]     # physicalLocation.region.endLine, if present
    fingerprint: Optional[str]  # first partialFingerprints value, if any
def extract_findings(sarif_path: str) -> list[Finding]:
    """Extract structured findings from a SARIF file.

    Missing optional fields fall back to safe defaults; a result with no
    usable location yields a Finding whose file_path/start_line/end_line
    are None.
    """
    with open(sarif_path) as f:
        sarif = json.load(f)

    findings: list[Finding] = []
    for run in sarif.get("runs", []):
        for result in run.get("results", []):
            # "locations" may be absent OR an empty list; `or [{}]` guards
            # both so an empty list does not raise IndexError.
            locations = result.get("locations") or [{}]
            phys = locations[0].get("physicalLocation", {})
            region = phys.get("region", {})
            findings.append(Finding(
                rule_id=result.get("ruleId", "unknown"),
                level=result.get("level", "warning"),
                message=result.get("message", {}).get("text", ""),
                file_path=phys.get("artifactLocation", {}).get("uri"),
                start_line=region.get("startLine"),
                end_line=region.get("endLine"),
                # First partialFingerprints value, if the tool emitted any.
                fingerprint=next(iter(result.get("partialFingerprints", {}).values()), None),
            ))
    return findings
# Filter and prioritize
def prioritize_findings(findings: list[Finding]) -> list[Finding]:
    """Return findings ordered most-severe first.

    Order is error, warning, note, none; unknown levels sort last
    (rank 99). The input list is not modified.
    """
    rank = {"error": 0, "warning": 1, "note": 2, "none": 3}

    def severity_rank(finding: Finding) -> int:
        return rank.get(finding.level, 99)

    return sorted(findings, key=severity_rank)
不同的工具以不同的方式报告路径(绝对路径、相对路径、URI 编码):
from urllib.parse import unquote
from pathlib import Path
def normalize_path(uri: str, base_path: str = "") -> str:
    """Normalize a SARIF artifact URI to a consistent local path.

    Handles the three common tool variations: a leading ``file://``
    scheme, percent-encoded characters, and relative paths (which are
    resolved against *base_path* when one is supplied).
    """
    # Drop the scheme, if any, then undo URL percent-encoding.
    decoded = unquote(uri.removeprefix("file://"))
    candidate = Path(decoded)
    # Anchor relative paths to the provided base directory.
    if base_path and not candidate.is_absolute():
        candidate = Path(base_path) / decoded
    # str(Path) normalizes separators for the host OS.
    return str(candidate)
如果出现以下情况,指纹可能不匹配:
解决方案: 使用多种指纹策略:
def compute_stable_fingerprint(result: dict, file_content: Optional[str] = None) -> str:
    """Compute an environment-independent fingerprint for a SARIF result.

    Combines the rule ID, the first 100 characters of the message, and —
    when *file_content* is supplied and the result carries a startLine —
    the stripped text of the flagged source line. Returns the first 16
    hex characters of a SHA-256 digest.
    """
    import hashlib

    components = [
        result.get("ruleId", ""),
        # Cap the message so trailing wording changes don't alter the hash.
        result.get("message", {}).get("text", "")[:100],
    ]
    # Mix in the flagged source line, whitespace-normalized, if available.
    if file_content and result.get("locations"):
        region = result["locations"][0].get("physicalLocation", {}).get("region", {})
        start_line = region.get("startLine")
        if start_line:
            lines = file_content.split("\n")
            line_idx = start_line - 1
            if 0 <= line_idx < len(lines):
                components.append(lines[line_idx].strip())
    return hashlib.sha256("".join(components).encode()).hexdigest()[:16]
SARIF 允许许多可选字段。始终使用防御性访问:
def safe_get_location(result: dict) -> tuple[str, int]:
    """Best-effort extraction of (file, line) from a SARIF result.

    Returns ("unknown", 0) when any optional location field is missing
    or malformed.
    """
    try:
        physical = result.get("locations", [{}])[0].get("physicalLocation", {})
        return (
            physical.get("artifactLocation", {}).get("uri", "unknown"),
            physical.get("region", {}).get("startLine", 0),
        )
    except (IndexError, KeyError, TypeError):
        return "unknown", 0
对于非常大的 SARIF 文件(100MB+):
import ijson # pip install ijson
def stream_results(sarif_path: str):
    """Yield SARIF results one at a time without loading the whole file.

    Uses ijson to walk every ``runs[*].results[*]`` entry incrementally,
    keeping memory flat even for very large (100MB+) SARIF logs.
    """
    with open(sarif_path, "rb") as stream:
        yield from ijson.items(stream, "runs.item.results.item")
在处理前进行验证以捕获格式错误的文件:
# 使用 ajv-cli
npm install -g ajv-cli
ajv validate -s sarif-schema-2.1.0.json -d results.sarif
# 使用 Python jsonschema
pip install jsonschema
from jsonschema import validate, ValidationError
import json
def validate_sarif(sarif_path: str, schema_path: str) -> bool:
    """Validate a SARIF file against a JSON schema.

    Prints the validation error and returns False on failure; returns
    True when the document conforms to the schema.
    """
    with open(sarif_path) as sarif_file:
        document = json.load(sarif_file)
    with open(schema_path) as schema_file:
        schema = json.load(schema_file)
    try:
        validate(document, schema)
    except ValidationError as e:
        print(f"Validation error: {e.message}")
        return False
    return True
- name: Upload SARIF
uses: github/codeql-action/upload-sarif@v3
with:
sarif_file: results.sarif
- name: Check for high severity
run: |
HIGH_COUNT=$(jq '[.runs[].results[] | select(.level == "error")] | length' results.sarif)
if [ "$HIGH_COUNT" -gt 0 ]; then
echo "Found $HIGH_COUNT high severity issues"
exit 1
fi
from sarif import loader
def check_for_regressions(baseline: str, current: str) -> int:
    """Return the number of current findings whose fingerprint is absent
    from the baseline scan.

    NOTE(review): relies on a ``get_fingerprint`` helper assumed to be
    defined alongside this function — confirm before use.
    """
    known = {get_fingerprint(r)
             for r in loader.load_sarif_file(baseline).get_results()}
    return sum(
        1
        for result in loader.load_sarif_file(current).get_results()
        if get_fingerprint(result) not in known
    )
有关即用型查询模板,请参阅 {baseDir}/resources/jq-queries.md:
有关 Python 实用程序,请参阅 {baseDir}/resources/sarif_helpers.py:
normalize_path() - 处理特定于工具的路径格式
compute_fingerprint() - 忽略路径的稳定指纹识别
deduplicate_results() - 移除跨运行的重复项

每周安装
1.2K
仓库
GitHub 星标
3.9K
首次出现
2026 年 1 月 19 日
安全审计
安装于
claude-code1.0K
codex977
opencode923
gemini-cli902
cursor877
github-copilot843
You are a SARIF parsing expert. Your role is to help users effectively read, analyze, and process SARIF files from static analysis tools.
Use this skill when:
Do NOT use this skill for:
SARIF 2.1.0 is the current OASIS standard. Every SARIF file has this hierarchical structure:
sarifLog
├── version: "2.1.0"
├── $schema: (optional, enables IDE validation)
└── runs[] (array of analysis runs)
├── tool
│ ├── driver
│ │ ├── name (required)
│ │ ├── version
│ │ └── rules[] (rule definitions)
│ └── extensions[] (plugins)
├── results[] (findings)
│ ├── ruleId
│ ├── level (error/warning/note)
│ ├── message.text
│ ├── locations[]
│ │ └── physicalLocation
│ │ ├── artifactLocation.uri
│ │ └── region (startLine, startColumn, etc.)
│ ├── fingerprints{}
│ └── partialFingerprints{}
└── artifacts[] (scanned files metadata)
Without stable fingerprints, you can't track findings across runs:
Tools report different paths (/path/to/project/ vs /github/workspace/), so path-based matching fails. Fingerprints hash the content (code snippet, rule ID, relative location) to create stable identifiers regardless of environment.
| Use Case | Tool | Installation |
|---|---|---|
| Quick CLI queries | jq | brew install jq / apt install jq |
| Python scripting (simple) | pysarif | pip install pysarif |
| Python scripting (advanced) | sarif-tools | pip install sarif-tools |
| .NET applications | SARIF SDK | NuGet package |
| JavaScript/Node.js | sarif-js | npm package |
| Go applications | garif |
For rapid exploration and one-off queries:
# Pretty print the file
jq '.' results.sarif
# Count total findings
jq '[.runs[].results[]] | length' results.sarif
# List all rule IDs triggered
jq '[.runs[].results[].ruleId] | unique' results.sarif
# Extract errors only
jq '.runs[].results[] | select(.level == "error")' results.sarif
# Get findings with file locations
jq '.runs[].results[] | {
rule: .ruleId,
message: .message.text,
file: .locations[0].physicalLocation.artifactLocation.uri,
line: .locations[0].physicalLocation.region.startLine
}' results.sarif
# Filter by severity and get count per rule
jq '[.runs[].results[] | select(.level == "error")] | group_by(.ruleId) | map({rule: .[0].ruleId, count: length})' results.sarif
# Extract findings for a specific file
jq --arg file "src/auth.py" '.runs[].results[] | select(.locations[].physicalLocation.artifactLocation.uri | contains($file))' results.sarif
For programmatic access with full object model:
from pysarif import load_from_file, save_to_file
# Load SARIF file
sarif = load_from_file("results.sarif")
# Iterate through runs and results
for run in sarif.runs:
tool_name = run.tool.driver.name
print(f"Tool: {tool_name}")
for result in run.results:
print(f" [{result.level}] {result.rule_id}: {result.message.text}")
if result.locations:
loc = result.locations[0].physical_location
if loc and loc.artifact_location:
print(f" File: {loc.artifact_location.uri}")
if loc.region:
print(f" Line: {loc.region.start_line}")
# Save modified SARIF
save_to_file(sarif, "modified.sarif")
For aggregation, reporting, and CI/CD integration:
from sarif import loader
# Load single file
sarif_data = loader.load_sarif_file("results.sarif")
# Or load multiple files
sarif_set = loader.load_sarif_files(["tool1.sarif", "tool2.sarif"])
# Get summary report
report = sarif_data.get_report()
# Get histogram by severity
errors = report.get_issue_type_histogram_for_severity("error")
warnings = report.get_issue_type_histogram_for_severity("warning")
# Filter results
high_severity = [r for r in sarif_data.get_results()
if r.get("level") == "error"]
sarif-tools CLI commands:
# Summary of findings
sarif summary results.sarif
# List all results with details
sarif ls results.sarif
# Get results by severity
sarif ls --level error results.sarif
# Diff two SARIF files (find new/fixed issues)
sarif diff baseline.sarif current.sarif
# Convert to other formats
sarif csv results.sarif > results.csv
sarif html results.sarif > report.html
When combining results from multiple tools:
import json
from pathlib import Path
def aggregate_sarif_files(sarif_paths: list[str]) -> dict:
    """Merge several SARIF files into a single SARIF 2.1.0 log.

    The ``runs`` arrays of every input file are concatenated, in input
    order, under one top-level document. Files lacking a ``runs`` key
    contribute nothing.
    """
    merged_runs: list = []
    for sarif_path in sarif_paths:
        with open(sarif_path) as handle:
            document = json.load(handle)
        merged_runs.extend(document.get("runs", []))
    return {
        "version": "2.1.0",
        "$schema": "https://json.schemastore.org/sarif-2.1.0.json",
        "runs": merged_runs,
    }
def deduplicate_results(sarif: dict) -> dict:
    """Remove duplicate findings, keeping each result's first occurrence.

    A result's identity is, in order of preference: its
    ``partialFingerprints``, its ``fingerprints``, or a fallback key of
    (ruleId, artifact URI, start line). Identity is tracked across *all*
    runs, so a finding reported by two runs survives only once. The
    document is modified in place and also returned.
    """

    def identity(result: dict):
        # Prefer tool-provided fingerprints; they survive path changes.
        for field in ("partialFingerprints", "fingerprints"):
            if result.get(field):
                return tuple(sorted(result[field].items()))
        # Fallback: synthesize a key from the rule and physical location.
        first_loc = result.get("locations", [{}])[0]
        physical = first_loc.get("physicalLocation", {})
        return (
            result.get("ruleId"),
            physical.get("artifactLocation", {}).get("uri"),
            physical.get("region", {}).get("startLine"),
        )

    seen = set()
    for run in sarif["runs"]:
        kept = []
        for result in run.get("results", []):
            key = identity(result)
            if key not in seen:
                seen.add(key)
                kept.append(result)
        run["results"] = kept
    return sarif
import json
from dataclasses import dataclass
from typing import Optional
@dataclass
class Finding:
    """One static-analysis finding flattened out of a SARIF result."""

    rule_id: str                # result.ruleId ("unknown" when absent)
    level: str                  # severity: error / warning / note / none
    message: str                # result.message.text
    file_path: Optional[str]    # physicalLocation.artifactLocation.uri, if present
    start_line: Optional[int]   # physicalLocation.region.startLine, if present
    end_line: Optional[int]     # physicalLocation.region.endLine, if present
    fingerprint: Optional[str]  # first partialFingerprints value, if any
def extract_findings(sarif_path: str) -> list[Finding]:
    """Extract structured findings from a SARIF file.

    Missing optional fields fall back to safe defaults; a result with no
    usable location yields a Finding whose file_path/start_line/end_line
    are None.
    """
    with open(sarif_path) as f:
        sarif = json.load(f)

    findings: list[Finding] = []
    for run in sarif.get("runs", []):
        for result in run.get("results", []):
            # "locations" may be absent OR an empty list; `or [{}]` guards
            # both so an empty list does not raise IndexError.
            locations = result.get("locations") or [{}]
            phys = locations[0].get("physicalLocation", {})
            region = phys.get("region", {})
            findings.append(Finding(
                rule_id=result.get("ruleId", "unknown"),
                level=result.get("level", "warning"),
                message=result.get("message", {}).get("text", ""),
                file_path=phys.get("artifactLocation", {}).get("uri"),
                start_line=region.get("startLine"),
                end_line=region.get("endLine"),
                # First partialFingerprints value, if the tool emitted any.
                fingerprint=next(iter(result.get("partialFingerprints", {}).values()), None),
            ))
    return findings
# Filter and prioritize
def prioritize_findings(findings: list[Finding]) -> list[Finding]:
    """Return findings ordered most-severe first.

    Order is error, warning, note, none; unknown levels sort last
    (rank 99). The input list is not modified.
    """
    rank = {"error": 0, "warning": 1, "note": 2, "none": 3}

    def severity_rank(finding: Finding) -> int:
        return rank.get(finding.level, 99)

    return sorted(findings, key=severity_rank)
Different tools report paths differently (absolute, relative, URI-encoded):
from urllib.parse import unquote
from pathlib import Path
def normalize_path(uri: str, base_path: str = "") -> str:
    """Normalize a SARIF artifact URI to a consistent local path.

    Handles the three common tool variations: a leading ``file://``
    scheme, percent-encoded characters, and relative paths (which are
    resolved against *base_path* when one is supplied).
    """
    # Drop the scheme, if any, then undo URL percent-encoding.
    decoded = unquote(uri.removeprefix("file://"))
    candidate = Path(decoded)
    # Anchor relative paths to the provided base directory.
    if base_path and not candidate.is_absolute():
        candidate = Path(base_path) / decoded
    # str(Path) normalizes separators for the host OS.
    return str(candidate)
Fingerprints may not match if:
Solution: Use multiple fingerprint strategies:
def compute_stable_fingerprint(result: dict, file_content: Optional[str] = None) -> str:
    """Compute an environment-independent fingerprint for a SARIF result.

    Combines the rule ID, the first 100 characters of the message, and —
    when *file_content* is supplied and the result carries a startLine —
    the stripped text of the flagged source line. Returns the first 16
    hex characters of a SHA-256 digest.
    """
    import hashlib

    components = [
        result.get("ruleId", ""),
        # Cap the message so trailing wording changes don't alter the hash.
        result.get("message", {}).get("text", "")[:100],
    ]
    # Mix in the flagged source line, whitespace-normalized, if available.
    if file_content and result.get("locations"):
        region = result["locations"][0].get("physicalLocation", {}).get("region", {})
        start_line = region.get("startLine")
        if start_line:
            lines = file_content.split("\n")
            line_idx = start_line - 1
            if 0 <= line_idx < len(lines):
                components.append(lines[line_idx].strip())
    return hashlib.sha256("".join(components).encode()).hexdigest()[:16]
SARIF allows many optional fields. Always use defensive access:
def safe_get_location(result: dict) -> tuple[str, int]:
    """Best-effort extraction of (file, line) from a SARIF result.

    Returns ("unknown", 0) when any optional location field is missing
    or malformed.
    """
    try:
        physical = result.get("locations", [{}])[0].get("physicalLocation", {})
        return (
            physical.get("artifactLocation", {}).get("uri", "unknown"),
            physical.get("region", {}).get("startLine", 0),
        )
    except (IndexError, KeyError, TypeError):
        return "unknown", 0
For very large SARIF files (100MB+):
import ijson # pip install ijson
def stream_results(sarif_path: str):
    """Yield SARIF results one at a time without loading the whole file.

    Uses ijson to walk every ``runs[*].results[*]`` entry incrementally,
    keeping memory flat even for very large (100MB+) SARIF logs.
    """
    with open(sarif_path, "rb") as stream:
        yield from ijson.items(stream, "runs.item.results.item")
Validate before processing to catch malformed files:
# Using ajv-cli
npm install -g ajv-cli
ajv validate -s sarif-schema-2.1.0.json -d results.sarif
# Using Python jsonschema
pip install jsonschema
from jsonschema import validate, ValidationError
import json
def validate_sarif(sarif_path: str, schema_path: str) -> bool:
    """Validate a SARIF file against a JSON schema.

    Prints the validation error and returns False on failure; returns
    True when the document conforms to the schema.
    """
    with open(sarif_path) as sarif_file:
        document = json.load(sarif_file)
    with open(schema_path) as schema_file:
        schema = json.load(schema_file)
    try:
        validate(document, schema)
    except ValidationError as e:
        print(f"Validation error: {e.message}")
        return False
    return True
- name: Upload SARIF
uses: github/codeql-action/upload-sarif@v3
with:
sarif_file: results.sarif
- name: Check for high severity
run: |
HIGH_COUNT=$(jq '[.runs[].results[] | select(.level == "error")] | length' results.sarif)
if [ "$HIGH_COUNT" -gt 0 ]; then
echo "Found $HIGH_COUNT high severity issues"
exit 1
fi
from sarif import loader
def check_for_regressions(baseline: str, current: str) -> int:
    """Return the number of current findings whose fingerprint is absent
    from the baseline scan.

    NOTE(review): relies on a ``get_fingerprint`` helper assumed to be
    defined alongside this function — confirm before use.
    """
    known = {get_fingerprint(r)
             for r in loader.load_sarif_file(baseline).get_results()}
    return sum(
        1
        for result in loader.load_sarif_file(current).get_results()
        if get_fingerprint(result) not in known
    )
For ready-to-use query templates, see {baseDir}/resources/jq-queries.md:
For Python utilities, see {baseDir}/resources/sarif_helpers.py:
normalize_path() - Handle tool-specific path formats
compute_fingerprint() - Stable fingerprinting ignoring paths
deduplicate_results() - Remove duplicates across runs

Weekly Installs
1.2K
Repository
GitHub Stars
3.9K
First Seen
Jan 19, 2026
Security Audits
Gen Agent Trust Hub: Pass · Socket: Pass · Snyk: Pass
Installed on
claude-code1.0K
codex977
opencode923
gemini-cli902
cursor877
github-copilot843
Azure 升级评估与自动化工具 - 轻松迁移 Functions 计划、托管层级和 SKU
59,200 周安装
| Go applications | garif | go get github.com/chavacava/garif |
| Validation | SARIF Validator | sarifweb.azurewebsites.net |