重要前提
安装 AI Skills 的关键前提是:必须科学上网,且开启 TUN 模式。这一点至关重要,直接决定安装能否顺利完成,在此郑重提醒三遍:科学上网,科学上网,科学上网。
查看完整安装教程 →
data-quality-auditor by dkyazzentwatwa/chatgpt-skills
npx skills add https://github.com/dkyazzentwatwa/chatgpt-skills --skill data-quality-auditor
针对 CSV/Excel 数据集进行全面的数据质量评估。
from data_quality_auditor import DataQualityAuditor
auditor = DataQualityAuditor()
auditor.load_csv("customers.csv")
# 运行完整审计
report = auditor.audit()
print(f"Quality Score: {report['quality_score']}/100")
# 检查特定问题
missing = auditor.check_missing()
duplicates = auditor.check_duplicates()
# 完整审计
python data_quality_auditor.py --input data.csv
# 生成 HTML 报告
python data_quality_auditor.py --input data.csv --report report.html
# 检查特定方面
python data_quality_auditor.py --input data.csv --missing
python data_quality_auditor.py --input data.csv --duplicates
python data_quality_auditor.py --input data.csv --types
# JSON 输出
python data_quality_auditor.py --input data.csv --json
# 根据规则验证
python data_quality_auditor.py --input data.csv --rules rules.json
广告位招租
在这里展示您的产品或服务
触达数万 AI 开发者,精准高效
class DataQualityAuditor:
def __init__(self)
# 数据加载
def load_csv(self, filepath: str, **kwargs) -> 'DataQualityAuditor'
def load_dataframe(self, df: pd.DataFrame) -> 'DataQualityAuditor'
# 完整审计
def audit(self) -> dict
def quality_score(self) -> float
# 单项检查
def check_missing(self) -> dict
def check_duplicates(self, subset: list = None) -> dict
def check_types(self) -> dict
def check_uniqueness(self) -> dict
def check_patterns(self, column: str, pattern: str) -> dict
# 验证
def validate_column(self, column: str, rules: dict) -> dict
def validate_dataset(self, rules: dict) -> dict
# 报告
def generate_report(self, output: str, format: str = "html") -> str
def summary(self) -> str
missing = auditor.check_missing()
# 返回:
{
"total_cells": 10000,
"missing_cells": 150,
"missing_percent": 1.5,
"by_column": {
"email": {"count": 50, "percent": 5.0},
"phone": {"count": 100, "percent": 10.0}
},
"rows_with_missing": 120
}
dups = auditor.check_duplicates()
# 返回:
{
"total_rows": 1000,
"duplicate_rows": 25,
"duplicate_percent": 2.5,
"duplicate_groups": [...],
"by_columns": {
"email": {"duplicates": 15},
"phone": {"duplicates": 20}
}
}
types = auditor.check_types()
# 返回:
{
"columns": {
"age": {
"detected_type": "int64",
"unique_values": 75,
"sample_values": [25, 30, 45],
"issues": []
},
"date": {
"detected_type": "object",
"unique_values": 365,
"sample_values": ["2023-01-01", "invalid"],
"issues": ["Mixed date formats detected"]
}
}
}
定义自定义验证规则:
{
"columns": {
"email": {
"required": true,
"unique": true,
"pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$"
},
"age": {
"type": "integer",
"min": 0,
"max": 120
},
"status": {
"allowed_values": ["active", "inactive", "pending"]
},
"created_at": {
"type": "date",
"format": "%Y-%m-%d"
}
}
}
results = auditor.validate_dataset(rules)
质量评分(0-100)根据以下因素计算:
完整性 (30%): 缺失值比例
唯一性 (25%): 重复行比例
有效性 (25%): 类型和约束合规性
一致性 (20%): 格式和模式遵循度
score = auditor.quality_score()
{
"file": "data.csv",
"rows": 1000,
"columns": 15,
"quality_score": 85.5,
"completeness": {
"score": 92.0,
"missing_cells": 800,
"details": {...}
},
"uniqueness": {
"score": 97.5,
"duplicate_rows": 25,
"details": {...}
},
"validity": {
"score": 78.0,
"type_issues": [...],
"details": {...}
},
"consistency": {
"score": 80.0,
"pattern_issues": [...],
"details": {...}
},
"recommendations": [
"列 'phone' 有 10% 的缺失值",
"检测到 25 行重复数据",
"列 'date' 的格式不一致"
]
}
auditor = DataQualityAuditor()
auditor.load_csv("import_data.csv")
report = auditor.audit()
if report['quality_score'] < 80:
print("数据质量低于阈值!")
for rec in report['recommendations']:
print(f" - {rec}")
exit(1)
auditor = DataQualityAuditor()
auditor.load_dataframe(transformed_df)
# 检查关键列
email_check = auditor.validate_column("email", {
"required": True,
"unique": True,
"pattern": r"^[\w.+-]+@[\w-]+\.[\w.-]+$"
})
if email_check['issues']:
raise ValueError(f"电子邮件验证失败: {email_check['issues']}")
auditor = DataQualityAuditor()
auditor.load_csv("dataset.csv")
# 生成综合报告
auditor.generate_report("quality_report.html", format="html")
# 或获取摘要文本
print(auditor.summary())
每周安装次数
67
代码仓库
GitHub 星标数
39
首次出现
2026年1月24日
安全审计
安装于
opencode: 55
gemini-cli: 50
codex: 48
cursor: 47
claude-code: 44
github-copilot: 43
Comprehensive data quality assessment for CSV/Excel datasets.
from data_quality_auditor import DataQualityAuditor
auditor = DataQualityAuditor()
auditor.load_csv("customers.csv")
# Run full audit
report = auditor.audit()
print(f"Quality Score: {report['quality_score']}/100")
# Check specific issues
missing = auditor.check_missing()
duplicates = auditor.check_duplicates()
# Full audit
python data_quality_auditor.py --input data.csv
# Generate HTML report
python data_quality_auditor.py --input data.csv --report report.html
# Check specific aspects
python data_quality_auditor.py --input data.csv --missing
python data_quality_auditor.py --input data.csv --duplicates
python data_quality_auditor.py --input data.csv --types
# JSON output
python data_quality_auditor.py --input data.csv --json
# Validate against rules
python data_quality_auditor.py --input data.csv --rules rules.json
class DataQualityAuditor:
def __init__(self)
# Data loading
def load_csv(self, filepath: str, **kwargs) -> 'DataQualityAuditor'
def load_dataframe(self, df: pd.DataFrame) -> 'DataQualityAuditor'
# Full audit
def audit(self) -> dict
def quality_score(self) -> float
# Individual checks
def check_missing(self) -> dict
def check_duplicates(self, subset: list = None) -> dict
def check_types(self) -> dict
def check_uniqueness(self) -> dict
def check_patterns(self, column: str, pattern: str) -> dict
# Validation
def validate_column(self, column: str, rules: dict) -> dict
def validate_dataset(self, rules: dict) -> dict
# Reports
def generate_report(self, output: str, format: str = "html") -> str
def summary(self) -> str
missing = auditor.check_missing()
# Returns:
{
"total_cells": 10000,
"missing_cells": 150,
"missing_percent": 1.5,
"by_column": {
"email": {"count": 50, "percent": 5.0},
"phone": {"count": 100, "percent": 10.0}
},
"rows_with_missing": 120
}
dups = auditor.check_duplicates()
# Returns:
{
"total_rows": 1000,
"duplicate_rows": 25,
"duplicate_percent": 2.5,
"duplicate_groups": [...],
"by_columns": {
"email": {"duplicates": 15},
"phone": {"duplicates": 20}
}
}
types = auditor.check_types()
# Returns:
{
"columns": {
"age": {
"detected_type": "int64",
"unique_values": 75,
"sample_values": [25, 30, 45],
"issues": []
},
"date": {
"detected_type": "object",
"unique_values": 365,
"sample_values": ["2023-01-01", "invalid"],
"issues": ["Mixed date formats detected"]
}
}
}
Define custom validation rules:
{
"columns": {
"email": {
"required": true,
"unique": true,
"pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$"
},
"age": {
"type": "integer",
"min": 0,
"max": 120
},
"status": {
"allowed_values": ["active", "inactive", "pending"]
},
"created_at": {
"type": "date",
"format": "%Y-%m-%d"
}
}
}
results = auditor.validate_dataset(rules)
The quality score (0-100) is calculated from:
Completeness (30%): Missing value ratio
Uniqueness (25%): Duplicate row ratio
Validity (25%): Type and constraint compliance
Consistency (20%): Format and pattern adherence
score = auditor.quality_score()
{
"file": "data.csv",
"rows": 1000,
"columns": 15,
"quality_score": 85.5,
"completeness": {
"score": 92.0,
"missing_cells": 800,
"details": {...}
},
"uniqueness": {
"score": 97.5,
"duplicate_rows": 25,
"details": {...}
},
"validity": {
"score": 78.0,
"type_issues": [...],
"details": {...}
},
"consistency": {
"score": 80.0,
"pattern_issues": [...],
"details": {...}
},
"recommendations": [
"Column 'phone' has 10% missing values",
"25 duplicate rows detected",
"Column 'date' has inconsistent formats"
]
}
auditor = DataQualityAuditor()
auditor.load_csv("import_data.csv")
report = auditor.audit()
if report['quality_score'] < 80:
print("Data quality below threshold!")
for rec in report['recommendations']:
print(f" - {rec}")
exit(1)
auditor = DataQualityAuditor()
auditor.load_dataframe(transformed_df)
# Check critical columns
email_check = auditor.validate_column("email", {
"required": True,
"unique": True,
"pattern": r"^[\w.+-]+@[\w-]+\.[\w.-]+$"
})
if email_check['issues']:
raise ValueError(f"Email validation failed: {email_check['issues']}")
auditor = DataQualityAuditor()
auditor.load_csv("dataset.csv")
# Generate comprehensive report
auditor.generate_report("quality_report.html", format="html")
# Or get summary text
print(auditor.summary())
Weekly Installs
67
Repository
GitHub Stars
39
First Seen
Jan 24, 2026
Security Audits
Gen Agent Trust Hub: Pass · Socket: Pass · Snyk: Pass
Installed on
opencode: 55
gemini-cli: 50
codex: 48
cursor: 47
claude-code: 44
github-copilot: 43
前端代码审计工具 - 自动化检测可访问性、性能、响应式设计、主题化与反模式
49,600 周安装