npx skills add https://github.com/borghei/claude-skills --skill data-scientist

专家级数据科学,创造业务影响力。
PROBLEM DEFINITION → DATA → FEATURES → MODEL → EVALUATION → DEPLOYMENT
1. 问题定义
- 业务目标
- 成功指标
- 约束条件
2. 数据收集
- 数据源
- 数据质量
- 样本大小
3. 特征工程
- 特征创建
- 特征选择
- 转换
4. 模型开发
- 算法选择
- 训练
- 调优
5. 评估
- 指标
- 验证
- 业务影响
6. 部署
- 生产环境流水线
- 监控
- 迭代
| 算法 | 使用场景 | 优点 | 缺点 |
|---|---|---|---|
| 线性回归 | 连续值预测 | 可解释性强,速度快 | 仅适用于线性关系 |
| 逻辑回归 | 二分类 | 可解释性强,概率输出 | 线性边界 |
| 随机森林 | 分类/回归 | 能处理非线性关系 | 可解释性较差 |
| XGBoost | 分类/回归 | 准确率高 | 有过拟合风险 |
| 神经网络 | 复杂模式 | 灵活性强 | 需要大量数据 |
def select_model(problem_type, data_size, interpretability_need, accuracy_need):
    """Recommend an ML algorithm from coarse project requirements.

    problem_type: 'classification' or 'regression'
    data_size: 'small' (<10K), 'medium' (10K-1M), 'large' (>1M)
    interpretability_need: 'high', 'medium', 'low'
    accuracy_need: 'high', 'medium', 'low'
    """
    # Interpretability dominates every other consideration.
    if interpretability_need == 'high':
        return ('Logistic Regression' if problem_type == 'classification'
                else 'Linear Regression')
    # Small datasets overfit flexible models; a forest is a safe default.
    if data_size == 'small':
        return 'Random Forest'
    # Chase accuracy only when the data volume can support it.
    if accuracy_need == 'high':
        return 'Neural Network' if data_size == 'large' else 'XGBoost'
    return 'Random Forest'
# Numerical features
def engineer_numerical(df, col):
    """Derive standard non-linear transforms of one numeric column."""
    src = df[col]
    derived = {}
    derived[f'{col}_log'] = np.log1p(src)  # compresses right-skewed values
    derived[f'{col}_sqrt'] = np.sqrt(src)
    derived[f'{col}_squared'] = src ** 2
    # Equal-width bins encoded as integer labels 0..4.
    derived[f'{col}_binned'] = pd.cut(src, bins=5, labels=False)
    return pd.DataFrame(derived)
# Categorical features
def engineer_categorical(df, col, target_col='target'):
    """Encode one categorical column three complementary ways.

    df: source DataFrame; must contain `col` and `target_col`.
    col: name of the categorical column to encode.
    target_col: numeric label column used for target encoding
        (default 'target' preserves the original hard-coded behavior).

    Returns (one_hot_df, target_encoded_series, frequency_encoded_series).

    NOTE(review): target encoding here uses the full dataset, which leaks
    label information into the feature — fit it on training folds only.
    """
    # One-hot: one indicator column per category level, prefixed by `col`.
    dummies = pd.get_dummies(df[col], prefix=col)
    # Target encoding: each level mapped to its mean label value.
    target_mean = df.groupby(col)[target_col].mean()
    target_encoded = df[col].map(target_mean)
    # Frequency encoding: each level mapped to its relative frequency.
    freq = df[col].value_counts(normalize=True)
    freq_encoded = df[col].map(freq)
    return dummies, target_encoded, freq_encoded
# Time features
def engineer_time(df, col):
    """Expand one datetime-like column into calendar/cyclical features.

    Fix: the original overwrote df[col] with the parsed datetimes,
    mutating the caller's DataFrame; the parsed series now stays local.

    Returns a DataFrame with hour/day/dayofweek/month, a weekend flag,
    and sin/cos encodings that make hour-of-day cyclical (23 is near 0).
    """
    ts = pd.to_datetime(df[col])  # local copy; caller's df is untouched
    hour = ts.dt.hour
    features = {
        f'{col}_hour': hour,
        f'{col}_day': ts.dt.day,
        f'{col}_dayofweek': ts.dt.dayofweek,
        f'{col}_month': ts.dt.month,
        # dayofweek: Monday=0 .. Sunday=6, so 5/6 form the weekend.
        f'{col}_is_weekend': ts.dt.dayofweek.isin([5, 6]).astype(int),
        f'{col}_hour_sin': np.sin(2 * np.pi * hour / 24),
        f'{col}_hour_cos': np.cos(2 * np.pi * hour / 24)
    }
    return pd.DataFrame(features)
from sklearn.feature_selection import mutual_info_classif, RFE
from sklearn.ensemble import RandomForestClassifier
def select_features(X, y, method='importance', n_features=20):
    """Pick the top `n_features` columns of X for predicting y.

    method:
        'importance'  - rank by random-forest feature importances
        'mutual_info' - rank by mutual information with y
        'rfe'         - recursive feature elimination with a forest

    Returns a list of selected column names.
    Raises ValueError for an unknown method (the original silently
    returned None, which crashed callers later).
    """
    if method == 'importance':
        model = RandomForestClassifier(n_estimators=100)
        model.fit(X, y)
        importance = pd.Series(model.feature_importances_, index=X.columns)
        return importance.nlargest(n_features).index.tolist()
    if method == 'mutual_info':
        mi_scores = mutual_info_classif(X, y)
        mi_series = pd.Series(mi_scores, index=X.columns)
        return mi_series.nlargest(n_features).index.tolist()
    if method == 'rfe':
        model = RandomForestClassifier(n_estimators=100)
        rfe = RFE(model, n_features_to_select=n_features)
        rfe.fit(X, y)
        return X.columns[rfe.support_].tolist()
    raise ValueError(f"unknown method: {method!r}")
from sklearn.metrics import (
accuracy_score, precision_score, recall_score,
f1_score, roc_auc_score, confusion_matrix,
classification_report
)
def evaluate_classification(y_true, y_pred, y_proba=None):
    """Compute standard binary-classification metrics.

    Prints the sklearn classification report and confusion matrix as a
    side effect, then returns a dict of scalar metrics. AUC-ROC is
    included only when probability scores are supplied.
    """
    metrics = {}
    metrics['accuracy'] = accuracy_score(y_true, y_pred)
    metrics['precision'] = precision_score(y_true, y_pred)
    metrics['recall'] = recall_score(y_true, y_pred)
    metrics['f1'] = f1_score(y_true, y_pred)
    if y_proba is not None:
        # ROC AUC needs scores/probabilities, not hard labels.
        metrics['auc_roc'] = roc_auc_score(y_true, y_proba)
    print(classification_report(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))
    return metrics
from sklearn.metrics import (
mean_absolute_error, mean_squared_error,
r2_score, mean_absolute_percentage_error
)
def evaluate_regression(y_true, y_pred):
    """Return a dict of standard regression error metrics."""
    mse = mean_squared_error(y_true, y_pred)  # computed once, reused for RMSE
    return {
        'mae': mean_absolute_error(y_true, y_pred),
        'mse': mse,
        'rmse': np.sqrt(mse),  # same units as the target
        'r2': r2_score(y_true, y_pred),
        # NOTE: MAPE blows up when y_true contains zeros.
        'mape': mean_absolute_percentage_error(y_true, y_pred)
    }
from scipy import stats
def calculate_sample_size(baseline_rate, mde, alpha=0.05, power=0.8):
    """Required sample size per variant for a two-proportion A/B test.

    baseline_rate: current conversion rate (e.g. 0.05)
    mde: minimum detectable effect, relative (e.g. 0.1 for a 10% lift)
    alpha: significance level (two-tailed)
    power: desired statistical power

    Returns the per-variant sample size, rounded up to an integer.
    """
    # Relative MDE -> absolute difference in conversion rate.
    effect_size = baseline_rate * mde
    z_alpha = stats.norm.ppf(1 - alpha / 2)  # two-tailed critical value
    z_beta = stats.norm.ppf(power)
    p = baseline_rate
    # Normal-approximation formula with baseline variance p(1-p) for both
    # arms; the original also computed an unused `p_new`, dropped here.
    n = (2 * p * (1 - p) * (z_alpha + z_beta) ** 2) / (effect_size ** 2)
    return int(np.ceil(n))
def analyze_ab_test(control, treatment, alpha=0.05):
    """Two-proportion z-test for an A/B experiment.

    control, treatment: arrays of 0/1 conversion outcomes.
    alpha: significance level (two-tailed).

    Returns rates, relative lift, z statistic, p-value, a significance
    flag, and a (1 - alpha) confidence interval for the rate difference.
    """
    n_control = len(control)
    n_treatment = len(treatment)
    p_control = control.mean()
    p_treatment = treatment.mean()
    # Pooled proportion under H0: both groups share one conversion rate.
    p_pool = (control.sum() + treatment.sum()) / (n_control + n_treatment)
    # Standard error of the difference in proportions.
    se = np.sqrt(p_pool * (1 - p_pool) * (1/n_control + 1/n_treatment))
    z = (p_treatment - p_control) / se
    # Two-tailed p-value.
    p_value = 2 * (1 - stats.norm.cdf(abs(z)))
    # Confidence interval at the requested level; the original hard-coded
    # 1.96 (95% only), ignoring the `alpha` parameter.
    z_crit = stats.norm.ppf(1 - alpha / 2)
    diff = p_treatment - p_control
    ci_low = diff - z_crit * se
    ci_high = diff + z_crit * se
    return {
        'control_rate': p_control,
        'treatment_rate': p_treatment,
        'lift': diff / p_control,
        'z_statistic': z,
        'p_value': p_value,
        'significant': p_value < alpha,
        'confidence_interval': (ci_low, ci_high)
    }
from scipy import stats
# Independent two-sample t-test
def compare_means(group1, group2):
    """Compare two group means with an independent t-test.

    Returns the t statistic, two-tailed p-value, and Cohen's d based on
    the pooled standard deviation of the two groups.
    """
    stat, p_value = stats.ttest_ind(group1, group2)
    pooled_sd = np.sqrt((group1.std() ** 2 + group2.std() ** 2) / 2)
    cohens_d = (group1.mean() - group2.mean()) / pooled_sd
    return {'t_statistic': stat, 'p_value': p_value, 'cohens_d': cohens_d}
# 卡方检验
def test_independence(contingency_table):
chi2, p_value, dof, expected = stats.chi2_contingency(contingency_table)
return {'chi2': chi2, 'p_value': p_value, 'degrees_of_freedom': dof}
# Correlation analysis
def analyze_correlation(x, y):
    """Pearson (linear) and Spearman (rank) correlation between x and y."""
    pearson = stats.pearsonr(x, y)
    spearman = stats.spearmanr(x, y)
    return {
        'pearson': {'r': pearson[0], 'p_value': pearson[1]},
        'spearman': {'r': spearman[0], 'p_value': spearman[1]}
    }
# 数据科学项目:[项目名称]
## 业务目标
[我们要解决什么业务问题?]
## 成功指标
- 主要指标:[指标]
- 次要指标:[指标]
## 数据
- 来源:[列表]
- 规模:[行数/特征数]
- 时间范围:[日期]
## 方法论
1. [步骤 1]
2. [步骤 2]
## 结果
### 模型性能
| 指标 | 值 |
|--------|-------|
| [指标] | [值] |
### 业务影响
- [影响 1]
- [影响 2]
## 建议
1. [建议]
## 后续步骤
- [下一步]
## 附录
[技术细节]
references/ml_algorithms.md - 算法深度解析
references/feature_engineering.md - 特征工程模式
references/experimentation.md - A/B 测试指南
references/statistics.md - 统计方法

# 模型训练器
python scripts/train_model.py --config model_config.yaml
# 特征重要性分析器
python scripts/feature_importance.py --model model.pkl --data test.csv
# A/B 测试分析器
python scripts/ab_analyzer.py --control control.csv --treatment treatment.csv
# 模型评估器
python scripts/evaluate_model.py --model model.pkl --test test.csv
每周安装量
95
仓库
GitHub 星标数
30
首次出现
Jan 24, 2026
安全审计
安装于
opencode70
gemini-cli66
claude-code64
codex62
cursor59
github-copilot58
Expert-level data science for business impact.
PROBLEM DEFINITION → DATA → FEATURES → MODEL → EVALUATION → DEPLOYMENT
1. Problem Definition
- Business objective
- Success metrics
- Constraints
2. Data Collection
- Data sources
- Data quality
- Sample size
3. Feature Engineering
- Feature creation
- Feature selection
- Transformation
4. Model Development
- Algorithm selection
- Training
- Tuning
5. Evaluation
- Metrics
- Validation
- Business impact
6. Deployment
- Production pipeline
- Monitoring
- Iteration
| Algorithm | Use Case | Pros | Cons |
|---|---|---|---|
| Linear Regression | Continuous prediction | Interpretable, fast | Linear relationships only |
| Logistic Regression | Binary classification | Interpretable, probabilistic | Linear boundaries |
| Random Forest | Classification/Regression | Handles non-linearity | Less interpretable |
| XGBoost | Classification/Regression | High accuracy | Overfitting risk |
| Neural Networks | Complex patterns | Flexible | Requires lots of data |
def select_model(problem_type, data_size, interpretability_need, accuracy_need):
    """Recommend an ML algorithm from coarse project requirements.

    problem_type: 'classification' or 'regression'
    data_size: 'small' (<10K), 'medium' (10K-1M), 'large' (>1M)
    interpretability_need: 'high', 'medium', 'low'
    accuracy_need: 'high', 'medium', 'low'
    """
    # Interpretability dominates every other consideration.
    if interpretability_need == 'high':
        return ('Logistic Regression' if problem_type == 'classification'
                else 'Linear Regression')
    # Small datasets overfit flexible models; a forest is a safe default.
    if data_size == 'small':
        return 'Random Forest'
    # Chase accuracy only when the data volume can support it.
    if accuracy_need == 'high':
        return 'Neural Network' if data_size == 'large' else 'XGBoost'
    return 'Random Forest'
# Numerical features
def engineer_numerical(df, col):
    """Derive standard non-linear transforms of one numeric column."""
    src = df[col]
    derived = {}
    derived[f'{col}_log'] = np.log1p(src)  # compresses right-skewed values
    derived[f'{col}_sqrt'] = np.sqrt(src)
    derived[f'{col}_squared'] = src ** 2
    # Equal-width bins encoded as integer labels 0..4.
    derived[f'{col}_binned'] = pd.cut(src, bins=5, labels=False)
    return pd.DataFrame(derived)
# Categorical features
def engineer_categorical(df, col, target_col='target'):
    """Encode one categorical column three complementary ways.

    df: source DataFrame; must contain `col` and `target_col`.
    col: name of the categorical column to encode.
    target_col: numeric label column used for target encoding
        (default 'target' preserves the original hard-coded behavior).

    Returns (one_hot_df, target_encoded_series, frequency_encoded_series).

    NOTE(review): target encoding here uses the full dataset, which leaks
    label information into the feature — fit it on training folds only.
    """
    # One-hot: one indicator column per category level, prefixed by `col`.
    dummies = pd.get_dummies(df[col], prefix=col)
    # Target encoding: each level mapped to its mean label value.
    target_mean = df.groupby(col)[target_col].mean()
    target_encoded = df[col].map(target_mean)
    # Frequency encoding: each level mapped to its relative frequency.
    freq = df[col].value_counts(normalize=True)
    freq_encoded = df[col].map(freq)
    return dummies, target_encoded, freq_encoded
# Time features
def engineer_time(df, col):
    """Expand one datetime-like column into calendar/cyclical features.

    Fix: the original overwrote df[col] with the parsed datetimes,
    mutating the caller's DataFrame; the parsed series now stays local.

    Returns a DataFrame with hour/day/dayofweek/month, a weekend flag,
    and sin/cos encodings that make hour-of-day cyclical (23 is near 0).
    """
    ts = pd.to_datetime(df[col])  # local copy; caller's df is untouched
    hour = ts.dt.hour
    features = {
        f'{col}_hour': hour,
        f'{col}_day': ts.dt.day,
        f'{col}_dayofweek': ts.dt.dayofweek,
        f'{col}_month': ts.dt.month,
        # dayofweek: Monday=0 .. Sunday=6, so 5/6 form the weekend.
        f'{col}_is_weekend': ts.dt.dayofweek.isin([5, 6]).astype(int),
        f'{col}_hour_sin': np.sin(2 * np.pi * hour / 24),
        f'{col}_hour_cos': np.cos(2 * np.pi * hour / 24)
    }
    return pd.DataFrame(features)
from sklearn.feature_selection import mutual_info_classif, RFE
from sklearn.ensemble import RandomForestClassifier
def select_features(X, y, method='importance', n_features=20):
    """Pick the top `n_features` columns of X for predicting y.

    method:
        'importance'  - rank by random-forest feature importances
        'mutual_info' - rank by mutual information with y
        'rfe'         - recursive feature elimination with a forest

    Returns a list of selected column names.
    Raises ValueError for an unknown method (the original silently
    returned None, which crashed callers later).
    """
    if method == 'importance':
        model = RandomForestClassifier(n_estimators=100)
        model.fit(X, y)
        importance = pd.Series(model.feature_importances_, index=X.columns)
        return importance.nlargest(n_features).index.tolist()
    if method == 'mutual_info':
        mi_scores = mutual_info_classif(X, y)
        mi_series = pd.Series(mi_scores, index=X.columns)
        return mi_series.nlargest(n_features).index.tolist()
    if method == 'rfe':
        model = RandomForestClassifier(n_estimators=100)
        rfe = RFE(model, n_features_to_select=n_features)
        rfe.fit(X, y)
        return X.columns[rfe.support_].tolist()
    raise ValueError(f"unknown method: {method!r}")
from sklearn.metrics import (
accuracy_score, precision_score, recall_score,
f1_score, roc_auc_score, confusion_matrix,
classification_report
)
def evaluate_classification(y_true, y_pred, y_proba=None):
    """Compute standard binary-classification metrics.

    Prints the sklearn classification report and confusion matrix as a
    side effect, then returns a dict of scalar metrics. AUC-ROC is
    included only when probability scores are supplied.
    """
    metrics = {}
    metrics['accuracy'] = accuracy_score(y_true, y_pred)
    metrics['precision'] = precision_score(y_true, y_pred)
    metrics['recall'] = recall_score(y_true, y_pred)
    metrics['f1'] = f1_score(y_true, y_pred)
    if y_proba is not None:
        # ROC AUC needs scores/probabilities, not hard labels.
        metrics['auc_roc'] = roc_auc_score(y_true, y_proba)
    print(classification_report(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))
    return metrics
from sklearn.metrics import (
mean_absolute_error, mean_squared_error,
r2_score, mean_absolute_percentage_error
)
def evaluate_regression(y_true, y_pred):
    """Return a dict of standard regression error metrics."""
    mse = mean_squared_error(y_true, y_pred)  # computed once, reused for RMSE
    return {
        'mae': mean_absolute_error(y_true, y_pred),
        'mse': mse,
        'rmse': np.sqrt(mse),  # same units as the target
        'r2': r2_score(y_true, y_pred),
        # NOTE: MAPE blows up when y_true contains zeros.
        'mape': mean_absolute_percentage_error(y_true, y_pred)
    }
from scipy import stats
def calculate_sample_size(baseline_rate, mde, alpha=0.05, power=0.8):
    """Required sample size per variant for a two-proportion A/B test.

    baseline_rate: current conversion rate (e.g. 0.05)
    mde: minimum detectable effect, relative (e.g. 0.1 for a 10% lift)
    alpha: significance level (two-tailed)
    power: desired statistical power

    Returns the per-variant sample size, rounded up to an integer.
    """
    # Relative MDE -> absolute difference in conversion rate.
    effect_size = baseline_rate * mde
    z_alpha = stats.norm.ppf(1 - alpha / 2)  # two-tailed critical value
    z_beta = stats.norm.ppf(power)
    p = baseline_rate
    # Normal-approximation formula with baseline variance p(1-p) for both
    # arms; the original also computed an unused `p_new`, dropped here.
    n = (2 * p * (1 - p) * (z_alpha + z_beta) ** 2) / (effect_size ** 2)
    return int(np.ceil(n))
def analyze_ab_test(control, treatment, alpha=0.05):
    """Two-proportion z-test for an A/B experiment.

    control, treatment: arrays of 0/1 conversion outcomes.
    alpha: significance level (two-tailed).

    Returns rates, relative lift, z statistic, p-value, a significance
    flag, and a (1 - alpha) confidence interval for the rate difference.
    """
    n_control = len(control)
    n_treatment = len(treatment)
    p_control = control.mean()
    p_treatment = treatment.mean()
    # Pooled proportion under H0: both groups share one conversion rate.
    p_pool = (control.sum() + treatment.sum()) / (n_control + n_treatment)
    # Standard error of the difference in proportions.
    se = np.sqrt(p_pool * (1 - p_pool) * (1/n_control + 1/n_treatment))
    z = (p_treatment - p_control) / se
    # Two-tailed p-value.
    p_value = 2 * (1 - stats.norm.cdf(abs(z)))
    # Confidence interval at the requested level; the original hard-coded
    # 1.96 (95% only), ignoring the `alpha` parameter.
    z_crit = stats.norm.ppf(1 - alpha / 2)
    diff = p_treatment - p_control
    ci_low = diff - z_crit * se
    ci_high = diff + z_crit * se
    return {
        'control_rate': p_control,
        'treatment_rate': p_treatment,
        'lift': diff / p_control,
        'z_statistic': z,
        'p_value': p_value,
        'significant': p_value < alpha,
        'confidence_interval': (ci_low, ci_high)
    }
from scipy import stats
# Independent two-sample t-test
def compare_means(group1, group2):
    """Compare two group means with an independent t-test.

    Returns the t statistic, two-tailed p-value, and Cohen's d based on
    the pooled standard deviation of the two groups.
    """
    stat, p_value = stats.ttest_ind(group1, group2)
    pooled_sd = np.sqrt((group1.std() ** 2 + group2.std() ** 2) / 2)
    cohens_d = (group1.mean() - group2.mean()) / pooled_sd
    return {'t_statistic': stat, 'p_value': p_value, 'cohens_d': cohens_d}
# Chi-square
def test_independence(contingency_table):
chi2, p_value, dof, expected = stats.chi2_contingency(contingency_table)
return {'chi2': chi2, 'p_value': p_value, 'degrees_of_freedom': dof}
# Correlation analysis
def analyze_correlation(x, y):
    """Pearson (linear) and Spearman (rank) correlation between x and y."""
    pearson = stats.pearsonr(x, y)
    spearman = stats.spearmanr(x, y)
    return {
        'pearson': {'r': pearson[0], 'p_value': pearson[1]},
        'spearman': {'r': spearman[0], 'p_value': spearman[1]}
    }
# Data Science Project: [Name]
## Business Objective
[What business problem are we solving?]
## Success Metrics
- Primary: [Metric]
- Secondary: [Metric]
## Data
- Sources: [List]
- Size: [Rows/Features]
- Time period: [Dates]
## Methodology
1. [Step 1]
2. [Step 2]
## Results
### Model Performance
| Metric | Value |
|--------|-------|
| [Metric] | [Value] |
### Business Impact
- [Impact 1]
- [Impact 2]
## Recommendations
1. [Recommendation]
## Next Steps
- [Next step]
## Appendix
[Technical details]
references/ml_algorithms.md - Algorithm deep dives
references/feature_engineering.md - Feature engineering patterns
references/experimentation.md - A/B testing guide
references/statistics.md - Statistical methods

# Model trainer
python scripts/train_model.py --config model_config.yaml
# Feature importance analyzer
python scripts/feature_importance.py --model model.pkl --data test.csv
# A/B test analyzer
python scripts/ab_analyzer.py --control control.csv --treatment treatment.csv
# Model evaluator
python scripts/evaluate_model.py --model model.pkl --test test.csv
Weekly Installs
95
Repository
GitHub Stars
30
First Seen
Jan 24, 2026
Security Audits
Gen Agent Trust Hub: Pass · Socket: Pass · Snyk: Pass
Installed on
opencode70
gemini-cli66
claude-code64
codex62
cursor59
github-copilot58
AI Elements:基于shadcn/ui的AI原生应用组件库,快速构建对话界面
66,200 周安装