Classification Modeling by aj-geddes/useful-ai-prompts
npx skills add https://github.com/aj-geddes/useful-ai-prompts --skill 'Classification Modeling'
分类建模用于预测分类目标值，根据输入特征将观测值分配到离散的类别或组别中。
广告位招租
在这里展示您的产品或服务
触达数万 AI 开发者,精准高效
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
confusion_matrix, classification_report, roc_auc_score, roc_curve,
precision_recall_curve, f1_score, accuracy_score
)
import seaborn as sns
# Build a synthetic binary-classification dataset: 1000 samples with 20
# features, of which 10 are informative and 5 are redundant.
from sklearn.datasets import make_classification

np.random.seed(42)
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=10,
    n_redundant=5,
    random_state=42,
)

# Hold out 20% of the rows as the test set; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Standardize features. The scaler learns its statistics from the training
# split only and is then applied, unchanged, to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Logistic Regression, trained and evaluated on the standardized features.
lr_model = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)
y_pred_lr = lr_model.predict(X_test_scaled)
# Second column of predict_proba: the probability assigned to class 1.
y_proba_lr = lr_model.predict_proba(X_test_scaled)[:, 1]

print("Logistic Regression:")
print(classification_report(y_test, y_pred_lr))
lr_auc = roc_auc_score(y_test, y_proba_lr)
print(f"AUC-ROC: {lr_auc:.4f}\n")
# Decision Tree with depth capped at 10, fitted on the unscaled features.
dt_model = DecisionTreeClassifier(
    max_depth=10,
    random_state=42,
).fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)
# Second column of predict_proba: the probability assigned to class 1.
y_proba_dt = dt_model.predict_proba(X_test)[:, 1]

print("Decision Tree:")
print(classification_report(y_test, y_pred_dt))
dt_auc = roc_auc_score(y_test, y_proba_dt)
print(f"AUC-ROC: {dt_auc:.4f}\n")
# Random Forest: 100 trees, each capped at depth 10, on the unscaled features.
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
# Second column of predict_proba: the probability assigned to class 1.
y_proba_rf = rf_model.predict_proba(X_test)[:, 1]

print("Random Forest:")
print(classification_report(y_test, y_pred_rf))
rf_auc = roc_auc_score(y_test, y_proba_rf)
print(f"AUC-ROC: {rf_auc:.4f}\n")
# Gradient Boosting: 100 boosting stages of depth-5 trees, unscaled features.
gb_model = GradientBoostingClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=42,
).fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)
# Second column of predict_proba: the probability assigned to class 1.
y_proba_gb = gb_model.predict_proba(X_test)[:, 1]

print("Gradient Boosting:")
print(classification_report(y_test, y_pred_gb))
gb_auc = roc_auc_score(y_test, y_proba_gb)
print(f"AUC-ROC: {gb_auc:.4f}\n")
# Confusion matrices: one annotated heatmap per model on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
models = [
    (y_pred_lr, 'Logistic Regression'),
    (y_pred_dt, 'Decision Tree'),
    (y_pred_rf, 'Random Forest'),
    (y_pred_gb, 'Gradient Boosting'),
]
# axes.flat walks the grid row by row, so the i-th model lands on the same
# cell as axes[i // 2, i % 2].
for ax, (y_pred, title) in zip(axes.flat, models):
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(title)
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
# Layout and display happen once, after every panel has been drawn.
plt.tight_layout()
plt.show()
# ROC curves: overlay all four models on a single figure.
plt.figure(figsize=(10, 8))
# `probas` is reused by the precision-recall section further down.
probas = [
    (y_proba_lr, 'Logistic Regression'),
    (y_proba_dt, 'Decision Tree'),
    (y_proba_rf, 'Random Forest'),
    (y_proba_gb, 'Gradient Boosting'),
]
for y_proba, label in probas:
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    auc = roc_auc_score(y_test, y_proba)
    plt.plot(fpr, tpr, label=f'{label} (AUC={auc:.4f})')
# Diagonal reference line, drawn once after the per-model curves.
plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves Comparison')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
# Precision-recall curves for every (probabilities, label) pair in `probas`.
plt.figure(figsize=(10, 8))
for y_proba, label in probas:
    precision, recall, _ = precision_recall_curve(y_test, y_proba)
    # The legend reports F1 at a fixed 0.5 decision threshold, not a summary
    # of the whole curve.
    f1 = f1_score(y_test, (y_proba > 0.5).astype(int))
    plt.plot(recall, precision, label=f'{label} (F1={f1:.4f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curves')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
# Feature importance: side-by-side rankings from the random forest and from
# the logistic-regression coefficients.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# Never ask for more bars than there are features; with fewer than 10
# features a fixed range(10) would not match the number of values.
top_n = min(10, X.shape[1])
positions = range(top_n)

# Tree-based importance, sorted from most to least important.
feature_importance_rf = pd.Series(
    rf_model.feature_importances_, index=range(X.shape[1])
).sort_values(ascending=False)
axes[0].barh(positions, feature_importance_rf.values[:top_n])
axes[0].set_yticks(positions)
axes[0].set_yticklabels(
    [f'Feature {i}' for i in feature_importance_rf.index[:top_n]]
)
# Position 0 is the bottom of a barh chart by default; flip the axis so the
# ranking reads top-down.
axes[0].invert_yaxis()
axes[0].set_title(f'Random Forest - Top {top_n} Features')
axes[0].set_xlabel('Importance')

# Logistic-regression coefficients ranked by absolute value (the model was
# fitted on standardized features).
lr_coef = (
    pd.Series(lr_model.coef_[0], index=range(X.shape[1]))
    .abs()
    .sort_values(ascending=False)
)
axes[1].barh(positions, lr_coef.values[:top_n])
axes[1].set_yticks(positions)
axes[1].set_yticklabels([f'Feature {i}' for i in lr_coef.index[:top_n]])
axes[1].invert_yaxis()
axes[1].set_title(f'Logistic Regression - Top {top_n} Features (abs coef)')
axes[1].set_xlabel('Absolute Coefficient')

plt.tight_layout()
plt.show()
# Model comparison: one row per model with accuracy, AUC-ROC and F1 computed
# on the test split.
model_outputs = [
    ('Logistic Regression', y_pred_lr, y_proba_lr),
    ('Decision Tree', y_pred_dt, y_proba_dt),
    ('Random Forest', y_pred_rf, y_proba_rf),
    ('Gradient Boosting', y_pred_gb, y_proba_gb),
]
results = pd.DataFrame({
    'Model': [name for name, _, _ in model_outputs],
    # Accuracy and F1 use the hard predictions; AUC-ROC uses the probabilities.
    'Accuracy': [accuracy_score(y_test, pred) for _, pred, _ in model_outputs],
    'AUC-ROC': [roc_auc_score(y_test, proba) for _, _, proba in model_outputs],
    'F1-Score': [f1_score(y_test, pred) for _, pred, _ in model_outputs],
})
print("Model Comparison:")
print(results)
# Cross-validation: 5-fold ROC AUC on the training split.
# The forest uses the same hyperparameters as `rf_model` (including
# max_depth=10) so the CV estimate describes the model whose test metrics are
# reported elsewhere in this script.
cv_scores = cross_val_score(
    RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42),
    X_train, y_train, cv=5, scoring='roc_auc'
)
print(f"\nCross-validation AUC scores: {cv_scores}")
print(f"Mean CV AUC: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")
# Probability calibration: compare the random forest's predicted
# probabilities with the observed fraction of positives in 10 bins.
from sklearn.calibration import calibration_curve

prob_true, prob_pred = calibration_curve(y_test, y_proba_rf, n_bins=10)

calib_fig, calib_ax = plt.subplots(figsize=(8, 6))
calib_ax.plot(prob_pred, prob_true, 'o-', label='Random Forest')
# Reference diagonal: predicted probability equals observed frequency.
calib_ax.plot([0, 1], [0, 1], 'k--', label='Perfect Calibration')
calib_ax.set_xlabel('Mean Predicted Probability')
calib_ax.set_ylabel('Fraction of Positives')
calib_ax.set_title('Calibration Curve')
calib_ax.legend()
calib_ax.grid(True, alpha=0.3)
plt.show()
每周安装量
0
代码仓库
GitHub 星标数
126
首次出现
1970年1月1日
安全审计
Classification modeling predicts categorical target values, assigning observations to discrete classes or categories based on input features.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
confusion_matrix, classification_report, roc_auc_score, roc_curve,
precision_recall_curve, f1_score, accuracy_score
)
import seaborn as sns
# Build a synthetic binary-classification dataset: 1000 samples with 20
# features, of which 10 are informative and 5 are redundant.
from sklearn.datasets import make_classification

np.random.seed(42)
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=10,
    n_redundant=5,
    random_state=42,
)

# Hold out 20% of the rows as the test set; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Standardize features. The scaler learns its statistics from the training
# split only and is then applied, unchanged, to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Logistic Regression, trained and evaluated on the standardized features.
lr_model = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)
y_pred_lr = lr_model.predict(X_test_scaled)
# Second column of predict_proba: the probability assigned to class 1.
y_proba_lr = lr_model.predict_proba(X_test_scaled)[:, 1]

print("Logistic Regression:")
print(classification_report(y_test, y_pred_lr))
lr_auc = roc_auc_score(y_test, y_proba_lr)
print(f"AUC-ROC: {lr_auc:.4f}\n")
# Decision Tree with depth capped at 10, fitted on the unscaled features.
dt_model = DecisionTreeClassifier(
    max_depth=10,
    random_state=42,
).fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)
# Second column of predict_proba: the probability assigned to class 1.
y_proba_dt = dt_model.predict_proba(X_test)[:, 1]

print("Decision Tree:")
print(classification_report(y_test, y_pred_dt))
dt_auc = roc_auc_score(y_test, y_proba_dt)
print(f"AUC-ROC: {dt_auc:.4f}\n")
# Random Forest: 100 trees, each capped at depth 10, on the unscaled features.
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
# Second column of predict_proba: the probability assigned to class 1.
y_proba_rf = rf_model.predict_proba(X_test)[:, 1]

print("Random Forest:")
print(classification_report(y_test, y_pred_rf))
rf_auc = roc_auc_score(y_test, y_proba_rf)
print(f"AUC-ROC: {rf_auc:.4f}\n")
# Gradient Boosting: 100 boosting stages of depth-5 trees, unscaled features.
gb_model = GradientBoostingClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=42,
).fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)
# Second column of predict_proba: the probability assigned to class 1.
y_proba_gb = gb_model.predict_proba(X_test)[:, 1]

print("Gradient Boosting:")
print(classification_report(y_test, y_pred_gb))
gb_auc = roc_auc_score(y_test, y_proba_gb)
print(f"AUC-ROC: {gb_auc:.4f}\n")
# Confusion matrices: one annotated heatmap per model on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
models = [
    (y_pred_lr, 'Logistic Regression'),
    (y_pred_dt, 'Decision Tree'),
    (y_pred_rf, 'Random Forest'),
    (y_pred_gb, 'Gradient Boosting'),
]
# axes.flat walks the grid row by row, so the i-th model lands on the same
# cell as axes[i // 2, i % 2].
for ax, (y_pred, title) in zip(axes.flat, models):
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(title)
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
# Layout and display happen once, after every panel has been drawn.
plt.tight_layout()
plt.show()
# ROC curves: overlay all four models on a single figure.
plt.figure(figsize=(10, 8))
# `probas` is reused by the precision-recall section further down.
probas = [
    (y_proba_lr, 'Logistic Regression'),
    (y_proba_dt, 'Decision Tree'),
    (y_proba_rf, 'Random Forest'),
    (y_proba_gb, 'Gradient Boosting'),
]
for y_proba, label in probas:
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    auc = roc_auc_score(y_test, y_proba)
    plt.plot(fpr, tpr, label=f'{label} (AUC={auc:.4f})')
# Diagonal reference line, drawn once after the per-model curves.
plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves Comparison')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
# Precision-recall curves for every (probabilities, label) pair in `probas`.
plt.figure(figsize=(10, 8))
for y_proba, label in probas:
    precision, recall, _ = precision_recall_curve(y_test, y_proba)
    # The legend reports F1 at a fixed 0.5 decision threshold, not a summary
    # of the whole curve.
    f1 = f1_score(y_test, (y_proba > 0.5).astype(int))
    plt.plot(recall, precision, label=f'{label} (F1={f1:.4f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curves')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
# Feature importance: side-by-side rankings from the random forest and from
# the logistic-regression coefficients.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# Never ask for more bars than there are features; with fewer than 10
# features a fixed range(10) would not match the number of values.
top_n = min(10, X.shape[1])
positions = range(top_n)

# Tree-based importance, sorted from most to least important.
feature_importance_rf = pd.Series(
    rf_model.feature_importances_, index=range(X.shape[1])
).sort_values(ascending=False)
axes[0].barh(positions, feature_importance_rf.values[:top_n])
axes[0].set_yticks(positions)
axes[0].set_yticklabels(
    [f'Feature {i}' for i in feature_importance_rf.index[:top_n]]
)
# Position 0 is the bottom of a barh chart by default; flip the axis so the
# ranking reads top-down.
axes[0].invert_yaxis()
axes[0].set_title(f'Random Forest - Top {top_n} Features')
axes[0].set_xlabel('Importance')

# Logistic-regression coefficients ranked by absolute value (the model was
# fitted on standardized features).
lr_coef = (
    pd.Series(lr_model.coef_[0], index=range(X.shape[1]))
    .abs()
    .sort_values(ascending=False)
)
axes[1].barh(positions, lr_coef.values[:top_n])
axes[1].set_yticks(positions)
axes[1].set_yticklabels([f'Feature {i}' for i in lr_coef.index[:top_n]])
axes[1].invert_yaxis()
axes[1].set_title(f'Logistic Regression - Top {top_n} Features (abs coef)')
axes[1].set_xlabel('Absolute Coefficient')

plt.tight_layout()
plt.show()
# Model comparison: one row per model with accuracy, AUC-ROC and F1 computed
# on the test split.
model_outputs = [
    ('Logistic Regression', y_pred_lr, y_proba_lr),
    ('Decision Tree', y_pred_dt, y_proba_dt),
    ('Random Forest', y_pred_rf, y_proba_rf),
    ('Gradient Boosting', y_pred_gb, y_proba_gb),
]
results = pd.DataFrame({
    'Model': [name for name, _, _ in model_outputs],
    # Accuracy and F1 use the hard predictions; AUC-ROC uses the probabilities.
    'Accuracy': [accuracy_score(y_test, pred) for _, pred, _ in model_outputs],
    'AUC-ROC': [roc_auc_score(y_test, proba) for _, _, proba in model_outputs],
    'F1-Score': [f1_score(y_test, pred) for _, pred, _ in model_outputs],
})
print("Model Comparison:")
print(results)
# Cross-validation: 5-fold ROC AUC on the training split.
# The forest uses the same hyperparameters as `rf_model` (including
# max_depth=10) so the CV estimate describes the model whose test metrics are
# reported elsewhere in this script.
cv_scores = cross_val_score(
    RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42),
    X_train, y_train, cv=5, scoring='roc_auc'
)
print(f"\nCross-validation AUC scores: {cv_scores}")
print(f"Mean CV AUC: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")
# Probability calibration: compare the random forest's predicted
# probabilities with the observed fraction of positives in 10 bins.
from sklearn.calibration import calibration_curve

prob_true, prob_pred = calibration_curve(y_test, y_proba_rf, n_bins=10)

calib_fig, calib_ax = plt.subplots(figsize=(8, 6))
calib_ax.plot(prob_pred, prob_true, 'o-', label='Random Forest')
# Reference diagonal: predicted probability equals observed frequency.
calib_ax.plot([0, 1], [0, 1], 'k--', label='Perfect Calibration')
calib_ax.set_xlabel('Mean Predicted Probability')
calib_ax.set_ylabel('Fraction of Positives')
calib_ax.set_title('Calibration Curve')
calib_ax.legend()
calib_ax.grid(True, alpha=0.3)
plt.show()
Weekly Installs
0
Repository
GitHub Stars
126
First Seen
Jan 1, 1970
Security Audits
专业SEO审计工具:全面网站诊断、技术SEO优化与页面分析指南
62,600 周安装