ML Model Explanation by aj-geddes/useful-ai-prompts
npx skills add https://github.com/aj-geddes/useful-ai-prompts --skill 'ML Model Explanation'

模型可解释性使机器学习决策变得透明且易于理解,从而建立信任、确保合规性、便于调试,并从预测中获得可操作的见解。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.inspection import partial_dependence, permutation_importance
import warnings
warnings.filterwarnings('ignore')
# === Section 1: feature-importance analysis on a synthetic binary
# classification problem (1000 samples, 20 features, 10 informative). ===
print("=== 1. 特征重要性分析 ===")
# Build the synthetic dataset (fixed seed for reproducibility)
X, y = make_classification(n_samples=1000, n_features=20, n_informative=10,
                           n_redundant=5, random_state=42)
feature_names = [f'Feature_{i}' for i in range(20)]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train the two tree-ensemble models used throughout the script
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(X_train, y_train)
# Compare two different notions of feature importance
print("\n=== 特征重要性对比 ===")
# 1. Impurity-based importance (RandomForest default, derived from training)
impurity_importance = rf_model.feature_importances_
# 2. Permutation importance (model-agnostic, measured on held-out test data)
perm_importance = permutation_importance(rf_model, X_test, y_test, n_repeats=10, random_state=42)
# Side-by-side comparison table, sorted by impurity importance
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Impurity': impurity_importance,
    'Permutation': perm_importance.importances_mean
}).sort_values('Impurity', ascending=False)
print("\n前 10 个最重要的特征(按不纯度排序):")
print(importance_df.head(10)[['Feature', 'Impurity']])
# Section 2: SHAP-like feature attribution (class defined below)
print("\n=== 类 SHAP 特征归因 ===")
class SimpleShapCalculator:
    """Crude Monte-Carlo approximation of SHAP-style feature attributions.

    NOTE(review): this is a didactic sketch, not the Shapley algorithm:
    the baseline is the model output at the *mean* background point,
    "without" replaces excluded features with background row 0 only, and
    each iteration credits the whole marginal contribution jointly to
    every excluded feature. For faithful values use the `shap` package.
    """

    def __init__(self, model, X_background):
        self.model = model
        self.X_background = X_background
        # Baseline: class probabilities at the mean of the background data
        self.baseline = model.predict_proba(X_background.mean(axis=0).reshape(1, -1))[0]

    def predict_difference(self, X_sample):
        """Return the per-class prediction minus the baseline prediction."""
        pred = self.model.predict_proba(X_sample)[0]
        return pred - self.baseline

    def calculate_shap_values(self, X_instance, n_iterations=100):
        """Approximate per-feature attributions for one instance.

        X_instance is assumed to be a 2-D array of shape (1, n_features)
        (it is indexed as X_instance[0, mask]) — callers pass a slice.
        """
        shap_values = np.zeros(X_instance.shape[1])
        n_features = X_instance.shape[1]
        for i in range(n_iterations):
            # Random subset of features to keep (each kept with prob. 0.5)
            subset_mask = np.random.random(n_features) > 0.5
            # X_with keeps the full instance; X_without swaps the excluded
            # features for the first background row's values
            X_with = X_instance.copy()
            X_without = X_instance.copy()
            X_without[0, ~subset_mask] = self.X_background[0, ~subset_mask]
            # Marginal contribution measured on the class-1 probability
            contribution = (self.predict_difference(X_with)[1] -
                            self.predict_difference(X_without)[1])
            # Spread the contribution evenly across all excluded features
            shap_values[~subset_mask] += contribution / n_iterations
        return shap_values
# Use the training set as background data for the SHAP approximation
shap_calc = SimpleShapCalculator(rf_model, X_train)
# Compute approximate SHAP values for a single test sample
sample_idx = 0
shap_vals = shap_calc.calculate_shap_values(X_test[sample_idx:sample_idx+1], n_iterations=50)
print(f"\n样本 {sample_idx} 的 SHAP 值:")
# Rank features by attribution magnitude (largest absolute value first)
shap_df = pd.DataFrame({
    'Feature': feature_names,
    'SHAP_Value': shap_vals
}).sort_values('SHAP_Value', key=abs, ascending=False)
print(shap_df.head(10)[['Feature', 'SHAP_Value']])
# Section 3: partial dependence of the model on its top features
print("\n=== 3. 部分依赖分析 ===")
# Restrict to the three most important features (by impurity ranking)
top_features = importance_df['Feature'].head(3).values
top_feature_indices = [feature_names.index(f) for f in top_features]
pd_data = {}
for feature_idx in top_feature_indices:
    pd_result = partial_dependence(rf_model, X_test, [feature_idx])
    pd_data[feature_names[feature_idx]] = pd_result
print(f"已计算部分依赖的特征: {list(pd_data.keys())}")
# Section 4: LIME — local interpretable model-agnostic explanations
print("\n=== 4. LIME(局部代理模型) ===")
class SimpleLIME:
    """Minimal LIME: explain one prediction with a locally-weighted
    linear surrogate fitted on perturbations around the instance."""

    def __init__(self, model, X_train):
        self.model = model
        self.X_train = X_train
        # The scaler defines the neighbourhood geometry for perturbations
        self.scaler = StandardScaler()
        self.scaler.fit(X_train)

    def explain_instance(self, instance, n_samples=1000, n_features=10):
        """Explain a single prediction with a local linear model.

        instance: 1-D feature vector. Returns a dict with the top
        `n_features` feature names, their absolute surrogate weights,
        and the model's predicted class for the instance.
        """
        # Fix: the surrogate target is a continuous probability, so a
        # LinearRegression is required — the original LogisticRegression
        # raised ValueError ("Unknown label type: 'continuous'").
        from sklearn.linear_model import LinearRegression
        # Generate Gaussian perturbations around the instance in
        # standardized space
        scaled_instance = self.scaler.transform(instance.reshape(1, -1))
        perturbations = np.random.normal(scaled_instance, 0.3, (n_samples, instance.shape[0]))
        # Fix: map perturbations back to the original feature space before
        # querying the model — it was trained on unscaled data
        perturbations_original = self.scaler.inverse_transform(perturbations)
        predictions = self.model.predict_proba(perturbations_original)[:, 1]
        # Proximity kernel: closer perturbations get larger weights
        distances = np.sum((perturbations - scaled_instance) ** 2, axis=1)
        weights = np.exp(-distances)
        # Weighted linear surrogate fitted in the scaled space
        local_model = LinearRegression()
        local_model.fit(perturbations, predictions, sample_weight=weights)
        # Rank features by absolute surrogate coefficient
        # (LinearRegression.coef_ is 1-D for a single target)
        feature_weights = np.abs(local_model.coef_)
        top_indices = np.argsort(feature_weights)[-n_features:]
        return {
            'features': [feature_names[i] for i in top_indices],
            'weights': feature_weights[top_indices],
            'prediction': self.model.predict(instance.reshape(1, -1))[0]
        }
# Explain the random forest's prediction for the first test sample
lime = SimpleLIME(rf_model, X_train)
lime_explanation = lime.explain_instance(X_test[0])
print(f"\n样本 0 的 LIME 解释:")
for feat, weight in zip(lime_explanation['features'], lime_explanation['weights']):
    print(f" {feat}: {weight:.4f}")
# Section 5: a shallow decision tree as an inherently interpretable model
print("\n=== 5. 决策树解释 ===")
# depth=3 keeps the tree small enough to read/visualize directly
small_tree = DecisionTreeClassifier(max_depth=3, random_state=42)
small_tree.fit(X_train, y_train)
print(f"决策树(深度=3)已训练")
print(f"树模型准确率: {small_tree.score(X_test, y_test):.4f}")
# Section 6: model-agnostic global behaviour (class defined below)
print("\n=== 6. 全局模型行为 ===")
class GlobalExplainer:
    """Model-agnostic global diagnostics built on `predict_proba`."""

    def __init__(self, model):
        self.model = model

    def get_prediction_distribution(self, X):
        """Summarise the predicted-probability distribution over X."""
        proba = self.model.predict_proba(X)
        class0, class1 = proba[:, 0], proba[:, 1]
        return {
            'class_0_mean': class0.mean(),
            'class_1_mean': class1.mean(),
            'class_1_std': class1.std()
        }

    def feature_sensitivity(self, X, feature_idx, n_perturbations=10):
        """Mean class-1 prediction shift when one feature column is
        resampled at increasing noise levels; one shift per level."""
        baseline = self.model.predict_proba(X)[:, 1].mean()
        shifts = []
        for level in np.linspace(0.1, 1.0, n_perturbations):
            noisy = X.copy()
            # Replace the column with draws from N(col_mean, (col_std*level)^2)
            noisy[:, feature_idx] = np.random.normal(
                X[:, feature_idx].mean(),
                X[:, feature_idx].std() * level,
                len(X)
            )
            shifted = self.model.predict_proba(noisy)[:, 1].mean()
            shifts.append(abs(shifted - baseline))
        return np.array(shifts)
# Run the global diagnostics on the random forest
explainer = GlobalExplainer(rf_model)
pred_dist = explainer.get_prediction_distribution(X_test)
print(f"\n预测分布:")
print(f" 类别 0 的平均概率: {pred_dist['class_0_mean']:.4f}")
print(f" 类别 1 的平均概率: {pred_dist['class_1_mean']:.4f}")
# Section 7: 2x3 grid of explainability visualisations
print("\n=== 7. 可解释性可视化 ===")
fig, axes = plt.subplots(2, 3, figsize=(16, 10))
# (0,0) Top-10 impurity-based feature importances as a horizontal bar chart
top_features_plot = importance_df.head(10)
axes[0, 0].barh(top_features_plot['Feature'], top_features_plot['Impurity'], color='steelblue')
axes[0, 0].set_xlabel('重要性分数')
axes[0, 0].set_title('特征重要性(随机森林)')
axes[0, 0].invert_yaxis()
# (0,1) Scatter: permutation importance vs impurity importance per feature
axes[0, 1].scatter(importance_df['Impurity'], importance_df['Permutation'], alpha=0.6)
axes[0, 1].set_xlabel('不纯度重要性')
axes[0, 1].set_ylabel('置换重要性')
axes[0, 1].set_title('特征重要性方法对比')
axes[0, 1].grid(True, alpha=0.3)
# (0,2) Approximate SHAP values for sample 0 (red = negative, green = positive)
shap_sorted = shap_df.head(10).sort_values('SHAP_Value')
colors = ['red' if x < 0 else 'green' for x in shap_sorted['SHAP_Value']]
axes[0, 2].barh(shap_sorted['Feature'], shap_sorted['SHAP_Value'], color=colors)
axes[0, 2].set_xlabel('SHAP 值')
axes[0, 2].set_title('样本 0 的 SHAP 值')
axes[0, 2].axvline(x=0, color='black', linestyle='--', linewidth=0.8)
# (1,0) Manual partial-dependence curve for the top feature: sweep the
# feature over its observed range and average the class-1 prediction
feature_0_idx = top_feature_indices[0]
feature_0_values = np.linspace(X_test[:, feature_0_idx].min(), X_test[:, feature_0_idx].max(), 50)
predictions_pd = []
for val in feature_0_values:
    X_temp = X_test.copy()
    X_temp[:, feature_0_idx] = val
    pred = rf_model.predict_proba(X_temp)[:, 1].mean()
    predictions_pd.append(pred)
axes[1, 0].plot(feature_0_values, predictions_pd, linewidth=2, color='purple')
axes[1, 0].set_xlabel(feature_names[feature_0_idx])
axes[1, 0].set_ylabel('平均预测(类别 1)')
axes[1, 0].set_title('部分依赖图')
axes[1, 0].grid(True, alpha=0.3)
# (1,1) Histogram of the model's predicted class-1 probabilities
pred_proba = rf_model.predict_proba(X_test)[:, 1]
axes[1, 1].hist(pred_proba, bins=30, color='coral', edgecolor='black', alpha=0.7)
axes[1, 1].set_xlabel('预测概率(类别 1)')
axes[1, 1].set_ylabel('频率')
axes[1, 1].set_title('预测分布')
axes[1, 1].grid(True, alpha=0.3, axis='y')
# (1,2) Mean perturbation sensitivity for the first (up to) 5 features
sensitivities = []
for feat_idx in range(min(5, X_test.shape[1])):
    sensitivity = explainer.feature_sensitivity(X_test, feat_idx, n_perturbations=5)
    sensitivities.append(sensitivity.mean())
axes[1, 2].bar(range(min(5, X_test.shape[1])), sensitivities, color='lightgreen', edgecolor='black')
axes[1, 2].set_xticks(range(min(5, X_test.shape[1])))
axes[1, 2].set_xticklabels([f'F{i}' for i in range(min(5, X_test.shape[1]))])
axes[1, 2].set_ylabel('平均敏感性')
axes[1, 2].set_title('特征对扰动的敏感性')
axes[1, 2].grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.savefig('model_explainability.png', dpi=100, bbox_inches='tight')
print("\n可视化已保存为 'model_explainability.png'")
# Section 8: textual summary of the analysis
print("\n=== 可解释性总结 ===")
print(f"分析的总特征数: {len(feature_names)}")
print(f"最重要的特征: {importance_df.iloc[0]['Feature']}")
print(f"重要性分数: {importance_df.iloc[0]['Impurity']:.4f}")
print(f"模型准确率: {rf_model.score(X_test, y_test):.4f}")
print(f"平均预测置信度: {pred_proba.mean():.4f}")
print("\nML 模型解释设置完成!")
广告位招租
在这里展示您的产品或服务
触达数万 AI 开发者,精准高效
每周安装量
0
代码仓库
GitHub 星标数
126
首次出现
1970年1月1日
安全审计
Model explainability makes machine learning decisions transparent and interpretable, enabling trust, compliance, debugging, and actionable insights from predictions.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.inspection import partial_dependence, permutation_importance
import warnings
warnings.filterwarnings('ignore')
# === Section 1: feature-importance analysis on a synthetic binary
# classification problem (1000 samples, 20 features, 10 informative). ===
print("=== 1. Feature Importance Analysis ===")
# Create dataset (fixed seed for reproducibility)
X, y = make_classification(n_samples=1000, n_features=20, n_informative=10,
                           n_redundant=5, random_state=42)
feature_names = [f'Feature_{i}' for i in range(20)]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train the two tree-ensemble models used throughout the script
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(X_train, y_train)
# Compare two different notions of feature importance
print("\n=== Feature Importance Comparison ===")
# 1. Impurity-based importance (RandomForest default, derived from training)
impurity_importance = rf_model.feature_importances_
# 2. Permutation importance (model-agnostic, measured on held-out test data)
perm_importance = permutation_importance(rf_model, X_test, y_test, n_repeats=10, random_state=42)
# Side-by-side comparison table, sorted by impurity importance
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Impurity': impurity_importance,
    'Permutation': perm_importance.importances_mean
}).sort_values('Impurity', ascending=False)
print("\nTop 10 Most Important Features (by Impurity):")
print(importance_df.head(10)[['Feature', 'Impurity']])
# Section 2: SHAP-like feature attribution (class defined below)
print("\n=== SHAP-like Feature Attribution ===")
class SimpleShapCalculator:
    """Crude Monte-Carlo approximation of SHAP-style feature attributions.

    NOTE(review): this is a didactic sketch, not the Shapley algorithm:
    the baseline is the model output at the *mean* background point,
    "without" replaces excluded features with background row 0 only, and
    each iteration credits the whole marginal contribution jointly to
    every excluded feature. For faithful values use the `shap` package.
    """

    def __init__(self, model, X_background):
        self.model = model
        self.X_background = X_background
        # Baseline: class probabilities at the mean of the background data
        self.baseline = model.predict_proba(X_background.mean(axis=0).reshape(1, -1))[0]

    def predict_difference(self, X_sample):
        """Get per-class prediction difference from the baseline."""
        pred = self.model.predict_proba(X_sample)[0]
        return pred - self.baseline

    def calculate_shap_values(self, X_instance, n_iterations=100):
        """Approximate per-feature attributions for one instance.

        X_instance is assumed to be a 2-D array of shape (1, n_features)
        (it is indexed as X_instance[0, mask]) — callers pass a slice.
        """
        shap_values = np.zeros(X_instance.shape[1])
        n_features = X_instance.shape[1]
        for i in range(n_iterations):
            # Random subset of features to keep (each kept with prob. 0.5)
            subset_mask = np.random.random(n_features) > 0.5
            # X_with keeps the full instance; X_without swaps the excluded
            # features for the first background row's values
            X_with = X_instance.copy()
            X_without = X_instance.copy()
            X_without[0, ~subset_mask] = self.X_background[0, ~subset_mask]
            # Marginal contribution measured on the class-1 probability
            contribution = (self.predict_difference(X_with)[1] -
                            self.predict_difference(X_without)[1])
            # Spread the contribution evenly across all excluded features
            shap_values[~subset_mask] += contribution / n_iterations
        return shap_values
# Use the training set as background data for the SHAP approximation
shap_calc = SimpleShapCalculator(rf_model, X_train)
# Calculate approximate SHAP values for a single test sample
sample_idx = 0
shap_vals = shap_calc.calculate_shap_values(X_test[sample_idx:sample_idx+1], n_iterations=50)
print(f"\nSHAP Values for Sample {sample_idx}:")
# Rank features by attribution magnitude (largest absolute value first)
shap_df = pd.DataFrame({
    'Feature': feature_names,
    'SHAP_Value': shap_vals
}).sort_values('SHAP_Value', key=abs, ascending=False)
print(shap_df.head(10)[['Feature', 'SHAP_Value']])
# Section 3: partial dependence of the model on its top features
print("\n=== 3. Partial Dependence Analysis ===")
# Restrict to the three most important features (by impurity ranking)
top_features = importance_df['Feature'].head(3).values
top_feature_indices = [feature_names.index(f) for f in top_features]
pd_data = {}
for feature_idx in top_feature_indices:
    pd_result = partial_dependence(rf_model, X_test, [feature_idx])
    pd_data[feature_names[feature_idx]] = pd_result
print(f"Partial dependence calculated for features: {list(pd_data.keys())}")
# Section 4: LIME — local interpretable model-agnostic explanations
print("\n=== 4. LIME (Local Surrogate Model) ===")
class SimpleLIME:
    """Minimal LIME: explain one prediction with a locally-weighted
    linear surrogate fitted on perturbations around the instance."""

    def __init__(self, model, X_train):
        self.model = model
        self.X_train = X_train
        # The scaler defines the neighbourhood geometry for perturbations
        self.scaler = StandardScaler()
        self.scaler.fit(X_train)

    def explain_instance(self, instance, n_samples=1000, n_features=10):
        """Explain a single prediction with a local linear model.

        instance: 1-D feature vector. Returns a dict with the top
        `n_features` feature names, their absolute surrogate weights,
        and the model's predicted class for the instance.
        """
        # Fix: the surrogate target is a continuous probability, so a
        # LinearRegression is required — the original LogisticRegression
        # raised ValueError ("Unknown label type: 'continuous'").
        from sklearn.linear_model import LinearRegression
        # Generate Gaussian perturbations around the instance in
        # standardized space
        scaled_instance = self.scaler.transform(instance.reshape(1, -1))
        perturbations = np.random.normal(scaled_instance, 0.3, (n_samples, instance.shape[0]))
        # Fix: map perturbations back to the original feature space before
        # querying the model — it was trained on unscaled data
        perturbations_original = self.scaler.inverse_transform(perturbations)
        predictions = self.model.predict_proba(perturbations_original)[:, 1]
        # Proximity kernel: closer perturbations get larger weights
        distances = np.sum((perturbations - scaled_instance) ** 2, axis=1)
        weights = np.exp(-distances)
        # Weighted linear surrogate fitted in the scaled space
        local_model = LinearRegression()
        local_model.fit(perturbations, predictions, sample_weight=weights)
        # Rank features by absolute surrogate coefficient
        # (LinearRegression.coef_ is 1-D for a single target)
        feature_weights = np.abs(local_model.coef_)
        top_indices = np.argsort(feature_weights)[-n_features:]
        return {
            'features': [feature_names[i] for i in top_indices],
            'weights': feature_weights[top_indices],
            'prediction': self.model.predict(instance.reshape(1, -1))[0]
        }
# Explain the random forest's prediction for the first test sample
lime = SimpleLIME(rf_model, X_train)
lime_explanation = lime.explain_instance(X_test[0])
print(f"\nLIME Explanation for Sample 0:")
for feat, weight in zip(lime_explanation['features'], lime_explanation['weights']):
    print(f" {feat}: {weight:.4f}")
# Section 5: a shallow decision tree as an inherently interpretable model
print("\n=== 5. Decision Tree Interpretation ===")
# depth=3 keeps the tree small enough to read/visualize directly
small_tree = DecisionTreeClassifier(max_depth=3, random_state=42)
small_tree.fit(X_train, y_train)
print(f"Decision Tree (depth=3) trained")
print(f"Tree accuracy: {small_tree.score(X_test, y_test):.4f}")
# Section 6: model-agnostic global behaviour (class defined below)
print("\n=== 6. Global Model Behavior ===")
class GlobalExplainer:
    """Model-agnostic global diagnostics built on `predict_proba`."""

    def __init__(self, model):
        self.model = model

    def get_prediction_distribution(self, X):
        """Summarise the predicted-probability distribution over X."""
        proba = self.model.predict_proba(X)
        return {
            'class_0_mean': proba[:, 0].mean(),
            'class_1_mean': proba[:, 1].mean(),
            'class_1_std': proba[:, 1].std()
        }

    def feature_sensitivity(self, X, feature_idx, n_perturbations=10):
        """Mean class-1 prediction shift when one feature column is
        resampled at increasing noise levels; one shift per level."""
        baseline = self.model.predict_proba(X)[:, 1].mean()
        shifts = []
        for noise_level in np.linspace(0.1, 1.0, n_perturbations):
            resampled = X.copy()
            # Replace the column with draws from N(col_mean, (col_std*level)^2)
            resampled[:, feature_idx] = np.random.normal(
                X[:, feature_idx].mean(),
                X[:, feature_idx].std() * noise_level,
                len(X)
            )
            moved = self.model.predict_proba(resampled)[:, 1].mean()
            shifts.append(abs(moved - baseline))
        return np.array(shifts)
# Run the global diagnostics on the random forest
explainer = GlobalExplainer(rf_model)
pred_dist = explainer.get_prediction_distribution(X_test)
print(f"\nPrediction Distribution:")
print(f" Class 0 mean probability: {pred_dist['class_0_mean']:.4f}")
print(f" Class 1 mean probability: {pred_dist['class_1_mean']:.4f}")
# Section 7: 2x3 grid of explainability visualisations
# Fix: corrected typo "Explanability" in the user-facing header
print("\n=== 7. Explainability Visualizations ===")
fig, axes = plt.subplots(2, 3, figsize=(16, 10))
# (0,0) Top-10 impurity-based feature importances as a horizontal bar chart
top_features_plot = importance_df.head(10)
axes[0, 0].barh(top_features_plot['Feature'], top_features_plot['Impurity'], color='steelblue')
axes[0, 0].set_xlabel('Importance Score')
axes[0, 0].set_title('Feature Importance (Random Forest)')
axes[0, 0].invert_yaxis()
# (0,1) Scatter: permutation importance vs impurity importance per feature
axes[0, 1].scatter(importance_df['Impurity'], importance_df['Permutation'], alpha=0.6)
axes[0, 1].set_xlabel('Impurity Importance')
axes[0, 1].set_ylabel('Permutation Importance')
axes[0, 1].set_title('Feature Importance Methods Comparison')
axes[0, 1].grid(True, alpha=0.3)
# (0,2) Approximate SHAP values for sample 0 (red = negative, green = positive)
shap_sorted = shap_df.head(10).sort_values('SHAP_Value')
colors = ['red' if x < 0 else 'green' for x in shap_sorted['SHAP_Value']]
axes[0, 2].barh(shap_sorted['Feature'], shap_sorted['SHAP_Value'], color=colors)
axes[0, 2].set_xlabel('SHAP Value')
axes[0, 2].set_title('SHAP Values for Sample 0')
axes[0, 2].axvline(x=0, color='black', linestyle='--', linewidth=0.8)
# (1,0) Manual partial-dependence curve for the top feature: sweep the
# feature over its observed range and average the class-1 prediction
feature_0_idx = top_feature_indices[0]
feature_0_values = np.linspace(X_test[:, feature_0_idx].min(), X_test[:, feature_0_idx].max(), 50)
predictions_pd = []
for val in feature_0_values:
    X_temp = X_test.copy()
    X_temp[:, feature_0_idx] = val
    pred = rf_model.predict_proba(X_temp)[:, 1].mean()
    predictions_pd.append(pred)
axes[1, 0].plot(feature_0_values, predictions_pd, linewidth=2, color='purple')
axes[1, 0].set_xlabel(feature_names[feature_0_idx])
axes[1, 0].set_ylabel('Average Prediction (Class 1)')
axes[1, 0].set_title('Partial Dependence Plot')
axes[1, 0].grid(True, alpha=0.3)
# (1,1) Histogram of the model's predicted class-1 probabilities
pred_proba = rf_model.predict_proba(X_test)[:, 1]
axes[1, 1].hist(pred_proba, bins=30, color='coral', edgecolor='black', alpha=0.7)
axes[1, 1].set_xlabel('Predicted Probability (Class 1)')
axes[1, 1].set_ylabel('Frequency')
axes[1, 1].set_title('Prediction Distribution')
axes[1, 1].grid(True, alpha=0.3, axis='y')
# (1,2) Mean perturbation sensitivity for the first (up to) 5 features
sensitivities = []
for feat_idx in range(min(5, X_test.shape[1])):
    sensitivity = explainer.feature_sensitivity(X_test, feat_idx, n_perturbations=5)
    sensitivities.append(sensitivity.mean())
axes[1, 2].bar(range(min(5, X_test.shape[1])), sensitivities, color='lightgreen', edgecolor='black')
axes[1, 2].set_xticks(range(min(5, X_test.shape[1])))
axes[1, 2].set_xticklabels([f'F{i}' for i in range(min(5, X_test.shape[1]))])
axes[1, 2].set_ylabel('Average Sensitivity')
axes[1, 2].set_title('Feature Sensitivity to Perturbations')
axes[1, 2].grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.savefig('model_explainability.png', dpi=100, bbox_inches='tight')
print("\nVisualization saved as 'model_explainability.png'")
# Section 8: textual summary of the analysis
print("\n=== Explainability Summary ===")
print(f"Total Features Analyzed: {len(feature_names)}")
print(f"Most Important Feature: {importance_df.iloc[0]['Feature']}")
print(f"Importance Score: {importance_df.iloc[0]['Impurity']:.4f}")
print(f"Model Accuracy: {rf_model.score(X_test, y_test):.4f}")
print(f"Average Prediction Confidence: {pred_proba.mean():.4f}")
print("\nML model explanation setup completed!")
Weekly Installs
0
Repository
GitHub Stars
126
First Seen
Jan 1, 1970
Security Audits
专业SEO审计工具:全面网站诊断、技术SEO优化与页面分析指南
61,300 周安装