Correlation Analysis by aj-geddes/useful-ai-prompts
npx skills add https://github.com/aj-geddes/useful-ai-prompts --skill 'Correlation Analysis'

相关性分析用于衡量变量之间关系的强度和方向,有助于识别哪些特征相关以及检测多重共线性。
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr, spearmanr, kendalltau
# Synthetic dataset with built-in correlations for the demo.
np.random.seed(42)
num_points = 200
age = np.random.uniform(20, 70, num_points)
income = age * 2000 + np.random.normal(0, 10000, num_points)
education_years = age / 2 + np.random.normal(0, 3, num_points)
satisfaction = income / 50000 + np.random.normal(0, 0.5, num_points)
# Assemble the frame from a named-column mapping.
columns = {
    'age': age,
    'income': income,
    'education_years': education_years,
    'satisfaction': satisfaction,
    'years_employed': age - education_years - 6,
}
df = pd.DataFrame(columns)
# Pearson correlation matrix (linear association).
corr_matrix = df.corr(method='pearson')
print("Pearson 相关性矩阵:")
print(corr_matrix)

# One pair with a significance test.
age_series, income_series = df['age'], df['income']
corr_coef, p_value = pearsonr(age_series, income_series)
print(f"\nPearson 相关性 (年龄 vs 收入): r={corr_coef:.4f}, p-value={p_value:.4f}")

# Spearman correlation matrix (rank-based, robust to monotone non-linearity).
spearman_matrix = df.corr(method='spearman')
print("\nSpearman 相关性矩阵:")
print(spearman_matrix)
spearman_coef, p_value = spearmanr(age_series, income_series)
print(f"Spearman 相关性 (年龄 vs 收入): rho={spearman_coef:.4f}, p-value={p_value:.4f}")

# Kendall tau (ordinal concordance).
kendall_coef, p_value = kendalltau(age_series, income_series)
print(f"Kendall 相关性 (年龄 vs 收入): tau={kendall_coef:.4f}, p-value={p_value:.4f}")
# Side-by-side heatmaps of the two correlation matrices.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
heatmap_specs = [
    (corr_matrix, 'Pearson 相关性热图'),
    (spearman_matrix, 'Spearman 相关性热图'),
]
for ax, (matrix, title) in zip(axes, heatmap_specs):
    sns.heatmap(matrix, annot=True, cmap='coolwarm', center=0,
                square=True, ax=ax, vmin=-1, vmax=1)
    ax.set_title(title)
plt.tight_layout()
plt.show()
# 带显著性检验的相关性
def correlation_with_pvalue(df):
    """Return pairwise Pearson correlations with significance flags.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame of numeric columns; every unordered column pair is tested once.

    Returns
    -------
    pandas.DataFrame
        One row per pair with columns 'Variable 1', 'Variable 2',
        'Correlation', 'P-value' and 'Significant' ('Yes' when p < 0.05).
    """
    # Fix: the original also created an unused `cols` list; removed.
    rows = []
    for col1 in df.columns:
        for col2 in df.columns:
            # Lexicographic guard visits each unordered pair exactly once
            # and skips self-pairs.
            if col1 < col2:
                r, p = pearsonr(df[col1], df[col2])
                rows.append({
                    'Variable 1': col1,
                    'Variable 2': col2,
                    'Correlation': r,
                    'P-value': p,
                    'Significant': 'Yes' if p < 0.05 else 'No'
                })
    return pd.DataFrame(rows)
# Pairwise correlation table with p-values.
corr_table = correlation_with_pvalue(df)
print("\n带 P 值的相关性:")
print(corr_table)

# Scatter plots with least-squares trend lines for selected pairs.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
pairs = [('age', 'income'), ('age', 'education_years'),
         ('income', 'satisfaction'), ('education_years', 'years_employed')]
for ax, (var1, var2) in zip(axes.ravel(), pairs):
    xs, ys = df[var1], df[var2]
    ax.scatter(xs, ys, alpha=0.5)
    # Degree-1 fit gives slope and intercept of the trend line.
    slope, intercept = np.polyfit(xs, ys, 1)
    x_line = np.linspace(xs.min(), xs.max(), 100)
    ax.plot(x_line, slope * x_line + intercept, "r--", linewidth=2)
    r, p_val = pearsonr(xs, ys)
    ax.set_title(f'{var1} vs {var2}\nr={r:.4f}, p={p_val:.4f}')
    ax.set_xlabel(var1)
    ax.set_ylabel(var2)
    ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Multicollinearity check via variance inflation factors.
from statsmodels.stats.outliers_influence import variance_inflation_factor

predictors = df[['age', 'education_years', 'years_employed']]
vif_data = pd.DataFrame({
    'Variable': predictors.columns,
    'VIF': [variance_inflation_factor(predictors.values, i)
            for i in range(predictors.shape[1])],
})
print("\n方差膨胀因子 (VIF):")
print(vif_data)
print("\nVIF > 10: 高度多重共线性")
print("VIF > 5: 中度多重共线性")
# 偏相关(控制混杂因素)
def partial_correlation(df, x, y, control_vars):
    """Partial Pearson correlation between *x* and *y* given controls.

    Regresses both variables on the control columns (with an intercept)
    and correlates the residuals.

    Parameters
    ----------
    df : pandas.DataFrame
    x, y : str
        Column names of the two variables of interest.
    control_vars : list[str]
        One or more column names to partial out.

    Returns
    -------
    float
        Pearson correlation coefficient of the residuals.
    """
    # Bug fix: np.polyfit requires a 1-D predictor, but df[control_vars].values
    # is always 2-D (n, k), so the original raised TypeError even with a
    # single control variable. Ordinary least squares via lstsq handles any
    # number of controls. (Also dropped the unused scipy linregress import.)
    Z = np.column_stack([np.ones(len(df)), df[control_vars].to_numpy()])
    beta_x, *_ = np.linalg.lstsq(Z, df[x].to_numpy(), rcond=None)
    beta_y, *_ = np.linalg.lstsq(Z, df[y].to_numpy(), rcond=None)
    x_residuals = df[x].to_numpy() - Z @ beta_x
    y_residuals = df[y].to_numpy() - Z @ beta_y
    return pearsonr(x_residuals, y_residuals)[0]
# Partial correlation of income vs satisfaction, controlling for age.
partial_corr = partial_correlation(df, 'income', 'satisfaction', ['age'])
print(f"\n偏相关 (收入 vs 满意度, 控制年龄): {partial_corr:.4f}")

# Distance correlation captures non-linear dependence (optional dependency).
try:
    from dcor import distance_correlation
except ImportError:
    print("未安装 dcor 库,无法计算距离相关性")
else:
    dist_corr = distance_correlation(df['age'], df['income'])
    print(f"距离相关性 (年龄 vs 收入): {dist_corr:.4f}")

# Rolling-window correlation to gauge stability across the index.
fig, ax = plt.subplots(figsize=(12, 5))
rolling_corr = df['age'].rolling(window=50).corr(df['income'])
ax.plot(rolling_corr.index, rolling_corr.values)
ax.set_title('滚动相关性 (年龄 vs 收入, 窗口=50)')
ax.set_ylabel('相关系数')
ax.grid(True, alpha=0.3)
plt.show()
广告位招租
在这里展示您的产品或服务
触达数万 AI 开发者,精准高效
每周安装量
0
代码仓库
GitHub 星标数
126
首次出现时间
Jan 1, 1970
安全审计
Correlation analysis measures the strength and direction of relationships between variables, helping identify which features are related and detect multicollinearity.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr, spearmanr, kendalltau
# Build a synthetic dataset whose variables are correlated by construction.
np.random.seed(42)
sample_size = 200
age = np.random.uniform(20, 70, sample_size)
income = age * 2000 + np.random.normal(0, 10000, sample_size)
education_years = age / 2 + np.random.normal(0, 3, sample_size)
satisfaction = income / 50000 + np.random.normal(0, 0.5, sample_size)
df = pd.DataFrame(
    {
        'age': age,
        'income': income,
        'education_years': education_years,
        'satisfaction': satisfaction,
        'years_employed': age - education_years - 6,
    }
)
# Full Pearson correlation matrix (linear association).
corr_matrix = df.corr(method='pearson')
print("Pearson Correlation Matrix:")
print(corr_matrix)

# Single age/income pair with a significance test.
age_col, income_col = df['age'], df['income']
corr_coef, p_value = pearsonr(age_col, income_col)
print(f"\nPearson correlation (age vs income): r={corr_coef:.4f}, p-value={p_value:.4f}")

# Spearman matrix (rank-based, robust to monotone non-linearity).
spearman_matrix = df.corr(method='spearman')
print("\nSpearman Correlation Matrix:")
print(spearman_matrix)
spearman_coef, p_value = spearmanr(age_col, income_col)
print(f"Spearman correlation (age vs income): rho={spearman_coef:.4f}, p-value={p_value:.4f}")

# Kendall tau (ordinal concordance).
kendall_coef, p_value = kendalltau(age_col, income_col)
print(f"Kendall correlation (age vs income): tau={kendall_coef:.4f}, p-value={p_value:.4f}")
# Render both correlation matrices as annotated heatmaps, side by side.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
for ax, (matrix, title) in zip(
    axes,
    [(corr_matrix, 'Pearson Correlation Heatmap'),
     (spearman_matrix, 'Spearman Correlation Heatmap')],
):
    sns.heatmap(matrix, annot=True, cmap='coolwarm', center=0,
                square=True, ax=ax, vmin=-1, vmax=1)
    ax.set_title(title)
plt.tight_layout()
plt.show()
# Correlation with significance testing
def correlation_with_pvalue(df):
    """Compute pairwise Pearson correlations with significance flags.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame of numeric columns; each unordered column pair is tested once.

    Returns
    -------
    pandas.DataFrame
        One row per pair: 'Variable 1', 'Variable 2', 'Correlation',
        'P-value', and 'Significant' ('Yes' when p < 0.05).
    """
    # Fix: dropped the unused `cols` list the original allocated.
    rows = []
    for col1 in df.columns:
        for col2 in df.columns:
            # Lexicographic guard keeps exactly one copy of each pair
            # and excludes self-pairs.
            if col1 < col2:
                r, p = pearsonr(df[col1], df[col2])
                rows.append({
                    'Variable 1': col1,
                    'Variable 2': col2,
                    'Correlation': r,
                    'P-value': p,
                    'Significant': 'Yes' if p < 0.05 else 'No'
                })
    return pd.DataFrame(rows)
# Tabulate pairwise correlations with p-values.
corr_table = correlation_with_pvalue(df)
print("\nCorrelation with P-values:")
print(corr_table)

# Scatter plots plus least-squares trend lines for selected pairs.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
pairs = [('age', 'income'), ('age', 'education_years'),
         ('income', 'satisfaction'), ('education_years', 'years_employed')]
for ax, (var1, var2) in zip(axes.ravel(), pairs):
    xs, ys = df[var1], df[var2]
    ax.scatter(xs, ys, alpha=0.5)
    # Degree-1 polyfit yields the slope/intercept of the regression line.
    slope, intercept = np.polyfit(xs, ys, 1)
    x_line = np.linspace(xs.min(), xs.max(), 100)
    ax.plot(x_line, slope * x_line + intercept, "r--", linewidth=2)
    r, p_val = pearsonr(xs, ys)
    ax.set_title(f'{var1} vs {var2}\nr={r:.4f}, p={p_val:.4f}')
    ax.set_xlabel(var1)
    ax.set_ylabel(var2)
    ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Detect multicollinearity among predictors via variance inflation factors.
from statsmodels.stats.outliers_influence import variance_inflation_factor

predictors = df[['age', 'education_years', 'years_employed']]
vif_data = pd.DataFrame({
    'Variable': predictors.columns,
    'VIF': [variance_inflation_factor(predictors.values, i)
            for i in range(predictors.shape[1])],
})
print("\nVariance Inflation Factor (VIF):")
print(vif_data)
print("\nVIF > 10: High multicollinearity")
print("VIF > 5: Moderate multicollinearity")
# Partial correlation (controlling for confounding)
def partial_correlation(df, x, y, control_vars):
    """Partial Pearson correlation between *x* and *y*, netting out controls.

    Both variables are regressed on the control columns (with an intercept);
    the residuals are then correlated.

    Parameters
    ----------
    df : pandas.DataFrame
    x, y : str
        Column names of the two variables of interest.
    control_vars : list[str]
        One or more column names to control for.

    Returns
    -------
    float
        Pearson correlation coefficient of the residuals.
    """
    # Bug fix: np.polyfit only accepts a 1-D predictor, while
    # df[control_vars].values is 2-D (n, k) — the original therefore raised
    # TypeError even with a single control. Least squares via lstsq works
    # for any number of controls. (The unused linregress import is gone.)
    Z = np.column_stack([np.ones(len(df)), df[control_vars].to_numpy()])
    beta_x, *_ = np.linalg.lstsq(Z, df[x].to_numpy(), rcond=None)
    beta_y, *_ = np.linalg.lstsq(Z, df[y].to_numpy(), rcond=None)
    x_residuals = df[x].to_numpy() - Z @ beta_x
    y_residuals = df[y].to_numpy() - Z @ beta_y
    return pearsonr(x_residuals, y_residuals)[0]
# Partial correlation of income vs satisfaction, holding age fixed.
partial_corr = partial_correlation(df, 'income', 'satisfaction', ['age'])
print(f"\nPartial correlation (income vs satisfaction, controlling for age): {partial_corr:.4f}")

# Distance correlation detects non-linear dependence (optional dependency).
try:
    from dcor import distance_correlation
except ImportError:
    print("dcor library not installed for distance correlation")
else:
    dist_corr = distance_correlation(df['age'], df['income'])
    print(f"Distance correlation (age vs income): {dist_corr:.4f}")

# Rolling-window correlation to gauge stability across the index.
fig, ax = plt.subplots(figsize=(12, 5))
rolling_corr = df['age'].rolling(window=50).corr(df['income'])
ax.plot(rolling_corr.index, rolling_corr.values)
ax.set_title('Rolling Correlation (age vs income, window=50)')
ax.set_ylabel('Correlation Coefficient')
ax.grid(True, alpha=0.3)
plt.show()
Weekly Installs
0
Repository
GitHub Stars
126
First Seen
Jan 1, 1970
Security Audits
AI Elements:基于shadcn/ui的AI原生应用组件库,快速构建对话界面
60,400 周安装