statsmodels by davila7/claude-code-templates
npx skills add https://github.com/davila7/claude-code-templates --skill statsmodels

Statsmodels 是 Python 中用于统计建模的首要库,提供了涵盖广泛统计方法的估计、推断和诊断工具。应用此技能可进行严谨的统计分析,从简单的线性回归到复杂的时间序列模型和计量经济学分析。
此技能应在以下情况下使用:
import statsmodels.api as sm
import numpy as np
import pandas as pd

# Prepare data - always add a constant column so the model fits an intercept
X = sm.add_constant(X_data)

# Fit an ordinary least squares model
model = sm.OLS(y, X)
results = model.fit()

# Full summary table: coefficients, standard errors, R-squared, diagnostics
print(results.summary())

# Key results (f-strings use real newlines, not literal "\n" text)
print(f"R-squared: {results.rsquared:.4f}")
print(f"Coefficients:\n{results.params}")
print(f"P-values:\n{results.pvalues}")

# Predictions with confidence and prediction intervals
predictions = results.get_prediction(X_new)
pred_summary = predictions.summary_frame()
print(pred_summary)  # includes mean, CI, prediction intervals

# Heteroscedasticity diagnostic: Breusch-Pagan test on the residuals
from statsmodels.stats.diagnostic import het_breuschpagan
bp_test = het_breuschpagan(results.resid, X)
print(f"Breusch-Pagan p-value: {bp_test[1]:.4f}")

# Residuals vs. fitted plot - should show no pattern if assumptions hold
import matplotlib.pyplot as plt
plt.scatter(results.fittedvalues, results.resid)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Fitted values')
plt.ylabel('Residuals')
plt.show()
广告位招租
在这里展示您的产品或服务
触达数万 AI 开发者,精准高效
from statsmodels.discrete.discrete_model import Logit

# Add a constant column for the intercept
X = sm.add_constant(X_data)

# Fit the logistic regression model
model = Logit(y_binary, X)
results = model.fit()
print(results.summary())

# Odds ratios: exponentiated coefficients
odds_ratios = np.exp(results.params)
print("Odds ratios:\n", odds_ratios)

# Predicted probabilities on the training data
probs = results.predict(X)

# Binary predictions using a 0.5 threshold
predictions = (probs > 0.5).astype(int)

# Model evaluation with sklearn metrics
from sklearn.metrics import classification_report, roc_auc_score
print(classification_report(y_binary, predictions))
print(f"AUC: {roc_auc_score(y_binary, probs):.4f}")

# Average marginal effects on the probability scale
marginal = results.get_margeff()
print(marginal.summary())
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Check stationarity with the Augmented Dickey-Fuller test
from statsmodels.tsa.stattools import adfuller
adf_result = adfuller(y_series)
print(f"ADF p-value: {adf_result[1]:.4f}")
if adf_result[1] > 0.05:
    # Series is non-stationary: difference it, then inspect the ACF/PACF
    # of the differenced series to identify candidate p and q orders
    y_diff = y_series.diff().dropna()
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))
    plot_acf(y_diff, lags=40, ax=ax1)
    plot_pacf(y_diff, lags=40, ax=ax2)
    plt.show()

# Fit ARIMA(p,d,q); d=1 performs one round of differencing internally
model = ARIMA(y_series, order=(1, 1, 1))
results = model.fit()
print(results.summary())

# Point forecast for the next 10 steps
forecast = results.forecast(steps=10)
# get_forecast additionally provides interval estimates
forecast_obj = results.get_forecast(steps=10)
forecast_df = forecast_obj.summary_frame()
print(forecast_df)  # includes mean and confidence intervals

# Residual diagnostics: standardized residuals, histogram, Q-Q, correlogram
results.plot_diagnostics(figsize=(12, 8))
plt.show()
import statsmodels.api as sm

# Poisson regression for count data
X = sm.add_constant(X_data)
model = sm.GLM(y_counts, X, family=sm.families.Poisson())
results = model.fit()
print(results.summary())

# Rate ratios: exponentiated coefficients (valid for the log link)
rate_ratios = np.exp(results.params)
print("Rate ratios:\n", rate_ratios)

# Check overdispersion: Pearson chi2 / residual df should be close to 1
overdispersion = results.pearson_chi2 / results.df_resid
print(f"Overdispersion: {overdispersion:.2f}")
if overdispersion > 1.5:
    # Variance exceeds the Poisson assumption - use Negative Binomial instead.
    # Note: NegativeBinomial lives in discrete_model (count_model only holds
    # the zero-inflated variants).
    from statsmodels.discrete.discrete_model import NegativeBinomial
    nb_model = NegativeBinomial(y_counts, X)
    nb_results = nb_model.fit()
    print(nb_results.summary())
针对具有各种误差结构的连续结果的综合线性模型套件。
可用模型:
关键特性:
何时使用: 连续结果变量,需要对系数进行推断,需要诊断
参考: 有关模型选择、诊断和最佳实践的详细指导,请参阅 references/linear_models.md。
将线性模型扩展到非正态分布的灵活框架。
分布族:
链接函数:
关键特性:
何时使用: 非正态结果,需要灵活的方差和链接函数规范
参考: 有关族选择、链接函数、解释和诊断的详细指导,请参阅 references/glm.md。
适用于分类和计数结果的模型。
二元模型:
多项模型:
计数模型:
关键特性:
何时使用: 二元、分类或计数结果
参考: 有关模型选择、解释和评估的详细指导,请参阅 references/discrete_choice.md。
全面的时间序列建模和预测能力。
单变量模型:
多变量模型:
高级模型:
关键特性:
何时使用: 时间顺序数据、预测、理解时间动态
参考: 有关模型选择、诊断和预测方法的详细指导,请参阅 references/time_series.md。
用于模型验证的广泛测试和诊断能力。
残差诊断:
影响与异常值:
假设检验:
多重比较:
效应大小与功效:
稳健推断:
何时使用: 验证假设、检测问题、确保稳健推断
参考: 有关全面测试和诊断程序的详细指导,请参阅 references/stats_diagnostics.md。
Statsmodels 支持 R 风格的公式,用于直观的模型规范:
import statsmodels.formula.api as smf
# OLS via an R-style formula; 'x1:x2' adds only the interaction term
results = smf.ols('y ~ x1 + x2 + x1:x2', data=df).fit()
# Categorical variables: C(...) triggers automatic dummy coding
results = smf.ols('y ~ x1 + C(category)', data=df).fit()
# Interaction shorthand: '*' expands to main effects plus interaction
results = smf.ols('y ~ x1 * x2', data=df).fit() # x1 + x2 + x1:x2
# Polynomial terms: I(...) protects arithmetic inside the formula
results = smf.ols('y ~ x + I(x**2)', data=df).fit()
# Logistic regression with a formula
results = smf.logit('y ~ x1 + x2 + C(group)', data=df).fit()
# Poisson regression with a formula
results = smf.poisson('count ~ x1 + x2', data=df).fit()
# ARIMA has no formula interface - use the regular API instead
# ARIMA(无法通过公式使用,请使用常规 API)
# Compare candidate models via information criteria
models = {
    'Model 1': model1_results,
    'Model 2': model2_results,
    'Model 3': model3_results,
}
# Map each display column to the results attribute it reads
metrics = {'AIC': 'aic', 'BIC': 'bic', 'Log-Likelihood': 'llf'}
comparison = pd.DataFrame({
    col: {name: getattr(res, attr) for name, res in models.items()}
    for col, attr in metrics.items()
})
print(comparison.sort_values('AIC'))
# Lower AIC/BIC indicates a better fit/complexity trade-off
# 较低的 AIC/BIC 表示更好的模型
# Likelihood-ratio test for nested models (one is a subset of the other)
from scipy import stats

# Twice the log-likelihood gap is asymptotically chi-squared distributed
lr_stat = 2 * (full_model.llf - reduced_model.llf)
df = full_model.df_model - reduced_model.df_model
# Use the survival function rather than 1 - cdf for numerical precision
p_value = stats.chi2.sf(lr_stat, df)
print(f"LR statistic: {lr_stat:.4f}")
print(f"p-value: {p_value:.4f}")
if p_value < 0.05:
    print("Full model significantly better")
else:
    print("Reduced model preferred (parsimony)")
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# 5-fold cross-validated RMSE for an OLS model
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []
for train_idx, val_idx in kf.split(X):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    # Fit on the training fold only
    model = sm.OLS(y_train, X_train).fit()
    # Predict on the held-out fold
    y_pred = model.predict(X_val)
    # Score the fold with RMSE
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    cv_scores.append(rmse)
print(f"CV RMSE: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")
使用 sm.add_constant() 添加截距(除非明确不需要);使用 .summary() 获取详细输出。此技能包含用于详细指导的综合参考文件:
线性回归模型的详细覆盖,包括:
广义线性模型的完整指南:
离散结果模型的综合指南:
深入的时间序列分析指导:
全面的统计检验和诊断:
何时参考:
搜索模式:
# 查找特定模型的信息
grep -r "Quantile Regression" references/
# 查找诊断检验
grep -r "Breusch-Pagan" references/stats_diagnostics.md
# 查找时间序列指导
grep -r "SARIMAX" references/time_series.md
使用 sm.add_constant() 添加常数项(除非不需要截距)。如需详细文档和示例:
每周安装数
171
仓库
GitHub 星标数
22.6K
首次出现时间
2026年1月21日
安全审计
安装于
opencode138
claude-code138
gemini-cli129
cursor122
codex120
github-copilot113
Statsmodels is Python's premier library for statistical modeling, providing tools for estimation, inference, and diagnostics across a wide range of statistical methods. Apply this skill for rigorous statistical analysis, from simple linear regression to complex time series models and econometric analyses.
This skill should be used when:
import statsmodels.api as sm
import numpy as np
import pandas as pd

# Prepare data - ALWAYS add a constant column so the model fits an intercept
X = sm.add_constant(X_data)

# Fit an ordinary least squares model
model = sm.OLS(y, X)
results = model.fit()

# Full summary table: coefficients, standard errors, R-squared, diagnostics
print(results.summary())

# Key results (f-strings use real newlines, not literal "\n" text)
print(f"R-squared: {results.rsquared:.4f}")
print(f"Coefficients:\n{results.params}")
print(f"P-values:\n{results.pvalues}")

# Predictions with confidence and prediction intervals
predictions = results.get_prediction(X_new)
pred_summary = predictions.summary_frame()
print(pred_summary)  # includes mean, CI, prediction intervals

# Heteroscedasticity diagnostic: Breusch-Pagan test on the residuals
from statsmodels.stats.diagnostic import het_breuschpagan
bp_test = het_breuschpagan(results.resid, X)
print(f"Breusch-Pagan p-value: {bp_test[1]:.4f}")

# Residuals vs. fitted plot - should show no pattern if assumptions hold
import matplotlib.pyplot as plt
plt.scatter(results.fittedvalues, results.resid)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Fitted values')
plt.ylabel('Residuals')
plt.show()
from statsmodels.discrete.discrete_model import Logit

# Add a constant column for the intercept
X = sm.add_constant(X_data)

# Fit the logistic regression model
model = Logit(y_binary, X)
results = model.fit()
print(results.summary())

# Odds ratios: exponentiated coefficients
odds_ratios = np.exp(results.params)
print("Odds ratios:\n", odds_ratios)

# Predicted probabilities on the training data
probs = results.predict(X)

# Binary predictions using a 0.5 threshold
predictions = (probs > 0.5).astype(int)

# Model evaluation with sklearn metrics
from sklearn.metrics import classification_report, roc_auc_score
print(classification_report(y_binary, predictions))
print(f"AUC: {roc_auc_score(y_binary, probs):.4f}")

# Average marginal effects on the probability scale
marginal = results.get_margeff()
print(marginal.summary())
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Check stationarity with the Augmented Dickey-Fuller test
from statsmodels.tsa.stattools import adfuller
adf_result = adfuller(y_series)
print(f"ADF p-value: {adf_result[1]:.4f}")
if adf_result[1] > 0.05:
    # Series is non-stationary: difference it, then inspect the ACF/PACF
    # of the differenced series to identify candidate p and q orders
    y_diff = y_series.diff().dropna()
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))
    plot_acf(y_diff, lags=40, ax=ax1)
    plot_pacf(y_diff, lags=40, ax=ax2)
    plt.show()

# Fit ARIMA(p,d,q); d=1 performs one round of differencing internally
model = ARIMA(y_series, order=(1, 1, 1))
results = model.fit()
print(results.summary())

# Point forecast for the next 10 steps
forecast = results.forecast(steps=10)
# get_forecast additionally provides interval estimates
forecast_obj = results.get_forecast(steps=10)
forecast_df = forecast_obj.summary_frame()
print(forecast_df)  # includes mean and confidence intervals

# Residual diagnostics: standardized residuals, histogram, Q-Q, correlogram
results.plot_diagnostics(figsize=(12, 8))
plt.show()
import statsmodels.api as sm

# Poisson regression for count data
X = sm.add_constant(X_data)
model = sm.GLM(y_counts, X, family=sm.families.Poisson())
results = model.fit()
print(results.summary())

# Rate ratios: exponentiated coefficients (valid for the log link)
rate_ratios = np.exp(results.params)
print("Rate ratios:\n", rate_ratios)

# Check overdispersion: Pearson chi2 / residual df should be close to 1
overdispersion = results.pearson_chi2 / results.df_resid
print(f"Overdispersion: {overdispersion:.2f}")
if overdispersion > 1.5:
    # Variance exceeds the Poisson assumption - use Negative Binomial instead.
    # Note: NegativeBinomial lives in discrete_model (count_model only holds
    # the zero-inflated variants).
    from statsmodels.discrete.discrete_model import NegativeBinomial
    nb_model = NegativeBinomial(y_counts, X)
    nb_results = nb_model.fit()
    print(nb_results.summary())
Comprehensive suite of linear models for continuous outcomes with various error structures.
Available models:
Key features:
When to use: Continuous outcome variable, want inference on coefficients, need diagnostics
Reference: See references/linear_models.md for detailed guidance on model selection, diagnostics, and best practices.
Flexible framework extending linear models to non-normal distributions.
Distribution families:
Link functions:
Key features:
When to use: Non-normal outcomes, need flexible variance and link specifications
Reference: See references/glm.md for family selection, link functions, interpretation, and diagnostics.
Models for categorical and count outcomes.
Binary models:
Multinomial models:
Count models:
Key features:
When to use: Binary, categorical, or count outcomes
Reference: See references/discrete_choice.md for model selection, interpretation, and evaluation.
Comprehensive time series modeling and forecasting capabilities.
Univariate models:
Multivariate models:
Advanced models:
Key features:
When to use: Time-ordered data, forecasting, understanding temporal dynamics
Reference: See references/time_series.md for model selection, diagnostics, and forecasting methods.
Extensive testing and diagnostic capabilities for model validation.
Residual diagnostics:
Influence and outliers:
Hypothesis testing:
Multiple comparisons:
Effect sizes and power:
Robust inference:
When to use: Validating assumptions, detecting problems, ensuring robust inference
Reference: See references/stats_diagnostics.md for comprehensive testing and diagnostic procedures.
Statsmodels supports R-style formulas for intuitive model specification:
import statsmodels.formula.api as smf
# OLS via an R-style formula; 'x1:x2' adds only the interaction term
results = smf.ols('y ~ x1 + x2 + x1:x2', data=df).fit()
# Categorical variables: C(...) triggers automatic dummy coding
results = smf.ols('y ~ x1 + C(category)', data=df).fit()
# Interaction shorthand: '*' expands to main effects plus interaction
results = smf.ols('y ~ x1 * x2', data=df).fit() # x1 + x2 + x1:x2
# Polynomial terms: I(...) protects arithmetic inside the formula
results = smf.ols('y ~ x + I(x**2)', data=df).fit()
# Logistic regression with a formula
results = smf.logit('y ~ x1 + x2 + C(group)', data=df).fit()
# Poisson regression with a formula
results = smf.poisson('count ~ x1 + x2', data=df).fit()
# ARIMA has no formula interface - use the regular API instead
# ARIMA (not available via formula, use regular API)
# Compare candidate models via information criteria
models = {
    'Model 1': model1_results,
    'Model 2': model2_results,
    'Model 3': model3_results,
}
# Map each display column to the results attribute it reads
metrics = {'AIC': 'aic', 'BIC': 'bic', 'Log-Likelihood': 'llf'}
comparison = pd.DataFrame({
    col: {name: getattr(res, attr) for name, res in models.items()}
    for col, attr in metrics.items()
})
print(comparison.sort_values('AIC'))
# Lower AIC/BIC indicates a better fit/complexity trade-off
# Lower AIC/BIC indicates better model
# Likelihood-ratio test for nested models (one is a subset of the other)
from scipy import stats

# Twice the log-likelihood gap is asymptotically chi-squared distributed
lr_stat = 2 * (full_model.llf - reduced_model.llf)
df = full_model.df_model - reduced_model.df_model
# Use the survival function rather than 1 - cdf for numerical precision
p_value = stats.chi2.sf(lr_stat, df)
print(f"LR statistic: {lr_stat:.4f}")
print(f"p-value: {p_value:.4f}")
if p_value < 0.05:
    print("Full model significantly better")
else:
    print("Reduced model preferred (parsimony)")
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# 5-fold cross-validated RMSE for an OLS model
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []
for train_idx, val_idx in kf.split(X):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    # Fit on the training fold only
    model = sm.OLS(y_train, X_train).fit()
    # Predict on the held-out fold
    y_pred = model.predict(X_val)
    # Score the fold with RMSE
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    cv_scores.append(rmse)
print(f"CV RMSE: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")
Use sm.add_constant() unless you are deliberately excluding the intercept; use .summary() for detailed output. This skill includes comprehensive reference files for detailed guidance:
Detailed coverage of linear regression models including:
Complete guide to generalized linear models:
Comprehensive guide to discrete outcome models:
In-depth time series analysis guidance:
Comprehensive statistical testing and diagnostics:
When to reference:
Search patterns:
# Find information about specific models
grep -r "Quantile Regression" references/
# Find diagnostic tests
grep -r "Breusch-Pagan" references/stats_diagnostics.md
# Find time series guidance
grep -r "SARIMAX" references/time_series.md
Use sm.add_constant() unless no intercept is desired. For detailed documentation and examples:
Weekly Installs
171
Repository
GitHub Stars
22.6K
First Seen
Jan 21, 2026
Security Audits
Gen Agent Trust Hub: Pass · Socket: Pass · Snyk: Pass
Installed on
opencode138
claude-code138
gemini-cli129
cursor122
codex120
github-copilot113
AI Elements:基于shadcn/ui的AI原生应用组件库,快速构建对话界面
62,200 周安装