statistics-math by pluginagentmarketplace/custom-plugin-data-engineer
npx skills add https://github.com/pluginagentmarketplace/custom-plugin-data-engineer --skill statistics-math

数据科学、机器学习和统计分析的数学基础。
import numpy as np
import scipy.stats as stats
from sklearn.linear_model import LinearRegression
# --- Descriptive statistics on a small integer sample ---
data = np.array([23, 45, 67, 32, 45, 67, 89, 12, 34, 56])

q1, q3 = np.percentile(data, [25, 75])
print(f"Mean: {np.mean(data):.2f}")
print(f"Median: {np.median(data):.2f}")
print(f"Std Dev: {np.std(data, ddof=1):.2f}")  # ddof=1 -> sample standard deviation
print(f"IQR: {q3 - q1:.2f}")

# --- Hypothesis testing: independent two-sample t-test ---
sample_a = [23, 45, 67, 32, 45]
sample_b = [56, 78, 45, 67, 89]
t_stat, p_value = stats.ttest_ind(sample_a, sample_b)
print(f"T-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")

# Decide at the conventional 5% significance level.
verdict = (
    "Reject null hypothesis: significant difference"
    if p_value < 0.05
    else "Fail to reject null hypothesis"
)
print(verdict)
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
# --- Normal distribution ---
mu, sigma = 100, 15
normal_dist = stats.norm(loc=mu, scale=sigma)
x = np.linspace(50, 150, 100)  # evaluation grid (kept for plotting)

# Tail probabilities and quantiles via CDF/PPF
print(f"P(X < 85): {normal_dist.cdf(85):.4f}")
print(f"P(X > 115): {1 - normal_dist.cdf(115):.4f}")
print(f"95th percentile: {normal_dist.ppf(0.95):.2f}")

# --- Binomial distribution (discrete) ---
n, p = 100, 0.3
binom_dist = stats.binom(n=n, p=p)
print(f"P(X = 30): {binom_dist.pmf(30):.4f}")
print(f"P(X <= 30): {binom_dist.cdf(30):.4f}")

# --- Poisson distribution (events per time) ---
lambda_param = 5
poisson_dist = stats.poisson(mu=lambda_param)
print(f"P(X = 3): {poisson_dist.pmf(3):.4f}")

# --- Central Limit Theorem demonstration ---
# Means of size-30 samples from a skewed (exponential) population are
# approximately normal even though the population is not.
population = np.random.exponential(scale=10, size=100000)
sample_means = [
    np.random.choice(population, 30).mean() for _ in range(1000)
]
print(f"Sample means are approximately normal: mean={np.mean(sample_means):.2f}")
广告位招租
在这里展示您的产品或服务
触达数万 AI 开发者,精准高效
from scipy import stats
import numpy as np
class HypothesisTest:
    """Framework for statistical hypothesis testing.

    All methods are static and return plain dicts so results can be
    serialized or logged directly.
    """

    @staticmethod
    def two_sample_ttest(group_a, group_b, alpha=0.05):
        """Independent two-sample t-test with Cohen's d effect size.

        Args:
            group_a, group_b: sequences of numeric observations.
            alpha: significance level for the decision flag.

        Returns:
            dict with the t statistic, p-value, significance flag, and
            Cohen's d (positive when group_a's mean is larger).
        """
        t_stat, p_value = stats.ttest_ind(group_a, group_b)
        # Cohen's d divides by the *pooled sample* standard deviation:
        # sample variances (ddof=1) weighted by each group's degrees of
        # freedom. The previous version averaged population variances
        # (ddof=0), which overstates the effect for small samples.
        n_a, n_b = len(group_a), len(group_b)
        pooled_var = (
            (n_a - 1) * np.var(group_a, ddof=1)
            + (n_b - 1) * np.var(group_b, ddof=1)
        ) / (n_a + n_b - 2)
        effect_size = (np.mean(group_a) - np.mean(group_b)) / np.sqrt(pooled_var)
        return {
            "t_statistic": t_stat,
            "p_value": p_value,
            "significant": p_value < alpha,
            "effect_size_cohens_d": effect_size,
        }

    @staticmethod
    def chi_square_test(observed, expected=None, alpha=0.05):
        """Chi-square test for categorical data.

        With no `expected`, `observed` is treated as a contingency table
        (test of independence); otherwise a goodness-of-fit test against
        the expected frequencies is run.
        """
        if expected is None:
            chi2, p_value, dof, expected = stats.chi2_contingency(observed)
        else:
            chi2, p_value = stats.chisquare(observed, expected)
            dof = len(observed) - 1
        return {
            "chi2_statistic": chi2,
            "p_value": p_value,
            "degrees_of_freedom": dof,
            "significant": p_value < alpha,
        }

    @staticmethod
    def ab_test_proportion(conversions_a, total_a, conversions_b, total_b, alpha=0.05):
        """Two-proportion z-test for A/B testing.

        Returns conversion rates, relative lift of B over A (percent),
        the z statistic, and a two-sided p-value.
        """
        p_a = conversions_a / total_a
        p_b = conversions_b / total_b
        # Pooled proportion under H0: the two rates are equal.
        p_pooled = (conversions_a + conversions_b) / (total_a + total_b)
        se = np.sqrt(p_pooled * (1 - p_pooled) * (1 / total_a + 1 / total_b))
        z_stat = (p_a - p_b) / se
        # Two-sided p-value from the standard normal tails.
        p_value = 2 * (1 - stats.norm.cdf(abs(z_stat)))
        return {
            "conversion_a": p_a,
            "conversion_b": p_b,
            "lift": (p_b - p_a) / p_a * 100,
            "z_statistic": z_stat,
            "p_value": p_value,
            "significant": p_value < alpha,
        }
# Usage example: 12.0% vs 15.0% conversion over 1000 users per variant.
result = HypothesisTest.ab_test_proportion(
    conversions_a=120,
    total_a=1000,
    conversions_b=150,
    total_b=1000,
)
print(f"Lift: {result['lift']:.1f}%, p-value: {result['p_value']:.4f}")
import numpy as np
# --- Matrix operations ---
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])

# Basic operations
print("Matrix multiplication:", A @ B)
print("Element-wise:", A * B)
print("Transpose:", A.T)
print("Inverse:", np.linalg.inv(A))
print("Determinant:", np.linalg.det(A))

# Eigenvalues and eigenvectors (PCA foundation)
eigenvalues, eigenvectors = np.linalg.eig(A)
print(f"Eigenvalues: {eigenvalues}")

# Singular Value Decomposition (dimensionality reduction)
U, S, Vt = np.linalg.svd(A)
print(f"Singular values: {S}")

# Solving linear systems: Ax = b
# (solve() is cheaper and numerically more stable than inv(A) @ b)
b = np.array([5, 11])
x = np.linalg.solve(A, b)
print(f"Solution: {x}")

# Cosine similarity (NLP, recommendations)
def cosine_similarity(v1, v2):
    """Return the cosine of the angle between vectors v1 and v2.

    Returns 0.0 when either vector has zero norm: the angle is
    undefined there, and guarding avoids a divide-by-zero NaN.
    """
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denom == 0.0:
        return 0.0
    return np.dot(v1, v2) / denom

vec1 = np.array([1, 2, 3])
vec2 = np.array([4, 5, 6])
print(f"Cosine similarity: {cosine_similarity(vec1, vec2):.4f}")
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error
import statsmodels.api as sm

# Multiple Linear Regression with statsmodels
# Synthetic data: 100 samples, 3 features; true coefficients (2, 3, -1)
# plus Gaussian noise (sd 0.5). NOTE(review): no random seed is set, so
# results differ between runs.
X = np.random.randn(100, 3)
y = 2*X[:, 0] + 3*X[:, 1] - X[:, 2] + np.random.randn(100)*0.5
X_with_const = sm.add_constant(X)  # statsmodels OLS does not add an intercept itself
model = sm.OLS(y, X_with_const).fit()
print(model.summary())
print(f"R-squared: {model.rsquared:.4f}")
print(f"Coefficients: {model.params}")
print(f"P-values: {model.pvalues}")

# Regularization comparison
# Plain 80/20 holdout split; rows are independently generated above,
# so no shuffling is required here.
X_train, y_train = X[:80], y[:80]
X_test, y_test = X[80:], y[80:]
models = {
    "OLS": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.1)
}
# Fit each model on the training split and score on the holdout.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"{name}: R²={r2_score(y_test, y_pred):.4f}, RMSE={np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")
| 工具 | 用途 | 版本 (2025) |
|---|---|---|
| NumPy | 数值计算 | 1.26+ |
| SciPy | 科学计算 | 1.12+ |
| pandas | 数据操作 | 2.2+ |
| statsmodels | 统计模型 | 0.14+ |
| scikit-learn | 机器学习算法 | 1.4+ |
| 问题 | 症状 | 根本原因 | 修复方法 |
|---|---|---|---|
| p 值低,效应量小 | 显著但无意义 | 样本量大 | 检查效应量 |
| 高方差 | 估计不稳定 | 样本小,存在异常值 | 更多数据,稳健方法 |
| 多重共线性 | 系数膨胀 | 特征相关 | 检查 VIF,移除特征 |
| 异方差性 | 推断无效 | 方差非常数 | 加权最小二乘法 |
# ✅ DO: Check assumptions before testing (Shapiro-Wilk normality test)
from scipy.stats import shapiro
stat, p = shapiro(data)  # `data` must already be defined earlier in the file
if p > 0.05:
    # Failing to reject H0 only suggests normality; it does not prove it.
    print("Data is approximately normal")
# ✅ DO: Use effect sizes, not just p-values
# ✅ DO: Correct for multiple comparisons (Bonferroni)
# ✅ DO: Report confidence intervals
# ❌ DON'T: p-hack by trying many tests
# ❌ DON'T: Confuse correlation with causation
# ❌ DON'T: Ignore sample size requirements
技能认证清单:
每周安装量
184
仓库
GitHub 星标数
4
首次出现
2026年1月24日
安全审计
安装于
opencode: 173
codex: 165
gemini-cli: 164
github-copilot: 159
cursor: 157
kimi-cli: 148
Mathematical foundations for data science, machine learning, and statistical analysis.
import numpy as np
import scipy.stats as stats
from sklearn.linear_model import LinearRegression
# Descriptive Statistics
# Small integer sample used throughout the descriptive examples.
data = np.array([23, 45, 67, 32, 45, 67, 89, 12, 34, 56])
print(f"Mean: {np.mean(data):.2f}")
print(f"Median: {np.median(data):.2f}")
# ddof=1 -> sample (Bessel-corrected) standard deviation.
print(f"Std Dev: {np.std(data, ddof=1):.2f}")
# Interquartile range: spread of the middle 50% of the data.
print(f"IQR: {np.percentile(data, 75) - np.percentile(data, 25):.2f}")
# Hypothesis Testing
# Independent two-sample t-test (equal-variance version by default).
sample_a = [23, 45, 67, 32, 45]
sample_b = [56, 78, 45, 67, 89]
t_stat, p_value = stats.ttest_ind(sample_a, sample_b)
print(f"T-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")
# Decide at the conventional 5% significance level.
if p_value < 0.05:
    print("Reject null hypothesis: significant difference")
else:
    print("Fail to reject null hypothesis")
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
# Normal Distribution
# N(mu=100, sigma=15) — an IQ-style scale.
mu, sigma = 100, 15
normal_dist = stats.norm(loc=mu, scale=sigma)
x = np.linspace(50, 150, 100)  # evaluation grid (kept for plotting)
# PDF, CDF calculations
print(f"P(X < 85): {normal_dist.cdf(85):.4f}")
print(f"P(X > 115): {1 - normal_dist.cdf(115):.4f}")
print(f"95th percentile: {normal_dist.ppf(0.95):.2f}")
# Binomial Distribution (discrete)
# 100 trials, success probability 0.3.
n, p = 100, 0.3
binom_dist = stats.binom(n=n, p=p)
print(f"P(X = 30): {binom_dist.pmf(30):.4f}")
print(f"P(X <= 30): {binom_dist.cdf(30):.4f}")
# Poisson Distribution (events per time)
lambda_param = 5  # expected number of events per interval
poisson_dist = stats.poisson(mu=lambda_param)
print(f"P(X = 3): {poisson_dist.pmf(3):.4f}")
# Central Limit Theorem demonstration
# Means of size-30 samples from a skewed (exponential) population are
# approximately normal. NOTE(review): unseeded RNG — values vary per run.
population = np.random.exponential(scale=10, size=100000)
sample_means = [np.mean(np.random.choice(population, 30)) for _ in range(1000)]
print(f"Sample means are approximately normal: mean={np.mean(sample_means):.2f}")
from scipy import stats
import numpy as np
class HypothesisTest:
    """Framework for statistical hypothesis testing.

    All methods are static and return plain dicts so results can be
    serialized or logged directly.
    """

    @staticmethod
    def two_sample_ttest(group_a, group_b, alpha=0.05):
        """Independent two-sample t-test with Cohen's d effect size.

        Args:
            group_a, group_b: sequences of numeric observations.
            alpha: significance level for the decision flag.

        Returns:
            dict with the t statistic, p-value, significance flag, and
            Cohen's d (positive when group_a's mean is larger).
        """
        t_stat, p_value = stats.ttest_ind(group_a, group_b)
        # Cohen's d divides by the *pooled sample* standard deviation:
        # sample variances (ddof=1) weighted by each group's degrees of
        # freedom. The previous version averaged population variances
        # (ddof=0), which overstates the effect for small samples.
        n_a, n_b = len(group_a), len(group_b)
        pooled_var = (
            (n_a - 1) * np.var(group_a, ddof=1)
            + (n_b - 1) * np.var(group_b, ddof=1)
        ) / (n_a + n_b - 2)
        effect_size = (np.mean(group_a) - np.mean(group_b)) / np.sqrt(pooled_var)
        return {
            "t_statistic": t_stat,
            "p_value": p_value,
            "significant": p_value < alpha,
            "effect_size_cohens_d": effect_size,
        }

    @staticmethod
    def chi_square_test(observed, expected=None, alpha=0.05):
        """Chi-square test for categorical data.

        With no `expected`, `observed` is treated as a contingency table
        (test of independence); otherwise a goodness-of-fit test against
        the expected frequencies is run.
        """
        if expected is None:
            chi2, p_value, dof, expected = stats.chi2_contingency(observed)
        else:
            chi2, p_value = stats.chisquare(observed, expected)
            dof = len(observed) - 1
        return {
            "chi2_statistic": chi2,
            "p_value": p_value,
            "degrees_of_freedom": dof,
            "significant": p_value < alpha,
        }

    @staticmethod
    def ab_test_proportion(conversions_a, total_a, conversions_b, total_b, alpha=0.05):
        """Two-proportion z-test for A/B testing.

        Returns conversion rates, relative lift of B over A (percent),
        the z statistic, and a two-sided p-value.
        """
        p_a = conversions_a / total_a
        p_b = conversions_b / total_b
        # Pooled proportion under H0: the two rates are equal.
        p_pooled = (conversions_a + conversions_b) / (total_a + total_b)
        se = np.sqrt(p_pooled * (1 - p_pooled) * (1 / total_a + 1 / total_b))
        z_stat = (p_a - p_b) / se
        # Two-sided p-value from the standard normal tails.
        p_value = 2 * (1 - stats.norm.cdf(abs(z_stat)))
        return {
            "conversion_a": p_a,
            "conversion_b": p_b,
            "lift": (p_b - p_a) / p_a * 100,
            "z_statistic": z_stat,
            "p_value": p_value,
            "significant": p_value < alpha,
        }
# Usage
# Example: variant B converts at 15% vs 12% for A, 1000 users each.
result = HypothesisTest.ab_test_proportion(
    conversions_a=120, total_a=1000,
    conversions_b=150, total_b=1000
)
print(f"Lift: {result['lift']:.1f}%, p-value: {result['p_value']:.4f}")
import numpy as np
# --- Matrix operations ---
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])

# Basic operations
print("Matrix multiplication:", A @ B)
print("Element-wise:", A * B)
print("Transpose:", A.T)
print("Inverse:", np.linalg.inv(A))
print("Determinant:", np.linalg.det(A))

# Eigenvalues and eigenvectors (PCA foundation)
eigenvalues, eigenvectors = np.linalg.eig(A)
print(f"Eigenvalues: {eigenvalues}")

# Singular Value Decomposition (dimensionality reduction)
U, S, Vt = np.linalg.svd(A)
print(f"Singular values: {S}")

# Solving linear systems: Ax = b
# (solve() is cheaper and numerically more stable than inv(A) @ b)
b = np.array([5, 11])
x = np.linalg.solve(A, b)
print(f"Solution: {x}")

# Cosine similarity (NLP, recommendations)
def cosine_similarity(v1, v2):
    """Return the cosine of the angle between vectors v1 and v2.

    Returns 0.0 when either vector has zero norm: the angle is
    undefined there, and guarding avoids a divide-by-zero NaN.
    """
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denom == 0.0:
        return 0.0
    return np.dot(v1, v2) / denom

vec1 = np.array([1, 2, 3])
vec2 = np.array([4, 5, 6])
print(f"Cosine similarity: {cosine_similarity(vec1, vec2):.4f}")
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error
import statsmodels.api as sm

# Multiple Linear Regression with statsmodels
# Synthetic data: 100 samples, 3 features; true coefficients (2, 3, -1)
# plus Gaussian noise (sd 0.5). NOTE(review): no random seed is set, so
# results differ between runs.
X = np.random.randn(100, 3)
y = 2*X[:, 0] + 3*X[:, 1] - X[:, 2] + np.random.randn(100)*0.5
X_with_const = sm.add_constant(X)  # statsmodels OLS does not add an intercept itself
model = sm.OLS(y, X_with_const).fit()
print(model.summary())
print(f"R-squared: {model.rsquared:.4f}")
print(f"Coefficients: {model.params}")
print(f"P-values: {model.pvalues}")

# Regularization comparison
# Plain 80/20 holdout split; rows are independently generated above,
# so no shuffling is required here.
X_train, y_train = X[:80], y[:80]
X_test, y_test = X[80:], y[80:]
models = {
    "OLS": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.1)
}
# Fit each model on the training split and score on the holdout.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"{name}: R²={r2_score(y_test, y_pred):.4f}, RMSE={np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")
| Tool | Purpose | Version (2025) |
|---|---|---|
| NumPy | Numerical computing | 1.26+ |
| SciPy | Scientific computing | 1.12+ |
| pandas | Data manipulation | 2.2+ |
| statsmodels | Statistical models | 0.14+ |
| scikit-learn | ML algorithms | 1.4+ |
| Issue | Symptoms | Root Cause | Fix |
|---|---|---|---|
| Low p-value, small effect | Significant but meaningless | Large sample size | Check effect size |
| High variance | Unstable estimates | Small sample, outliers | More data, robust methods |
| Multicollinearity | Inflated coefficients | Correlated features | VIF check, remove features |
| Heteroscedasticity | Invalid inference | Non-constant variance | Weighted least squares |
# ✅ DO: Check assumptions before testing (Shapiro-Wilk normality test)
from scipy.stats import shapiro
stat, p = shapiro(data)  # `data` must already be defined earlier in the file
if p > 0.05:
    # Failing to reject H0 only suggests normality; it does not prove it.
    print("Data is approximately normal")
# ✅ DO: Use effect sizes, not just p-values
# ✅ DO: Correct for multiple comparisons (Bonferroni)
# ✅ DO: Report confidence intervals
# ❌ DON'T: p-hack by trying many tests
# ❌ DON'T: Confuse correlation with causation
# ❌ DON'T: Ignore sample size requirements
Skill Certification Checklist:
Weekly Installs
184
Repository
GitHub Stars
4
First Seen
Jan 24, 2026
Security Audits
Gen Agent Trust Hub: Pass · Socket: Pass · Snyk: Pass
Installed on
opencode: 173
codex: 165
gemini-cli: 164
github-copilot: 159
cursor: 157
kimi-cli: 148
专业SEO审计工具:全面网站诊断、技术SEO优化与页面分析指南
64,900 周安装
uni-app跨平台开发框架:基于Vue.js一键发布iOS、Android、小程序和Web应用
750 周安装
OpenClaw主机安全加固指南:评估风险、制定计划、分步执行
761 周安装
Apple Notes CLI 终端命令行工具 - 在 macOS 终端管理 Apple 笔记
761 周安装
video-frames:使用ffmpeg从视频中提取单帧和缩略图的命令行工具
768 周安装
Oracle到PostgreSQL迁移集成测试创建指南 | 数据库迁移测试最佳实践
768 周安装
Oracle到PostgreSQL数据库迁移审查工具:风险评估与验证指南
766 周安装