重要前提
安装AI Skills的关键前提是:必须科学上网,且开启TUN模式,这一点至关重要,直接决定安装能否顺利完成,在此郑重提醒三遍:科学上网,科学上网,科学上网。查看完整安装教程 →
distribution-analyzer by dengineproblem/agents-monorepo
npx skills add https://github.com/dengineproblem/agents-monorepo --skill distribution-analyzer
统计分布分析专家。
Normal (Gaussian):
用例:自然现象、测量误差
参数:μ (均值), σ (标准差)
适用场景:对称、可加过程
检验:Shapiro-Wilk
Log-Normal:
用例:收入、股票价格、文件大小
参数:μ, σ (对数的)
适用场景:可乘过程、正偏态
检验:转换后检验正态性
Exponential:
用例:事件间隔时间、失效时间
参数:λ (速率)
适用场景:无记忆过程
检验:K-S 检验
Gamma:
用例:等待时间、保险索赔
参数:α (形状), β (速率)
适用场景:指数事件之和
Weibull:
用例:可靠性、生存分析
参数:λ (尺度), k (形状)
适用场景:失效时间建模
Beta:
用例:比例、概率
参数:α, β
适用场景:有界 [0,1] 数据
Pareto:
用例:财富、城市规模
参数:α (形状), xm (尺度)
适用场景:幂律、重尾
Student's t:
用例:小样本、重尾
参数:ν (自由度)
适用场景:类正态但尾部更重
Poisson:
用例:事件计数、稀有事件
参数:λ (速率)
适用场景:固定区间内的事件
Binomial:
用例:成功/失败试验
参数:n (试验次数), p (概率)
适用场景:固定次数的独立试验
Negative Binomial:
用例:过离散计数
参数:r, p
适用场景:方差 > 均值
Geometric:
用例:直到成功的试验次数
参数:p (概率)
适用场景:等待首次成功
广告位招租
在这里展示您的产品或服务
触达数万 AI 开发者,精准高效
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
from typing import Dict, List, Tuple, Optional
class DistributionAnalyzer:
    """Comprehensive distribution analysis toolkit.

    Wraps a 1-D numeric sample and provides descriptive statistics,
    maximum-likelihood fitting of candidate distributions (ranked by AIC),
    normality testing, outlier detection, diagnostic plots, and bootstrap
    confidence intervals.
    """

    # Candidate continuous distributions tried by fit_distributions().
    # Keys are scipy.stats names so they can be passed to stats.kstest.
    DISTRIBUTIONS = {
        'norm': stats.norm,
        'lognorm': stats.lognorm,
        'expon': stats.expon,
        'gamma': stats.gamma,
        'weibull_min': stats.weibull_min,
        'beta': stats.beta,
        'pareto': stats.pareto,
        't': stats.t
    }

    def __init__(self, data: np.ndarray):
        """Store the sample.

        Args:
            data: 1-D array-like of numeric observations.
        """
        self.data = np.asarray(data)
        self.n = len(self.data)
        self.results = {}  # populated by fit_distributions()

    def descriptive_stats(self) -> Dict:
        """Return a dict of descriptive statistics for the sample.

        Note: 'std'/'var' use NumPy's default ddof=0 (population
        convention), not the sample (n-1) convention.
        """
        return {
            'n': self.n,
            'mean': np.mean(self.data),
            'median': np.median(self.data),
            'std': np.std(self.data),
            'var': np.var(self.data),
            'min': np.min(self.data),
            'max': np.max(self.data),
            'range': np.ptp(self.data),
            'skewness': stats.skew(self.data),
            'kurtosis': stats.kurtosis(self.data),
            'q25': np.percentile(self.data, 25),
            'q50': np.percentile(self.data, 50),
            'q75': np.percentile(self.data, 75),
            'iqr': stats.iqr(self.data)
        }

    def fit_distributions(self) -> Dict:
        """Fit every candidate distribution and rank by goodness-of-fit.

        Returns:
            Dict with keys:
              'all': per-distribution results (or {'error': msg} on failure),
              'best': (name, result) tuple with the lowest AIC, or None,
              'ranking': list of (name, aic) sorted ascending by AIC.
        """
        results = {}
        for name, dist in self.DISTRIBUTIONS.items():
            try:
                # Maximum-likelihood fit; params include loc/scale.
                params = dist.fit(self.data)
                log_likelihood = np.sum(dist.logpdf(self.data, *params))
                # Information criteria penalise the parameter count.
                k = len(params)
                aic = 2 * k - 2 * log_likelihood
                bic = k * np.log(self.n) - 2 * log_likelihood
                # NOTE: this K-S p-value is optimistic because the
                # parameters were estimated from the same sample; treat
                # it as a rough screen, not an exact test.
                ks_stat, ks_pvalue = stats.kstest(self.data, name, params)
                results[name] = {
                    'params': params,
                    'log_likelihood': log_likelihood,
                    'aic': aic,
                    'bic': bic,
                    'ks_statistic': ks_stat,
                    'ks_pvalue': ks_pvalue
                }
            except Exception as e:
                # Some fits legitimately fail (e.g. beta on data outside
                # its support); record the failure and keep going.
                results[name] = {'error': str(e)}
        # Rank only the successful fits by AIC (lower is better).
        valid = {name: res for name, res in results.items() if 'aic' in res}
        ranked = sorted(valid.items(), key=lambda item: item[1]['aic'])
        self.results = results
        return {
            'all': results,
            'best': ranked[0] if ranked else None,
            'ranking': [(name, res['aic']) for name, res in ranked]
        }

    def test_normality(self) -> Dict:
        """Run several complementary normality tests.

        Each entry reports the statistic, a p-value where the test
        provides one, and an 'is_normal' flag at the 5% level
        (Anderson-Darling has critical values instead of a p-value).
        """
        results = {}
        # Shapiro-Wilk: best power for n < 5000.
        if self.n < 5000:
            stat, pvalue = stats.shapiro(self.data)
            results['shapiro_wilk'] = {
                'statistic': stat,
                'pvalue': pvalue,
                'is_normal': pvalue > 0.05
            }
        # D'Agostino-Pearson: combines skewness and kurtosis; needs n >= 20.
        if self.n >= 20:
            stat, pvalue = stats.normaltest(self.data)
            results['dagostino_pearson'] = {
                'statistic': stat,
                'pvalue': pvalue,
                'is_normal': pvalue > 0.05
            }
        # Anderson-Darling: look up the 5% critical value by its
        # significance level rather than assuming a fixed array position.
        result = stats.anderson(self.data, dist='norm')
        sig_levels = list(result.significance_level)
        idx_5pct = sig_levels.index(5.0) if 5.0 in sig_levels else 2
        results['anderson_darling'] = {
            'statistic': result.statistic,
            'critical_values': dict(zip(
                [f'{cv}%' for cv in result.significance_level],
                result.critical_values
            )),
            'is_normal': result.statistic < result.critical_values[idx_5pct]
        }
        # Kolmogorov-Smirnov against N(mean, std). NOTE: estimating the
        # parameters from the same data is the Lilliefors setting, so the
        # reported p-value is only a rough guide.
        stat, pvalue = stats.kstest(
            self.data, 'norm',
            args=(np.mean(self.data), np.std(self.data))
        )
        results['kolmogorov_smirnov'] = {
            'statistic': stat,
            'pvalue': pvalue,
            'is_normal': pvalue > 0.05
        }
        return results

    def detect_outliers(self, method: str = 'iqr') -> Dict:
        """Detect outliers using one of several methods.

        Args:
            method: 'iqr' (Tukey fences at 1.5*IQR), 'zscore' (|z| > 3),
                or 'mad' (modified z-score > 3.5).

        Raises:
            ValueError: if ``method`` is not a supported name (previously
                an unknown method silently returned an empty result).
        """
        results = {'method': method}
        if method == 'iqr':
            q1, q3 = np.percentile(self.data, [25, 75])
            iqr = q3 - q1
            lower = q1 - 1.5 * iqr
            upper = q3 + 1.5 * iqr
            outliers = self.data[(self.data < lower) | (self.data > upper)]
            results.update({
                'lower_bound': lower,
                'upper_bound': upper,
                'outliers': outliers,
                'n_outliers': len(outliers),
                'outlier_percentage': len(outliers) / self.n * 100
            })
        elif method == 'zscore':
            z_scores = np.abs(stats.zscore(self.data))
            threshold = 3
            outliers = self.data[z_scores > threshold]
            results.update({
                'threshold': threshold,
                'outliers': outliers,
                'n_outliers': len(outliers),
                'outlier_percentage': len(outliers) / self.n * 100
            })
        elif method == 'mad':
            median = np.median(self.data)
            mad = np.median(np.abs(self.data - median))
            threshold = 3.5
            if mad == 0:
                # Degenerate sample (most values equal the median): the
                # modified z-score is undefined, so flag nothing instead
                # of dividing by zero and producing NaNs.
                modified_z = np.zeros(self.n)
            else:
                # 0.6745 makes MAD a consistent estimator of sigma for
                # normal data (Iglewicz & Hoaglin modified z-score).
                modified_z = 0.6745 * (self.data - median) / mad
            outliers = self.data[np.abs(modified_z) > threshold]
            results.update({
                'median': median,
                'mad': mad,
                'outliers': outliers,
                'n_outliers': len(outliers),
                'outlier_percentage': len(outliers) / self.n * 100
            })
        else:
            raise ValueError(f"Unknown outlier method: {method!r}")
        return results

    def plot_analysis(self, figsize: Tuple = (15, 10)):
        """Generate a 2x3 grid of diagnostic plots and return the Figure.

        Requires matplotlib; the caller is responsible for saving or
        showing the returned figure.
        """
        fig, axes = plt.subplots(2, 3, figsize=figsize)
        # Histogram with a kernel-density overlay.
        axes[0, 0].hist(self.data, bins='auto', density=True, alpha=0.7)
        kde_x = np.linspace(self.data.min(), self.data.max(), 100)
        kde = stats.gaussian_kde(self.data)
        axes[0, 0].plot(kde_x, kde(kde_x), 'r-', lw=2)
        axes[0, 0].set_title('Histogram with KDE')
        # Q-Q plot against the normal distribution.
        stats.probplot(self.data, dist="norm", plot=axes[0, 1])
        axes[0, 1].set_title('Q-Q Plot (Normal)')
        # Box plot.
        axes[0, 2].boxplot(self.data, vert=True)
        axes[0, 2].set_title('Box Plot')
        # Empirical CDF (step function at each sorted observation).
        sorted_data = np.sort(self.data)
        ecdf = np.arange(1, self.n + 1) / self.n
        axes[1, 0].step(sorted_data, ecdf, where='post')
        axes[1, 0].set_title('Empirical CDF')
        # P-P plot against N(mean, std) estimated from the sample.
        theoretical = stats.norm.cdf(sorted_data,
                                     loc=np.mean(self.data),
                                     scale=np.std(self.data))
        axes[1, 1].scatter(theoretical, ecdf, alpha=0.5)
        axes[1, 1].plot([0, 1], [0, 1], 'r--')
        axes[1, 1].set_title('P-P Plot (Normal)')
        # Violin plot.
        axes[1, 2].violinplot(self.data)
        axes[1, 2].set_title('Violin Plot')
        plt.tight_layout()
        return fig

    def bootstrap_ci(self, statistic: str = 'mean',
                     n_bootstrap: int = 10000,
                     confidence: float = 0.95) -> Dict:
        """Percentile-bootstrap confidence interval for a statistic.

        Args:
            statistic: 'mean', 'median' or 'std' (KeyError otherwise).
            n_bootstrap: number of bootstrap resamples.
            confidence: interval coverage, e.g. 0.95 for a 95% CI.

        Returns:
            Dict with the point estimate, CI bounds, and the bootstrap
            standard error (std of the resampled statistic).
        """
        stat_func = {
            'mean': np.mean,
            'median': np.median,
            'std': np.std
        }[statistic]
        # Resample with replacement and recompute the statistic each time.
        bootstrap_stats = []
        for _ in range(n_bootstrap):
            sample = np.random.choice(self.data, size=self.n, replace=True)
            bootstrap_stats.append(stat_func(sample))
        alpha = 1 - confidence
        lower = np.percentile(bootstrap_stats, alpha / 2 * 100)
        upper = np.percentile(bootstrap_stats, (1 - alpha / 2) * 100)
        return {
            'statistic': statistic,
            'point_estimate': stat_func(self.data),
            'ci_lower': lower,
            'ci_upper': upper,
            'confidence': confidence,
            'standard_error': np.std(bootstrap_stats)
        }
# Example usage
import numpy as np

# Generate right-skewed sample data
np.random.seed(42)
data = np.random.lognormal(mean=2, sigma=0.5, size=1000)

# Create the analyzer
analyzer = DistributionAnalyzer(data)

# Descriptive statistics
print("Descriptive Stats:")
print(analyzer.descriptive_stats())

# Fit distributions; guard against the case where nothing could be fitted
print("\nDistribution Fitting:")
fit_results = analyzer.fit_distributions()
if fit_results['best'] is not None:
    best_name, best_res = fit_results['best']
    print(f"Best fit: {best_name}")
    print(f"AIC: {best_res['aic']:.2f}")
else:
    print("No distribution could be fitted.")

# Test normality.  Anderson-Darling reports critical values instead of a
# p-value; the original formatted the 'N/A' string fallback with ':.4f',
# which raises ValueError and crashed the script here.
print("\nNormality Tests:")
norm_results = analyzer.test_normality()
for test, result in norm_results.items():
    pvalue = result.get('pvalue')
    if pvalue is None:
        print(f"{test}: statistic = {result['statistic']:.4f} (no p-value)")
    else:
        print(f"{test}: p-value = {pvalue:.4f}")

# Detect outliers
print("\nOutlier Detection:")
outliers = analyzer.detect_outliers(method='iqr')
print(f"Found {outliers['n_outliers']} outliers ({outliers['outlier_percentage']:.1f}%)")

# Bootstrap confidence interval
print("\nBootstrap Confidence Interval:")
ci = analyzer.bootstrap_ci('mean', confidence=0.95)
print(f"Mean: {ci['point_estimate']:.2f} [{ci['ci_lower']:.2f}, {ci['ci_upper']:.2f}]")

# Generate and save the diagnostic plots
fig = analyzer.plot_analysis()
plt.savefig('distribution_analysis.png')
对称数据:
- 从正态分布开始
- 如果尾部较重:Student's t 分布
- 如果尾部较轻:考虑均匀分布
右偏数据:
- 可乘过程用对数正态
- 等待时间用 Gamma
- 可靠性用 Weibull
- 无记忆性用指数分布
左偏数据:
- 反射后拟合右偏分布
- 使用适当参数的 Beta 分布
有界数据 [0, 1]:
- Beta 分布
- 如果接近正态,使用截断正态分布
计数数据:
- 如果均值 ≈ 方差,用泊松分布
- 如果过离散,用负二项分布
- 固定试验次数用二项分布
重尾数据:
- 幂律用帕累托分布
- Student's t 分布
- Cauchy 分布 (极端情况)
仅依赖视觉检查:
问题:直方图可能具有误导性
解决方案:使用多种检验和 Q-Q 图
忽略样本量:
问题:检验在不同 n 下表现不同
解决方案:小样本用 Shapiro-Wilk,大样本用 K-S
依赖单一检验:
问题:每种检验都有其假设
解决方案:使用多种互补的检验
过拟合:
问题:复杂分布拟合了噪声
解决方案:AIC/BIC 惩罚参数数量
参数不确定性:
问题:点估计隐藏了不确定性
解决方案:自助法置信区间
每周安装量
48
代码仓库
GitHub 星标数
3
首次出现
2026 年 1 月 29 日
安全审计
安装于
github-copilot: 48
opencode: 47
codex: 47
kimi-cli: 47
gemini-cli: 47
amp: 47
Эксперт по анализу статистических распределений.
Normal (Gaussian):
Use cases: Natural phenomena, measurement errors
Parameters: μ (mean), σ (std)
When: Symmetric, additive processes
Test: Shapiro-Wilk
Log-Normal:
Use cases: Income, stock prices, file sizes
Parameters: μ, σ (of log)
When: Multiplicative processes, positive skew
Test: Transform and test normality
Exponential:
Use cases: Time between events, failure times
Parameters: λ (rate)
When: Memoryless processes
Test: K-S test
Gamma:
Use cases: Wait times, insurance claims
Parameters: α (shape), β (rate)
When: Sum of exponential events
Weibull:
Use cases: Reliability, survival analysis
Parameters: λ (scale), k (shape)
When: Failure time modeling
Beta:
Use cases: Proportions, probabilities
Parameters: α, β
When: Bounded [0,1] data
Pareto:
Use cases: Wealth, city sizes
Parameters: α (shape), xm (scale)
When: Power law, heavy tail
Student's t:
Use cases: Small samples, heavy tails
Parameters: ν (degrees of freedom)
When: Normal-like but heavier tails
Poisson:
Use cases: Event counts, rare events
Parameters: λ (rate)
When: Events in fixed interval
Binomial:
Use cases: Success/failure trials
Parameters: n (trials), p (probability)
When: Fixed number of independent trials
Negative Binomial:
Use cases: Overdispersed counts
Parameters: r, p
When: Variance > Mean
Geometric:
Use cases: Number of trials until success
Parameters: p (probability)
When: Waiting for first success
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
from typing import Dict, List, Tuple, Optional
class DistributionAnalyzer:
    """Comprehensive distribution analysis toolkit.

    Wraps a 1-D numeric sample and provides descriptive statistics,
    maximum-likelihood fitting of candidate distributions (ranked by AIC),
    normality testing, outlier detection, diagnostic plots, and bootstrap
    confidence intervals.
    """

    # Candidate continuous distributions tried by fit_distributions().
    # Keys are scipy.stats names so they can be passed to stats.kstest.
    DISTRIBUTIONS = {
        'norm': stats.norm,
        'lognorm': stats.lognorm,
        'expon': stats.expon,
        'gamma': stats.gamma,
        'weibull_min': stats.weibull_min,
        'beta': stats.beta,
        'pareto': stats.pareto,
        't': stats.t
    }

    def __init__(self, data: np.ndarray):
        """Store the sample.

        Args:
            data: 1-D array-like of numeric observations.
        """
        self.data = np.asarray(data)
        self.n = len(self.data)
        self.results = {}  # populated by fit_distributions()

    def descriptive_stats(self) -> Dict:
        """Return a dict of descriptive statistics for the sample.

        Note: 'std'/'var' use NumPy's default ddof=0 (population
        convention), not the sample (n-1) convention.
        """
        return {
            'n': self.n,
            'mean': np.mean(self.data),
            'median': np.median(self.data),
            'std': np.std(self.data),
            'var': np.var(self.data),
            'min': np.min(self.data),
            'max': np.max(self.data),
            'range': np.ptp(self.data),
            'skewness': stats.skew(self.data),
            'kurtosis': stats.kurtosis(self.data),
            'q25': np.percentile(self.data, 25),
            'q50': np.percentile(self.data, 50),
            'q75': np.percentile(self.data, 75),
            'iqr': stats.iqr(self.data)
        }

    def fit_distributions(self) -> Dict:
        """Fit every candidate distribution and rank by goodness-of-fit.

        Returns:
            Dict with keys:
              'all': per-distribution results (or {'error': msg} on failure),
              'best': (name, result) tuple with the lowest AIC, or None,
              'ranking': list of (name, aic) sorted ascending by AIC.
        """
        results = {}
        for name, dist in self.DISTRIBUTIONS.items():
            try:
                # Maximum-likelihood fit; params include loc/scale.
                params = dist.fit(self.data)
                log_likelihood = np.sum(dist.logpdf(self.data, *params))
                # Information criteria penalise the parameter count.
                k = len(params)
                aic = 2 * k - 2 * log_likelihood
                bic = k * np.log(self.n) - 2 * log_likelihood
                # NOTE: this K-S p-value is optimistic because the
                # parameters were estimated from the same sample; treat
                # it as a rough screen, not an exact test.
                ks_stat, ks_pvalue = stats.kstest(self.data, name, params)
                results[name] = {
                    'params': params,
                    'log_likelihood': log_likelihood,
                    'aic': aic,
                    'bic': bic,
                    'ks_statistic': ks_stat,
                    'ks_pvalue': ks_pvalue
                }
            except Exception as e:
                # Some fits legitimately fail (e.g. beta on data outside
                # its support); record the failure and keep going.
                results[name] = {'error': str(e)}
        # Rank only the successful fits by AIC (lower is better).
        valid = {name: res for name, res in results.items() if 'aic' in res}
        ranked = sorted(valid.items(), key=lambda item: item[1]['aic'])
        self.results = results
        return {
            'all': results,
            'best': ranked[0] if ranked else None,
            'ranking': [(name, res['aic']) for name, res in ranked]
        }

    def test_normality(self) -> Dict:
        """Run several complementary normality tests.

        Each entry reports the statistic, a p-value where the test
        provides one, and an 'is_normal' flag at the 5% level
        (Anderson-Darling has critical values instead of a p-value).
        """
        results = {}
        # Shapiro-Wilk: best power for n < 5000.
        if self.n < 5000:
            stat, pvalue = stats.shapiro(self.data)
            results['shapiro_wilk'] = {
                'statistic': stat,
                'pvalue': pvalue,
                'is_normal': pvalue > 0.05
            }
        # D'Agostino-Pearson: combines skewness and kurtosis; needs n >= 20.
        if self.n >= 20:
            stat, pvalue = stats.normaltest(self.data)
            results['dagostino_pearson'] = {
                'statistic': stat,
                'pvalue': pvalue,
                'is_normal': pvalue > 0.05
            }
        # Anderson-Darling: look up the 5% critical value by its
        # significance level rather than assuming a fixed array position.
        result = stats.anderson(self.data, dist='norm')
        sig_levels = list(result.significance_level)
        idx_5pct = sig_levels.index(5.0) if 5.0 in sig_levels else 2
        results['anderson_darling'] = {
            'statistic': result.statistic,
            'critical_values': dict(zip(
                [f'{cv}%' for cv in result.significance_level],
                result.critical_values
            )),
            'is_normal': result.statistic < result.critical_values[idx_5pct]
        }
        # Kolmogorov-Smirnov against N(mean, std). NOTE: estimating the
        # parameters from the same data is the Lilliefors setting, so the
        # reported p-value is only a rough guide.
        stat, pvalue = stats.kstest(
            self.data, 'norm',
            args=(np.mean(self.data), np.std(self.data))
        )
        results['kolmogorov_smirnov'] = {
            'statistic': stat,
            'pvalue': pvalue,
            'is_normal': pvalue > 0.05
        }
        return results

    def detect_outliers(self, method: str = 'iqr') -> Dict:
        """Detect outliers using one of several methods.

        Args:
            method: 'iqr' (Tukey fences at 1.5*IQR), 'zscore' (|z| > 3),
                or 'mad' (modified z-score > 3.5).

        Raises:
            ValueError: if ``method`` is not a supported name (previously
                an unknown method silently returned an empty result).
        """
        results = {'method': method}
        if method == 'iqr':
            q1, q3 = np.percentile(self.data, [25, 75])
            iqr = q3 - q1
            lower = q1 - 1.5 * iqr
            upper = q3 + 1.5 * iqr
            outliers = self.data[(self.data < lower) | (self.data > upper)]
            results.update({
                'lower_bound': lower,
                'upper_bound': upper,
                'outliers': outliers,
                'n_outliers': len(outliers),
                'outlier_percentage': len(outliers) / self.n * 100
            })
        elif method == 'zscore':
            z_scores = np.abs(stats.zscore(self.data))
            threshold = 3
            outliers = self.data[z_scores > threshold]
            results.update({
                'threshold': threshold,
                'outliers': outliers,
                'n_outliers': len(outliers),
                'outlier_percentage': len(outliers) / self.n * 100
            })
        elif method == 'mad':
            median = np.median(self.data)
            mad = np.median(np.abs(self.data - median))
            threshold = 3.5
            if mad == 0:
                # Degenerate sample (most values equal the median): the
                # modified z-score is undefined, so flag nothing instead
                # of dividing by zero and producing NaNs.
                modified_z = np.zeros(self.n)
            else:
                # 0.6745 makes MAD a consistent estimator of sigma for
                # normal data (Iglewicz & Hoaglin modified z-score).
                modified_z = 0.6745 * (self.data - median) / mad
            outliers = self.data[np.abs(modified_z) > threshold]
            results.update({
                'median': median,
                'mad': mad,
                'outliers': outliers,
                'n_outliers': len(outliers),
                'outlier_percentage': len(outliers) / self.n * 100
            })
        else:
            raise ValueError(f"Unknown outlier method: {method!r}")
        return results

    def plot_analysis(self, figsize: Tuple = (15, 10)):
        """Generate a 2x3 grid of diagnostic plots and return the Figure.

        Requires matplotlib; the caller is responsible for saving or
        showing the returned figure.
        """
        fig, axes = plt.subplots(2, 3, figsize=figsize)
        # Histogram with a kernel-density overlay.
        axes[0, 0].hist(self.data, bins='auto', density=True, alpha=0.7)
        kde_x = np.linspace(self.data.min(), self.data.max(), 100)
        kde = stats.gaussian_kde(self.data)
        axes[0, 0].plot(kde_x, kde(kde_x), 'r-', lw=2)
        axes[0, 0].set_title('Histogram with KDE')
        # Q-Q plot against the normal distribution.
        stats.probplot(self.data, dist="norm", plot=axes[0, 1])
        axes[0, 1].set_title('Q-Q Plot (Normal)')
        # Box plot.
        axes[0, 2].boxplot(self.data, vert=True)
        axes[0, 2].set_title('Box Plot')
        # Empirical CDF (step function at each sorted observation).
        sorted_data = np.sort(self.data)
        ecdf = np.arange(1, self.n + 1) / self.n
        axes[1, 0].step(sorted_data, ecdf, where='post')
        axes[1, 0].set_title('Empirical CDF')
        # P-P plot against N(mean, std) estimated from the sample.
        theoretical = stats.norm.cdf(sorted_data,
                                     loc=np.mean(self.data),
                                     scale=np.std(self.data))
        axes[1, 1].scatter(theoretical, ecdf, alpha=0.5)
        axes[1, 1].plot([0, 1], [0, 1], 'r--')
        axes[1, 1].set_title('P-P Plot (Normal)')
        # Violin plot.
        axes[1, 2].violinplot(self.data)
        axes[1, 2].set_title('Violin Plot')
        plt.tight_layout()
        return fig

    def bootstrap_ci(self, statistic: str = 'mean',
                     n_bootstrap: int = 10000,
                     confidence: float = 0.95) -> Dict:
        """Percentile-bootstrap confidence interval for a statistic.

        Args:
            statistic: 'mean', 'median' or 'std' (KeyError otherwise).
            n_bootstrap: number of bootstrap resamples.
            confidence: interval coverage, e.g. 0.95 for a 95% CI.

        Returns:
            Dict with the point estimate, CI bounds, and the bootstrap
            standard error (std of the resampled statistic).
        """
        stat_func = {
            'mean': np.mean,
            'median': np.median,
            'std': np.std
        }[statistic]
        # Resample with replacement and recompute the statistic each time.
        bootstrap_stats = []
        for _ in range(n_bootstrap):
            sample = np.random.choice(self.data, size=self.n, replace=True)
            bootstrap_stats.append(stat_func(sample))
        alpha = 1 - confidence
        lower = np.percentile(bootstrap_stats, alpha / 2 * 100)
        upper = np.percentile(bootstrap_stats, (1 - alpha / 2) * 100)
        return {
            'statistic': statistic,
            'point_estimate': stat_func(self.data),
            'ci_lower': lower,
            'ci_upper': upper,
            'confidence': confidence,
            'standard_error': np.std(bootstrap_stats)
        }
# Example usage
import numpy as np

# Generate right-skewed sample data
np.random.seed(42)
data = np.random.lognormal(mean=2, sigma=0.5, size=1000)

# Create the analyzer
analyzer = DistributionAnalyzer(data)

# Descriptive statistics
print("Descriptive Stats:")
print(analyzer.descriptive_stats())

# Fit distributions; guard against the case where nothing could be fitted
print("\nDistribution Fitting:")
fit_results = analyzer.fit_distributions()
if fit_results['best'] is not None:
    best_name, best_res = fit_results['best']
    print(f"Best fit: {best_name}")
    print(f"AIC: {best_res['aic']:.2f}")
else:
    print("No distribution could be fitted.")

# Test normality.  Anderson-Darling reports critical values instead of a
# p-value; the original formatted the 'N/A' string fallback with ':.4f',
# which raises ValueError and crashed the script here.
print("\nNormality Tests:")
norm_results = analyzer.test_normality()
for test, result in norm_results.items():
    pvalue = result.get('pvalue')
    if pvalue is None:
        print(f"{test}: statistic = {result['statistic']:.4f} (no p-value)")
    else:
        print(f"{test}: p-value = {pvalue:.4f}")

# Detect outliers
print("\nOutlier Detection:")
outliers = analyzer.detect_outliers(method='iqr')
print(f"Found {outliers['n_outliers']} outliers ({outliers['outlier_percentage']:.1f}%)")

# Bootstrap confidence interval
print("\nBootstrap Confidence Interval:")
ci = analyzer.bootstrap_ci('mean', confidence=0.95)
print(f"Mean: {ci['point_estimate']:.2f} [{ci['ci_lower']:.2f}, {ci['ci_upper']:.2f}]")

# Generate and save the diagnostic plots
fig = analyzer.plot_analysis()
plt.savefig('distribution_analysis.png')
Symmetric data:
- Start with Normal
- If heavy tails: Student's t
- If lighter tails: Consider uniform
Right-skewed data:
- Log-normal for multiplicative
- Gamma for waiting times
- Weibull for reliability
- Exponential if memoryless
Left-skewed data:
- Reflect and fit right-skewed
- Beta with appropriate parameters
Bounded data [0, 1]:
- Beta distribution
- Truncated normal if near-normal
Count data:
- Poisson if mean ≈ variance
- Negative binomial if overdispersed
- Binomial for fixed trials
Heavy tails:
- Pareto for power law
- Student's t
- Cauchy (extreme)
Visual inspection alone:
Problem: Histograms can be misleading
Solution: Use multiple tests and Q-Q plots
Ignoring sample size:
Problem: Tests behave differently with n
Solution: Shapiro-Wilk for small, K-S for large
Single test reliance:
Problem: Each test has assumptions
Solution: Use multiple complementary tests
Overfitting:
Problem: Complex distribution fits noise
Solution: AIC/BIC penalize parameters
Parameter uncertainty:
Problem: Point estimates hide uncertainty
Solution: Bootstrap confidence intervals
Weekly Installs
48
Repository
GitHub Stars
3
First Seen
Jan 29, 2026
Security Audits
Gen Agent Trust Hub: Pass · Socket: Pass · Snyk: Pass
Installed on
github-copilot: 48
opencode: 47
codex: 47
kimi-cli: 47
gemini-cli: 47
amp: 47
前端代码审计工具 - 自动化检测可访问性、性能、响应式设计、主题化与反模式
57,700 周安装
DynamoDB-Toolbox v2 TypeScript 模式指南:单表设计与类型安全CRUD
167 周安装
自动化生命科学API发现与工具创建 | 生物信息学API集成与验证工作流
167 周安装
.NET 架构师专家:C#、ASP.NET Core、微服务与云原生架构设计与优化
170 周安装
Tauri v2 桌面应用开发指南:Rust 核心与 WebView 前端构建跨平台应用
168 周安装
单元测试框架指南:AAA模式、最佳实践与多语言示例
170 周安装
TypeScript React 模式指南:35+规则构建类型安全、可维护的React应用
167 周安装