本文目录导读:
下面介绍几个 Python 统计工具的实战案例,从基础到高级,每个案例都包含完整的代码实现。
基础统计分析工具
描述性统计分析器
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
class DescriptiveStatsAnalyzer:
    """Descriptive-statistics helper for a numeric sample.

    The sample is converted to a NumPy array once in the constructor; each
    method computes its figures from that array and returns them in a dict
    whose keys are the (Chinese) labels printed by the usage example.
    """

    def __init__(self, data):
        """Store ``data`` as a NumPy array.

        Args:
            data: Any array-like accepted by ``np.array``.
        """
        self.data = np.array(data)

    def basic_stats(self):
        """Compute the basic summary statistics of the sample.

        Returns:
            dict: sample size, mean, median, mode, standard deviation and
            variance (both with ``ddof=1``), minimum, maximum, range,
            skewness and kurtosis (``stats.skew`` / ``stats.kurtosis`` with
            their default arguments).
        """
        # Depending on the SciPy version, ``stats.mode`` yields either a
        # length-1 array or a plain scalar as its first field. Flattening
        # before indexing works for both; the previous ``[0][0]`` raised an
        # "invalid index to scalar" error on the scalar form.
        mode_value = np.array(stats.mode(self.data)[0]).reshape(-1)[0]

        return {
            '样本数量': len(self.data),
            '均值': np.mean(self.data),
            '中位数': np.median(self.data),
            '众数': mode_value,
            '标准差': np.std(self.data, ddof=1),
            '方差': np.var(self.data, ddof=1),
            '最小值': np.min(self.data),
            '最大值': np.max(self.data),
            '范围': np.ptp(self.data),
            '偏度': stats.skew(self.data),
            '峰度': stats.kurtosis(self.data),
        }

    def percentile_stats(self):
        """Return the 25th, 50th, 75th, 90th, 95th and 99th percentiles.

        Returns:
            dict: one entry per percentile, keyed ``'<p>%分位数'``.
        """
        percentiles = [25, 50, 75, 90, 95, 99]
        return {
            f'{p}%分位数': np.percentile(self.data, p)
            for p in percentiles
        }

    def outlier_detection(self):
        """Flag outliers with the 1.5 * IQR rule.

        Returns:
            dict: lower and upper fences, the number of values outside
            them, that number as a percentage of the sample, and the
            outlying values themselves as a list.
        """
        q1 = np.percentile(self.data, 25)
        q3 = np.percentile(self.data, 75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        is_outlier = (self.data < lower_bound) | (self.data > upper_bound)
        outliers = self.data[is_outlier]
        return {
            '下界': lower_bound,
            '上界': upper_bound,
            '异常值数量': len(outliers),
            '异常值比例': len(outliers) / len(self.data) * 100,
            '异常值': outliers.tolist(),
        }
# Usage example
if __name__ == "__main__":
    # Reproducible normal sample with four extreme values appended so the
    # outlier report has something to show.
    np.random.seed(42)
    base_sample = np.random.normal(50, 15, 1000)
    data = np.append(base_sample, [150, -20, 200, -30])
    analyzer = DescriptiveStatsAnalyzer(data)

    print("=== 基本统计量 ===")
    for label, number in analyzer.basic_stats().items():
        print(f"{label}: {number:.2f}")

    print("\n=== 百分位数 ===")
    for label, number in analyzer.percentile_stats().items():
        print(f"{label}: {number:.2f}")

    print("\n=== 异常值检测 ===")
    outlier_info = analyzer.outlier_detection()
    for label in outlier_info:
        print(f"{label}: {outlier_info[label]}")
假设检验工具
import scipy.stats as stats
import numpy as np
from typing import Tuple, Dict
class HypothesisTestTool:
    """Thin wrappers around common SciPy hypothesis tests.

    Every method returns a dict with the test statistic, the p-value and a
    textual conclusion under the key ``'结论'``. That key was garbled to
    ``'#39;`` in the source, which made the class a syntax error; the
    restored name is a reconstruction. The conclusion compares the p-value
    with ``alpha`` (default 0.05, the value previously hard-coded).
    """

    @staticmethod
    def t_test_one_sample(sample: np.ndarray,
                          pop_mean: float,
                          alternative: str = 'two-sided',
                          alpha: float = 0.05) -> Dict:
        """Run a one-sample t-test of ``sample`` against ``pop_mean``.

        Args:
            sample: Observations to test.
            pop_mean: Mean under the null hypothesis.
            alternative: Passed through to ``stats.ttest_1samp``.
            alpha: Significance level used for the conclusion text.

        Returns:
            dict: method name, t statistic, p-value, sample mean, the
            hypothesised mean and the conclusion.
        """
        t_stat, p_value = stats.ttest_1samp(sample, pop_mean, alternative=alternative)
        return {
            '检验方法': '单样本t检验',
            't统计量': t_stat,
            'p值': p_value,
            '样本均值': np.mean(sample),
            '原假设均值': pop_mean,
            '结论': '拒绝原假设' if p_value < alpha else '不能拒绝原假设'
        }

    @staticmethod
    def t_test_two_samples(sample1: np.ndarray,
                           sample2: np.ndarray,
                           equal_var: bool = True,
                           alpha: float = 0.05) -> Dict:
        """Run an independent two-sample t-test.

        Args:
            sample1: First group of observations.
            sample2: Second group of observations.
            equal_var: Passed through to ``stats.ttest_ind``.
            alpha: Significance level used for the conclusion text.

        Returns:
            dict: method name, t statistic, p-value, both group means,
            their difference and the conclusion.
        """
        t_stat, p_value = stats.ttest_ind(sample1, sample2, equal_var=equal_var)
        mean1 = np.mean(sample1)
        mean2 = np.mean(sample2)
        return {
            '检验方法': '独立样本t检验',
            't统计量': t_stat,
            'p值': p_value,
            '样本1均值': mean1,
            '样本2均值': mean2,
            '均值差': mean1 - mean2,
            '结论': '两组均值有显著差异' if p_value < alpha else '两组均值无显著差异'
        }

    @staticmethod
    def chi_square_test(observed: np.ndarray,
                        expected: np.ndarray = None,
                        alpha: float = 0.05) -> Dict:
        """Run a chi-square test.

        Without ``expected`` the input is handed to
        ``stats.chi2_contingency``; with ``expected`` it is handed to
        ``stats.chisquare`` and the degrees of freedom are taken as
        ``len(observed) - 1``.

        Args:
            observed: Observed frequencies.
            expected: Optional expected frequencies.
            alpha: Significance level used for the conclusion text.

        Returns:
            dict: method name, chi-square statistic, degrees of freedom,
            p-value and the conclusion.
        """
        if expected is None:
            # The expected-frequency table is returned too but not reported.
            chi2_stat, p_value, dof, _ = stats.chi2_contingency(observed)
        else:
            chi2_stat, p_value = stats.chisquare(observed, expected)
            dof = len(observed) - 1
        return {
            '检验方法': '卡方检验',
            '卡方统计量': chi2_stat,
            '自由度': dof,
            'p值': p_value,
            '结论': '拒绝原假设' if p_value < alpha else '不能拒绝原假设'
        }

    @staticmethod
    def anova_test(*groups: np.ndarray, alpha: float = 0.05) -> Dict:
        """Run a one-way ANOVA across ``groups``.

        Args:
            *groups: One array of observations per group.
            alpha: Keyword-only significance level for the conclusion text.

        Returns:
            dict: method name, F statistic, p-value, list of group means
            and the conclusion.
        """
        f_stat, p_value = stats.f_oneway(*groups)
        group_means = [np.mean(g) for g in groups]
        return {
            '检验方法': '单因素方差分析',
            'F统计量': f_stat,
            'p值': p_value,
            '各组均值': group_means,
            '结论': '组间存在显著差异' if p_value < alpha else '组间无显著差异'
        }
# Usage example
if __name__ == "__main__":
    np.random.seed(42)

    # One-sample t-test: is the sample mean different from 50?
    sample = np.random.normal(52, 10, 100)
    result = HypothesisTestTool.t_test_one_sample(sample, 50)
    print("=== 单样本t检验 ===")
    for field in result:
        print(f"{field}: {result[field]}")

    # Two-sample t-test on two independently drawn groups.
    sample1 = np.random.normal(50, 10, 100)
    sample2 = np.random.normal(55, 10, 100)
    result = HypothesisTestTool.t_test_two_samples(sample1, sample2)
    print("\n=== 双样本t检验 ===")
    for field in result:
        print(f"{field}: {result[field]}")
相关性分析工具
import pandas as pd
import numpy as np
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
class CorrelationAnalyzer:
    """Correlation analysis over the columns of a DataFrame."""

    def __init__(self, dataframe: pd.DataFrame):
        """Keep a reference to ``dataframe`` (it is not copied)."""
        self.df = dataframe

    def _paired(self, col1: str, col2: str):
        """Return both columns restricted to rows where neither is missing."""
        x, y = self.df[col1], self.df[col2]
        keep = x.notna() & y.notna()
        return x[keep], y[keep]

    def _describe(self, corr: float, p_value: float) -> Dict:
        """Build the result dict shared by the Pearson and Spearman methods."""
        return {
            '相关系数': corr,
            'p值': p_value,
            '相关性强度': self._interpret_correlation(corr),
            '显著性': '显著' if p_value < 0.05 else '不显著'
        }

    def pearson_correlation(self, col1: str, col2: str) -> Dict:
        """Compute the Pearson correlation between two columns.

        Rows with a missing value in either column are dropped first so
        they cannot affect the coefficient.

        Returns:
            dict: coefficient, p-value, strength label and a significance
            label based on ``p < 0.05``.
        """
        x, y = self._paired(col1, col2)
        corr, p_value = stats.pearsonr(x, y)
        return self._describe(corr, p_value)

    def spearman_correlation(self, col1: str, col2: str) -> Dict:
        """Compute the Spearman rank correlation between two columns.

        Rows with a missing value in either column are dropped first.

        Returns:
            dict: same layout as :meth:`pearson_correlation`.
        """
        x, y = self._paired(col1, col2)
        corr, p_value = stats.spearmanr(x, y)
        return self._describe(corr, p_value)

    @staticmethod
    def _interpret_correlation(corr: float) -> str:
        """Map the absolute coefficient onto a five-level strength label."""
        abs_corr = abs(corr)
        if abs_corr >= 0.8:
            return "极强相关"
        elif abs_corr >= 0.6:
            return "强相关"
        elif abs_corr >= 0.4:
            return "中等相关"
        elif abs_corr >= 0.2:
            return "弱相关"
        else:
            return "极弱或无关"

    def correlation_matrix(self) -> pd.DataFrame:
        """Return the pairwise correlation matrix of the numeric columns.

        Non-numeric columns are excluded explicitly; calling ``corr()`` on a
        frame that still contains e.g. string columns raises on pandas
        versions that no longer drop them silently.
        """
        return self.df.select_dtypes(include=[np.number]).corr()

    def plot_correlation_matrix(self, figsize: Tuple = (10, 8)):
        """Draw the correlation matrix as an annotated heatmap.

        Args:
            figsize: Figure size forwarded to ``plt.figure``.

        Returns:
            The current Matplotlib figure.
        """
        plt.figure(figsize=figsize)
        sns.heatmap(self.correlation_matrix(),
                    annot=True,
                    cmap='coolwarm',
                    center=0,
                    fmt='.2f')
        plt.title('相关矩阵热力图')
        plt.tight_layout()
        return plt.gcf()
# Usage example
if __name__ == "__main__":
    # Build a reproducible demo frame; the columns are drawn in this order
    # so the random stream matches the seed.
    np.random.seed(42)
    df = pd.DataFrame({
        '年龄': np.random.randint(20, 60, 100),
        '收入': np.random.normal(50000, 15000, 100),
        '教育年限': np.random.randint(8, 22, 100),
        '工作经验': np.random.randint(0, 35, 100)
    })

    # Make income depend on years of education, plus some noise.
    education_effect = df['教育年限'] * 2000
    df['收入'] = df['收入'] + education_effect + np.random.normal(0, 5000, 100)

    analyzer = CorrelationAnalyzer(df)

    print("=== 皮尔逊相关分析 ===")
    result = analyzer.pearson_correlation('教育年限', '收入')
    for field in result:
        print(f"{field}: {result[field]}")

    print("\n=== 相关矩阵 ===")
    print(analyzer.correlation_matrix())
回归分析工具
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import matplotlib.pyplot as plt
class RegressionAnalyzer:
    """Linear-regression helper combining statsmodels and scikit-learn."""

    def __init__(self, X, y):
        """Store the design matrix and the response as NumPy arrays.

        Args:
            X: Features; a 1-D input is reshaped to a single column.
            y: Response values.
        """
        self.X = np.array(X)
        self.y = np.array(y)
        if len(self.X.shape) == 1:
            self.X = self.X.reshape(-1, 1)

    def linear_regression(self, add_constant: bool = True) -> Dict:
        """Fit an OLS model with statsmodels.

        Args:
            add_constant: When true, ``sm.add_constant`` is applied to the
                features before fitting.

        Returns:
            dict: model summary, R², adjusted R², parameters, p-values,
            AIC, BIC, F statistic and its p-value.
        """
        design = sm.add_constant(self.X) if add_constant else self.X
        model = sm.OLS(self.y, design).fit()
        return {
            '模型摘要': model.summary(),
            'R²': model.rsquared,
            '调整R²': model.rsquared_adj,
            '系数': model.params,
            'p值': model.pvalues,
            'AIC': model.aic,
            'BIC': model.bic,
            'F统计量': model.fvalue,
            'F_p值': model.f_pvalue
        }

    def predict_and_evaluate(self, X_test, y_test) -> Dict:
        """Fit scikit-learn's ``LinearRegression`` and score it on test data.

        Args:
            X_test: Test features; a 1-D input is reshaped to a single
                column, mirroring what the constructor does for ``X``.
            y_test: True responses for ``X_test``.

        Returns:
            dict: R² score, MSE, RMSE, the predictions and ``y_test``.
        """
        X_test = np.array(X_test)
        if len(X_test.shape) == 1:
            X_test = X_test.reshape(-1, 1)

        model = LinearRegression()
        model.fit(self.X, self.y)
        y_pred = model.predict(X_test)

        # Compute the MSE once and derive the RMSE from it.
        mse = mean_squared_error(y_test, y_pred)
        return {
            'R²分数': r2_score(y_test, y_pred),
            '均方误差': mse,
            '均方根误差': np.sqrt(mse),
            '预测值': y_pred,
            '实际值': y_test
        }

    def plot_regression(self, x_label: str = 'X', y_label: str = 'y'):
        """Scatter the data and overlay the fitted regression line.

        Only a single feature can be drawn on a 2-D plot, so this is
        checked before any plotting is attempted.

        Args:
            x_label: Label for the x axis.
            y_label: Label for the y axis.

        Returns:
            The current Matplotlib figure.

        Raises:
            ValueError: If ``X`` does not have exactly one feature column.
        """
        if len(self.X.shape) != 2 or self.X.shape[1] != 1:
            raise ValueError("plot_regression 仅支持单特征回归")

        plt.figure(figsize=(10, 6))
        plt.scatter(self.X[:, 0], self.y, alpha=0.7, label='实际数据')

        model = LinearRegression()
        model.fit(self.X, self.y)
        x_range = np.linspace(self.X.min(), self.X.max(), 100).reshape(-1, 1)
        y_pred = model.predict(x_range)
        plt.plot(x_range, y_pred, 'r-', label='回归线', linewidth=2)

        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.title(f'线性回归分析 (R² = {model.score(self.X, self.y):.3f})')
        plt.legend()
        plt.grid(True, alpha=0.3)
        return plt.gcf()
# Usage example
if __name__ == "__main__":
    # Two-feature synthetic data with known coefficients plus noise.
    np.random.seed(42)
    X = np.random.randn(100, 2)
    noise = np.random.randn(100) * 0.5
    y = 3 + 2 * X[:, 0] + 1.5 * X[:, 1] + noise

    analyzer = RegressionAnalyzer(X, y)
    print("=== 线性回归分析 ===")
    result = analyzer.linear_regression()

    r_squared = result['R²']
    adjusted_r_squared = result['调整R²']
    print(f"R²: {r_squared:.4f}")
    print(f"调整R²: {adjusted_r_squared:.4f}")
    print(f"系数:\n{result['系数']}")
    print(f"p值:\n{result['p值']}")
数据可视化统计工具
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Tuple
class StatisticalVisualizer:
    """Collection of statistical plots; every figure created is remembered."""

    def __init__(self, style: str = 'seaborn-v0_8-darkgrid'):
        """Activate a Matplotlib style and start with no figures.

        Args:
            style: Style name handed to ``plt.style.use``.
        """
        plt.style.use(style)
        self.figures = []

    def distribution_plot(self, data, bins: int = 30,
                          title: str = '分布图'):
        """Draw a density histogram next to a histogram with a KDE curve.

        Args:
            data: Values to plot.
            bins: Number of histogram bins for both panels.
            title: Prefix for both panel titles.

        Returns:
            The created figure (also appended to ``self.figures``).
        """
        fig, axes = plt.subplots(1, 2, figsize=(12, 5))

        # Left panel: plain density histogram.
        axes[0].hist(data, bins=bins, density=True, alpha=0.7,
                     color='skyblue', edgecolor='black')
        axes[0].set_title(f'{title} - 直方图')
        axes[0].set_xlabel('值')
        axes[0].set_ylabel('频率')

        # Right panel: target the second axes explicitly instead of relying
        # on it being the current axes, and plot densities so the bars
        # agree with the axis label set below.
        sns.histplot(data, kde=True, bins=bins, stat='density',
                     color='salmon', alpha=0.6, ax=axes[1])
        axes[1].set_title(f'{title} - 核密度估计')
        axes[1].set_xlabel('值')
        axes[1].set_ylabel('密度')

        plt.tight_layout()
        self.figures.append(fig)
        return fig

    def boxplot_comparison(self, data_dict: dict,
                           title: str = '箱线图比较'):
        """Draw one box per entry of ``data_dict`` with mean/median notes.

        Args:
            data_dict: Mapping of group label to its values.
            title: Figure title.

        Returns:
            The created figure (also appended to ``self.figures``).
        """
        fig, ax = plt.subplots(figsize=(10, 6))

        data_list = list(data_dict.values())
        labels = list(data_dict.keys())
        bp = ax.boxplot(data_list, labels=labels, patch_artist=True)

        # Reuse the palette cyclically so a fifth and later box is still
        # coloured; the first four boxes get the same colours as before.
        colors = ['lightblue', 'lightgreen', 'salmon', 'lightyellow']
        for index, patch in enumerate(bp['boxes']):
            patch.set_facecolor(colors[index % len(colors)])

        # Annotate each box with its mean and median at the top of the axes.
        for position, values in enumerate(data_list, start=1):
            stats_text = f'均值: {np.mean(values):.2f}\n中位数: {np.median(values):.2f}'
            ax.text(position, ax.get_ylim()[1], stats_text,
                    ha='center', va='bottom', fontsize=8)

        ax.set_title(title)
        ax.set_ylabel('值')
        ax.grid(True, alpha=0.3)
        plt.tight_layout()
        self.figures.append(fig)
        return fig

    def qq_plot(self, data, dist='norm',
                title: str = 'Q-Q图'):
        """Draw a Q-Q plot and annotate it with the Shapiro-Wilk p-value.

        Args:
            data: Sample to compare against the reference distribution.
            dist: Distribution forwarded to ``stats.probplot``.
            title: Figure title.

        Returns:
            The created figure (also appended to ``self.figures``).
        """
        from scipy import stats

        fig, ax = plt.subplots(figsize=(8, 8))
        stats.probplot(data, dist=dist, plot=ax)
        ax.set_title(title)
        ax.set_xlabel('理论分位数')
        ax.set_ylabel('样本分位数')
        ax.grid(True, alpha=0.3)

        # Only the p-value of the Shapiro-Wilk test is displayed.
        _, shapiro_p = stats.shapiro(data)
        ax.text(0.05, 0.95, f'Shapiro-Wilk p = {shapiro_p:.4f}',
                transform=ax.transAxes, fontsize=10,
                bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

        plt.tight_layout()
        self.figures.append(fig)
        return fig
# Usage example
if __name__ == "__main__":
    visualizer = StatisticalVisualizer()

    # Three reproducible samples, drawn in the order normal, exponential,
    # uniform (dict values are evaluated left to right).
    np.random.seed(42)
    data_dict = {
        '正态分布': np.random.normal(50, 15, 500),
        '偏态分布': np.random.exponential(10, 500),
        '均匀分布': np.random.uniform(0, 100, 500)
    }
    normal_data = data_dict['正态分布']

    # Distribution of the normal sample.
    visualizer.distribution_plot(normal_data, title='正态分布数据')

    # Side-by-side box plots of all three samples.
    visualizer.boxplot_comparison(data_dict, title='不同分布比较')

    # Normality check on the normal sample.
    visualizer.qq_plot(normal_data, title='正态性检验Q-Q图')

    plt.show()
完整统计分析流程案例
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
class CompleteStatisticalAnalysis:
    """End-to-end analysis pipeline: explore, test, correlate, model, report.

    Each stage stores its output under a key of ``self.results``;
    :meth:`generate_report` renders whichever stages have been run.
    """

    def __init__(self, data: pd.DataFrame, target_col: str = None):
        """Remember the data set and the optional target column name."""
        self.data = data
        self.target_col = target_col
        self.results = {}

    def data_exploration(self) -> Dict:
        """Collect shape, dtypes, missing-value counts and summary statistics.

        Returns:
            dict: the whole ``self.results`` mapping after adding the keys
            ``'basic_info'``, ``'descriptive'`` and ``'skew_kurt'``.
        """
        print("=" * 50)
        print("数据探索")
        print("=" * 50)

        missing = self.data.isnull().sum()
        self.results['basic_info'] = {
            'shape': self.data.shape,
            'dtypes': self.data.dtypes,
            'missing_values': missing,
            'missing_percentage': missing / len(self.data) * 100
        }
        self.results['descriptive'] = self.data.describe()

        # Skewness and kurtosis of every numeric column, ignoring NaNs.
        numeric_cols = self.data.select_dtypes(include=[np.number]).columns
        skew_kurt = {}
        for col in numeric_cols:
            values = self.data[col].dropna()
            skew_kurt[col] = {
                'skewness': stats.skew(values),
                'kurtosis': stats.kurtosis(values)
            }
        self.results['skew_kurt'] = skew_kurt
        return self.results

    def hypothesis_testing(self, alpha: float = 0.05) -> Dict:
        """Run Shapiro-Wilk on numeric columns and t-tests for a binary target.

        The t-tests are only performed when ``target_col`` is set, exists in
        the data and has exactly two distinct values; targets with more
        levels get no group comparison.

        Args:
            alpha: Significance level for the ``is_normal`` and
                ``significant`` flags.

        Returns:
            dict: ``'normality'`` plus one ``'t_test_<column>'`` entry per
            compared column; also stored under
            ``self.results['hypothesis_tests']``.
        """
        print("=" * 50)
        print("假设检验")
        print("=" * 50)
        test_results = {}
        numeric_cols = self.data.select_dtypes(include=[np.number]).columns

        normality_tests = {}
        for col in numeric_cols:
            _, p_value = stats.shapiro(self.data[col].dropna())
            normality_tests[col] = {
                'test': 'Shapiro-Wilk',
                'p_value': p_value,
                'is_normal': p_value > alpha
            }
        test_results['normality'] = normality_tests

        if self.target_col and self.target_col in self.data.columns:
            target = self.data[self.target_col]
            if target.nunique() == 2:
                # Split the rows by the two target levels once.
                first_level, second_level = target.unique()[:2]
                group1 = self.data[target == first_level]
                group2 = self.data[target == second_level]
                for col in numeric_cols:
                    if col == self.target_col:
                        continue
                    t_stat, p_value = stats.ttest_ind(
                        group1[col].dropna(),
                        group2[col].dropna()
                    )
                    test_results[f't_test_{col}'] = {
                        't_statistic': t_stat,
                        'p_value': p_value,
                        'significant': p_value < alpha
                    }

        self.results['hypothesis_tests'] = test_results
        return test_results

    def correlation_analysis(self, method: str = 'pearson') -> pd.DataFrame:
        """Correlate the numeric columns and list pairs with |r| > 0.7.

        Args:
            method: Correlation method forwarded to ``DataFrame.corr``.

        Returns:
            pd.DataFrame: the correlation matrix. The matrix and the list
            of high-correlation pairs are stored under
            ``self.results['correlation']``.
        """
        print("=" * 50)
        print("相关性分析")
        print("=" * 50)
        numeric_data = self.data.select_dtypes(include=[np.number])
        corr_matrix = numeric_data.corr(method=method)

        # Walk the upper triangle so every pair is reported once.
        columns = corr_matrix.columns
        high_corr = []
        for i in range(len(columns)):
            for j in range(i + 1, len(columns)):
                value = corr_matrix.iloc[i, j]
                if abs(value) > 0.7:
                    high_corr.append({
                        'var1': columns[i],
                        'var2': columns[j],
                        'correlation': value
                    })

        self.results['correlation'] = {
            'matrix': corr_matrix,
            'high_correlations': high_corr
        }
        return corr_matrix

    def regression_modeling(self, features: List[str] = None) -> Dict:
        """Fit a linear regression of the target on standardised features.

        Missing values are filled with column means, the data is split
        80/20 with ``random_state=42``, and the scaler is fitted on the
        training part only.

        Args:
            features: Feature columns; defaults to every numeric column
                other than the target.

        Returns:
            dict: train/test R² and RMSE, coefficients, intercept and a
            frame of features sorted by absolute coefficient; also stored
            under ``self.results['regression']``.

        Raises:
            ValueError: If no target column was specified.
        """
        print("=" * 50)
        print("回归建模")
        print("=" * 50)
        if not self.target_col:
            raise ValueError("需要指定目标变量")

        if features is None:
            # Select by numeric kind rather than by the exact dtype names
            # 'int64'/'float64', so int32/float32 columns are not dropped.
            numeric_cols = self.data.select_dtypes(include=[np.number]).columns
            features = [col for col in numeric_cols if col != self.target_col]

        X = self.data[features]
        y = self.data[self.target_col]
        X = X.fillna(X.mean())
        y = y.fillna(y.mean())

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        model = LinearRegression()
        model.fit(X_train_scaled, y_train)
        y_pred_train = model.predict(X_train_scaled)
        y_pred_test = model.predict(X_test_scaled)

        model_results = {
            'train_r2': r2_score(y_train, y_pred_train),
            'test_r2': r2_score(y_test, y_pred_test),
            'train_rmse': np.sqrt(mean_squared_error(y_train, y_pred_train)),
            'test_rmse': np.sqrt(mean_squared_error(y_test, y_pred_test)),
            'coefficients': dict(zip(features, model.coef_)),
            'intercept': model.intercept_,
            'feature_importance': pd.DataFrame({
                'feature': features,
                'coefficient': model.coef_
            }).sort_values('coefficient', key=abs, ascending=False)
        }
        self.results['regression'] = model_results
        return model_results

    @staticmethod
    def _format_metric(value) -> str:
        """Format a metric with four decimals, or 'N/A' when it is absent."""
        return 'N/A' if value is None else f"{value:.4f}"

    def generate_report(self) -> str:
        """Render the stored results as a plain-text report.

        Stages that have not been run are skipped or shown as ``N/A``; the
        method no longer fails when :meth:`data_exploration` was not called
        first.

        Returns:
            str: the report, one section per available result.
        """
        report = []
        report.append("=" * 60)
        report.append("统计分析报告")
        report.append("=" * 60)
        report.append("")

        basic_info = self.results.get('basic_info', {})
        missing = basic_info.get('missing_values')
        # Only sum when the counts exist; the string fallback has no .sum().
        missing_total = 'N/A' if missing is None else missing.sum()
        report.append("1. 数据基本信息")
        report.append(f" 数据形状: {basic_info.get('shape', 'N/A')}")
        report.append(f" 缺失值数量: {missing_total}")
        report.append("")

        report.append("2. 描述性统计")
        desc = self.results.get('descriptive', pd.DataFrame())
        report.append(str(desc))
        report.append("")

        # No method of this class writes 'outliers'; the section appears
        # only if a caller stored that entry in ``self.results``.
        if 'outliers' in self.results:
            report.append("3. 异常值检测结果")
            report.append(f" 异常值数量: {self.results['outliers']['count']}")
            report.append(f" 异常值比例: {self.results['outliers']['percentage']:.2f}%")
            report.append("")

        if 'hypothesis_tests' in self.results:
            report.append("4. 假设检验结果")
            norm_results = self.results['hypothesis_tests'].get('normality', {})
            for col, result in norm_results.items():
                report.append(f" {col}: {'正态分布' if result['is_normal'] else '非正态分布'}")
            report.append("")

        if 'regression' in self.results:
            report.append("5. 回归分析结果")
            reg = self.results['regression']
            report.append(f" 训练集 R²: {self._format_metric(reg.get('train_r2'))}")
            report.append(f" 测试集 R²: {self._format_metric(reg.get('test_r2'))}")
            report.append(f" 训练集 RMSE: {self._format_metric(reg.get('train_rmse'))}")
            report.append(f" 测试集 RMSE: {self._format_metric(reg.get('test_rmse'))}")
            report.append("")
            report.append(" 重要特征(按系数绝对值排序):")
            if 'feature_importance' in reg:
                report.append(str(reg['feature_importance'].head()))

        return "\n".join(report)
# Usage example
if __name__ == "__main__":
    # Reproducible synthetic survey data; columns are drawn in this order.
    np.random.seed(42)
    n_samples = 1000
    columns = {
        'age': np.random.randint(18, 80, n_samples),
        'income': np.random.normal(50000, 20000, n_samples),
        'education': np.random.randint(8, 22, n_samples),
        'experience': np.random.randint(0, 40, n_samples),
        'spending': np.random.normal(1000, 500, n_samples)
    }
    data = pd.DataFrame(columns)

    # Inject relationships: income depends on education and experience,
    # spending depends on income plus noise.
    data['income'] = data['income'] + 2000 * data['education'] + 1000 * data['experience']
    spending_noise = np.random.normal(0, 100, n_samples)
    data['spending'] = data['spending'] + 0.3 * data['income'] + spending_noise

    analyzer = CompleteStatisticalAnalysis(data, target_col='spending')

    # Run every stage in pipeline order, then print the report.
    analyzer.data_exploration()
    analyzer.hypothesis_testing()
    analyzer.correlation_analysis()
    analyzer.regression_modeling()

    report = analyzer.generate_report()
    print(report)
实用统计函数库
import numpy as np
import pandas as pd
from typing import List, Tuple, Dict, Union
import warnings
warnings.filterwarnings('ignore')
class StatisticsUtils:
    """Stateless statistical helpers grouped as static methods."""

    @staticmethod
    def calculate_confidence_interval(data: np.ndarray,
                                      confidence: float = 0.95) -> Tuple[float, float]:
        """Return a t-based confidence interval for the mean of ``data``.

        Args:
            data: Sample values.
            confidence: Confidence level, e.g. ``0.95``.

        Returns:
            tuple: ``(lower, upper)`` bounds around the sample mean.
        """
        import scipy.stats as stats

        n = len(data)
        mean = np.mean(data)
        se = stats.sem(data)
        # Half-width: standard error times the two-sided t quantile.
        h = se * stats.t.ppf((1 + confidence) / 2., n - 1)
        return (mean - h, mean + h)

    @staticmethod
    def effect_size_cohens_d(sample1: np.ndarray,
                             sample2: np.ndarray) -> float:
        """Return Cohen's d: the mean difference over the pooled SD.

        Args:
            sample1: First group.
            sample2: Second group.

        Returns:
            float: ``(mean1 - mean2) / pooled standard deviation``.
        """
        n1, n2 = len(sample1), len(sample2)
        var1, var2 = np.var(sample1, ddof=1), np.var(sample2, ddof=1)
        # This is a pooled standard deviation, not a standard error.
        pooled_sd = np.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2))
        return (np.mean(sample1) - np.mean(sample2)) / pooled_sd

    @staticmethod
    def bootstrap_ci(data: np.ndarray,
                     stat_function: callable = np.mean,
                     n_bootstrap: int = 1000,
                     ci: float = 0.95) -> Dict:
        """Estimate a percentile bootstrap confidence interval.

        Resampling uses a private generator seeded with 42, so results are
        reproducible without reseeding NumPy's global random state as a
        side effect of calling this function.

        Args:
            data: Sample to resample from.
            stat_function: Statistic computed on each resample.
            n_bootstrap: Number of resamples.
            ci: Confidence level of the interval.

        Returns:
            dict: statistic on the original data, mean and standard
            deviation of the bootstrap statistics, and the interval bounds.
        """
        rng = np.random.RandomState(42)
        boot_stats = np.array([
            stat_function(rng.choice(data, len(data), replace=True))
            for _ in range(n_bootstrap)
        ])

        lower_percentile = (1 - ci) / 2 * 100
        upper_percentile = (1 + ci) / 2 * 100
        return {
            'original_stat': stat_function(data),
            'bootstrap_mean': np.mean(boot_stats),
            'bootstrap_se': np.std(boot_stats),
            'ci_lower': np.percentile(boot_stats, lower_percentile),
            'ci_upper': np.percentile(boot_stats, upper_percentile)
        }

    @staticmethod
    def power_analysis(effect_size: float,
                       alpha: float = 0.05,
                       n_samples: int = None,
                       power: float = 0.8) -> Dict:
        """Solve for sample size or power of a two-sided independent t-test.

        Args:
            effect_size: Standardised effect size.
            alpha: Significance level.
            n_samples: Per-group sample size. When ``None`` the required
                size is solved for; otherwise the achieved power is.
            power: Target power, used only when solving for sample size.

        Returns:
            dict: either ``required_n_per_group`` (rounded up) or the
            computed ``power``, together with the inputs used.
        """
        from statsmodels.stats.power import TTestIndPower

        analysis = TTestIndPower()
        if n_samples is None:
            n = analysis.solve_power(
                effect_size=effect_size,
                alpha=alpha,
                power=power,
                alternative='two-sided'
            )
            return {
                'required_n_per_group': int(np.ceil(n)),
                'effect_size': effect_size,
                'alpha': alpha,
                'power': power
            }

        calculated_power = analysis.solve_power(
            effect_size=effect_size,
            nobs1=n_samples,
            alpha=alpha,
            alternative='two-sided'
        )
        return {
            'power': calculated_power,
            'effect_size': effect_size,
            'alpha': alpha,
            'n_per_group': n_samples
        }

    @staticmethod
    def outlier_removal_iqr(data: np.ndarray,
                            multiplier: float = 1.5) -> Dict:
        """Split ``data`` into inliers and outliers with the IQR rule.

        Args:
            data: Values to screen; lists and other array-likes are
                accepted and converted to an array.
            multiplier: Fence distance in units of the IQR.

        Returns:
            dict: the data as an array, cleaned data, outliers, outlier
            count and percentage, the fence bounds and the inlier mask.
        """
        # Convert up front so the comparisons and boolean indexing below
        # also work for plain Python sequences.
        data = np.asarray(data)
        q1 = np.percentile(data, 25)
        q3 = np.percentile(data, 75)
        iqr = q3 - q1
        lower_bound = q1 - multiplier * iqr
        upper_bound = q3 + multiplier * iqr

        mask = (data >= lower_bound) & (data <= upper_bound)
        n_outliers = np.sum(~mask)
        return {
            'original_data': data,
            'cleaned_data': data[mask],
            'outliers': data[~mask],
            'n_outliers': n_outliers,
            'outlier_percentage': n_outliers / len(data) * 100,
            'bounds': (lower_bound, upper_bound),
            'mask': mask
        }
# Usage example
if __name__ == "__main__":
    # Two reproducible normal samples whose means differ by five.
    np.random.seed(42)
    data1 = np.random.normal(50, 10, 100)
    data2 = np.random.normal(55, 10, 100)

    # t-based confidence interval for the first sample's mean.
    lower, upper = StatisticsUtils.calculate_confidence_interval(data1)
    print(f"95%置信区间: ({lower:.2f}, {upper:.2f})")

    # Effect size between the two samples.
    d = StatisticsUtils.effect_size_cohens_d(data1, data2)
    print(f"Cohen's d效应量: {d:.3f}")

    # Percentile bootstrap interval.
    bootstrap_result = StatisticsUtils.bootstrap_ci(data1)
    boot_lower = bootstrap_result['ci_lower']
    boot_upper = bootstrap_result['ci_upper']
    print(f"自助法置信区间: ({boot_lower:.2f}, {boot_upper:.2f})")

    # Sample size needed for a medium effect.
    power_result = StatisticsUtils.power_analysis(effect_size=0.5)
    print(f"所需样本量: {power_result['required_n_per_group']}")

    # IQR-based outlier screening.
    outlier_result = StatisticsUtils.outlier_removal_iqr(data1)
    print(f"异常值数量: {outlier_result['n_outliers']}")
这些统计工具案例涵盖了:
- 描述性统计:基本统计量、百分位数、异常值检测
- 假设检验:t检验、卡方检验、ANOVA
- 相关性分析:皮尔逊、斯皮尔曼相关系数
- 回归分析:线性回归、模型评估
- 数据可视化:分布图、箱线图、Q-Q图
- 完整分析流程:数据探索到报告生成
- 实用函数库:置信区间、效应量、功效分析
你可以根据实际需求选择和使用这些工具,并在此基础上进行扩展和优化。