Python统计工具案例如何实现?

wen python案例 3

本文目录导读:

  1. 基础统计分析工具
  2. 完整统计分析流程案例
  3. 实用统计函数库

我来介绍几个Python统计工具的实战案例,从基础到高级,包含完整的代码实现。

基础统计分析工具

描述性统计分析器

import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
class DescriptiveStatsAnalyzer:
    """Compute descriptive statistics for a numeric sample.

    The sample is copied into a NumPy array at construction time. Every
    method reads from that array and returns a plain dictionary keyed by the
    Chinese display labels that the demo script prints.
    """

    def __init__(self, data):
        """Store ``data`` as a NumPy array.

        Args:
            data: Any array-like of numbers.

        Raises:
            ValueError: If ``data`` contains no observations.
        """
        self.data = np.array(data)
        # Fail early with a clear message instead of letting each statistic
        # fail in its own way (ValueError, IndexError, ZeroDivisionError...).
        if self.data.size == 0:
            raise ValueError("data must contain at least one observation")

    def _mode(self):
        """Return the most frequent value (the smallest one on ties).

        Implemented with ``np.unique`` so the result does not depend on the
        return shape of ``scipy.stats.mode``, which changed between SciPy
        releases and made ``stats.mode(x)[0][0]`` fail on recent versions.
        """
        values, counts = np.unique(self.data, return_counts=True)
        # np.unique returns sorted values and argmax picks the first maximum,
        # so ties resolve to the smallest value.
        return values[np.argmax(counts)]

    def basic_stats(self):
        """Return the core summary statistics of the sample.

        Returns:
            dict: count, mean, median, mode, sample standard deviation and
            variance (``ddof=1``), minimum, maximum, range, skewness and
            excess kurtosis.
        """
        sample = self.data
        return {
            '样本数量': len(sample),
            '均值': np.mean(sample),
            '中位数': np.median(sample),
            '众数': self._mode(),
            '标准差': np.std(sample, ddof=1),
            '方差': np.var(sample, ddof=1),
            '最小值': np.min(sample),
            '最大值': np.max(sample),
            '范围': np.ptp(sample),
            '偏度': stats.skew(sample),
            '峰度': stats.kurtosis(sample)
        }

    def percentile_stats(self):
        """Return the 25th, 50th, 75th, 90th, 95th and 99th percentiles.

        Returns:
            dict: Maps ``'<p>%分位数'`` to the corresponding percentile value.
        """
        levels = [25, 50, 75, 90, 95, 99]
        # One vectorised call instead of one np.percentile call per level.
        values = np.percentile(self.data, levels)
        return {f'{p}%分位数': value for p, value in zip(levels, values)}

    def outlier_detection(self):
        """Detect outliers with the 1.5 * IQR rule.

        Returns:
            dict: Lower and upper fences, number and percentage of points
            outside the fences, and those points as a Python list.
        """
        q1, q3 = np.percentile(self.data, [25, 75])
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        is_outlier = (self.data < lower_bound) | (self.data > upper_bound)
        outliers = self.data[is_outlier]
        return {
            '下界': lower_bound,
            '上界': upper_bound,
            '异常值数量': len(outliers),
            '异常值比例': len(outliers) / len(self.data) * 100,
            '异常值': outliers.tolist()
        }
# 使用示例
if __name__ == "__main__":
    # Reproducible demo sample: 1000 normal draws followed by four planted
    # extreme values so the outlier report has something to show.
    np.random.seed(42)
    data = np.append(np.random.normal(50, 15, 1000), [150, -20, 200, -30])
    analyzer = DescriptiveStatsAnalyzer(data)

    # The two numeric summaries share the same two-decimal rendering.
    numeric_sections = (
        ("=== 基本统计量 ===", analyzer.basic_stats),
        ("\n=== 百分位数 ===", analyzer.percentile_stats),
    )
    for banner, compute in numeric_sections:
        print(banner)
        for label, number in compute().items():
            print(f"{label}: {number:.2f}")

    # The outlier report mixes numbers and a list, so print values verbatim.
    print("\n=== 异常值检测 ===")
    outlier_info = analyzer.outlier_detection()
    for label in outlier_info:
        print(f"{label}: {outlier_info[label]}")

假设检验工具

import scipy.stats as stats
import numpy as np
from typing import Tuple, Dict
class HypothesisTestTool:
    """Collection of common hypothesis tests that return labelled dicts.

    Every public method is a static method. Each returns a dictionary keyed
    by Chinese display labels and ends with a '结论' (conclusion) entry that
    is decided at the ``ALPHA`` significance level.
    """

    # Significance threshold shared by every conclusion string.
    ALPHA = 0.05

    @staticmethod
    def _conclude(p_value, significant: str, not_significant: str) -> str:
        """Pick the conclusion text according to ``p_value < ALPHA``."""
        if p_value < HypothesisTestTool.ALPHA:
            return significant
        return not_significant

    @staticmethod
    def t_test_one_sample(sample: np.ndarray,
                          pop_mean: float,
                          alternative: str = 'two-sided') -> Dict:
        """Run a one-sample t-test of ``sample`` against ``pop_mean``.

        Args:
            sample: Observations to test.
            pop_mean: Mean under the null hypothesis.
            alternative: Passed through to ``scipy.stats.ttest_1samp``.

        Returns:
            Dict with the test name, t statistic, p-value, sample mean,
            hypothesised mean and the conclusion.
        """
        t_stat, p_value = stats.ttest_1samp(sample, pop_mean, alternative=alternative)
        return {
            '检验方法': '单样本t检验',
            't统计量': t_stat,
            'p值': p_value,
            '样本均值': np.mean(sample),
            '原假设均值': pop_mean,
            # The original key was lost to HTML-entity garbling ('#39;').
            '结论': HypothesisTestTool._conclude(p_value, '拒绝原假设', '不能拒绝原假设')
        }

    @staticmethod
    def t_test_two_samples(sample1: np.ndarray,
                           sample2: np.ndarray,
                           equal_var: bool = True) -> Dict:
        """Run an independent two-sample t-test.

        Args:
            sample1: First group of observations.
            sample2: Second group of observations.
            equal_var: Passed through to ``scipy.stats.ttest_ind``.

        Returns:
            Dict with the test name, t statistic, p-value, both group means,
            their difference (sample1 - sample2) and the conclusion.
        """
        t_stat, p_value = stats.ttest_ind(sample1, sample2, equal_var=equal_var)
        mean1, mean2 = np.mean(sample1), np.mean(sample2)
        return {
            '检验方法': '独立样本t检验',
            't统计量': t_stat,
            'p值': p_value,
            '样本1均值': mean1,
            '样本2均值': mean2,
            '均值差': mean1 - mean2,
            '结论': HypothesisTestTool._conclude(p_value, '两组均值有显著差异', '两组均值无显著差异')
        }

    @staticmethod
    def chi_square_test(observed: np.ndarray,
                        expected: np.ndarray = None) -> Dict:
        """Run a chi-square test.

        Args:
            observed: Observed frequencies.
            expected: Expected frequencies. When omitted, ``observed`` is
                passed to ``scipy.stats.chi2_contingency``; otherwise
                ``scipy.stats.chisquare`` is used.

        Returns:
            Dict with the test name, chi-square statistic, degrees of
            freedom, p-value and the conclusion.
        """
        if expected is None:
            chi2_stat, p_value, dof, expected = stats.chi2_contingency(observed)
        else:
            chi2_stat, p_value = stats.chisquare(observed, expected)
            # Number of categories minus one.
            dof = len(observed) - 1
        return {
            '检验方法': '卡方检验',
            '卡方统计量': chi2_stat,
            '自由度': dof,
            'p值': p_value,
            '结论': HypothesisTestTool._conclude(p_value, '拒绝原假设', '不能拒绝原假设')
        }

    @staticmethod
    def anova_test(*groups: np.ndarray) -> Dict:
        """Run a one-way ANOVA across the given groups.

        Args:
            *groups: One array of observations per group.

        Returns:
            Dict with the test name, F statistic, p-value, the list of group
            means and the conclusion.
        """
        f_stat, p_value = stats.f_oneway(*groups)
        return {
            '检验方法': '单因素方差分析',
            'F统计量': f_stat,
            'p值': p_value,
            '各组均值': [np.mean(group) for group in groups],
            '结论': HypothesisTestTool._conclude(p_value, '组间存在显著差异', '组间无显著差异')
        }
# 使用示例
if __name__ == "__main__":
    np.random.seed(42)

    # One-sample case: 100 draws centred on 52, tested against a mean of 50.
    sample = np.random.normal(52, 10, 100)
    result = HypothesisTestTool.t_test_one_sample(sample, 50)
    print("=== 单样本t检验 ===")
    for field in result:
        print(f"{field}: {result[field]}")

    # Two-sample case: two groups of 100 draws centred on 50 and 55.
    sample1 = np.random.normal(50, 10, 100)
    sample2 = np.random.normal(55, 10, 100)
    result = HypothesisTestTool.t_test_two_samples(sample1, sample2)
    print("\n=== 双样本t检验 ===")
    for field in result:
        print(f"{field}: {result[field]}")

相关性分析工具

import pandas as pd
import numpy as np
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
class CorrelationAnalyzer:
    """Correlation analysis helpers for the columns of a DataFrame.

    Result dictionaries use Chinese display labels as keys. Annotations use
    the built-in ``dict`` / ``tuple`` so the snippet does not depend on
    ``typing`` names that it never imports.
    """

    def __init__(self, dataframe: pd.DataFrame):
        """Keep a reference to ``dataframe`` (it is not copied)."""
        self.df = dataframe

    def _complete_pairs(self, col1: str, col2: str):
        """Return both columns restricted to rows where neither is missing."""
        present = self.df[col1].notna() & self.df[col2].notna()
        return self.df.loc[present, col1], self.df.loc[present, col2]

    def _summarize(self, corr, p_value) -> dict:
        """Build the result dictionary shared by both correlation methods."""
        return {
            '相关系数': corr,
            'p值': p_value,
            '相关性强度': self._interpret_correlation(corr),
            '显著性': '显著' if p_value < 0.05 else '不显著'
        }

    def pearson_correlation(self, col1: str, col2: str) -> dict:
        """Compute the Pearson correlation between two columns.

        Rows with a missing value in either column are ignored.

        Returns:
            dict: coefficient, p-value, strength label and significance
            label (5% level).
        """
        x, y = self._complete_pairs(col1, col2)
        corr, p_value = stats.pearsonr(x, y)
        return self._summarize(corr, p_value)

    def spearman_correlation(self, col1: str, col2: str) -> dict:
        """Compute the Spearman rank correlation between two columns.

        Rows with a missing value in either column are ignored.

        Returns:
            dict: coefficient, p-value, strength label and significance
            label (5% level).
        """
        x, y = self._complete_pairs(col1, col2)
        corr, p_value = stats.spearmanr(x, y)
        return self._summarize(corr, p_value)

    @staticmethod
    def _interpret_correlation(corr: float) -> str:
        """Map a correlation coefficient to a strength label.

        The label depends only on ``abs(corr)``; a NaN coefficient falls
        through every comparison and gets the weakest label.
        """
        abs_corr = abs(corr)
        # Thresholds are checked from strongest to weakest.
        bands = (
            (0.8, "极强相关"),
            (0.6, "强相关"),
            (0.4, "中等相关"),
            (0.2, "弱相关"),
        )
        for floor, label in bands:
            if abs_corr >= floor:
                return label
        return "极弱或无关"

    def correlation_matrix(self) -> pd.DataFrame:
        """Return the correlation matrix of the numeric columns.

        Non-numeric columns are excluded explicitly because
        ``DataFrame.corr()`` raises on them in recent pandas releases.
        """
        return self.df.select_dtypes(include=[np.number]).corr()

    def plot_correlation_matrix(self, figsize: tuple = (10, 8)):
        """Draw the correlation matrix as an annotated heatmap.

        Args:
            figsize: Figure size passed to ``plt.figure``.

        Returns:
            The matplotlib figure that was drawn on.
        """
        plt.figure(figsize=figsize)
        sns.heatmap(self.correlation_matrix(),
                    annot=True,
                    cmap='coolwarm',
                    center=0,
                    fmt='.2f')
        plt.title('相关矩阵热力图')
        plt.tight_layout()
        return plt.gcf()
# 使用示例
if __name__ == "__main__":
    # Synthetic survey table; the seed keeps the demo output reproducible.
    np.random.seed(42)
    data = {
        '年龄': np.random.randint(20, 60, 100),
        '收入': np.random.normal(50000, 15000, 100),
        '教育年限': np.random.randint(8, 22, 100),
        '工作经验': np.random.randint(0, 35, 100)
    }
    df = pd.DataFrame(data)

    # Make income depend on years of education, plus some extra noise.
    education_effect = df['教育年限'] * 2000
    income_noise = np.random.normal(0, 5000, 100)
    df['收入'] = df['收入'] + education_effect + income_noise

    analyzer = CorrelationAnalyzer(df)

    print("=== 皮尔逊相关分析 ===")
    result = analyzer.pearson_correlation('教育年限', '收入')
    for field in result:
        print(f"{field}: {result[field]}")

    print("\n=== 相关矩阵 ===")
    print(analyzer.correlation_matrix())

回归分析工具

import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import matplotlib.pyplot as plt
class RegressionAnalyzer:
    """Linear-regression helper built on statsmodels and scikit-learn.

    ``X`` is always stored as a 2-D array of shape (n_samples, n_features);
    a 1-D input is treated as a single feature.
    """

    def __init__(self, X, y):
        """Store the design matrix and the response.

        Args:
            X: Array-like of shape (n_samples,) or (n_samples, n_features).
            y: Array-like response values.
        """
        self.X = np.array(X)
        self.y = np.array(y)
        if self.X.ndim == 1:
            self.X = self.X.reshape(-1, 1)

    def linear_regression(self, add_constant: bool = True) -> dict:
        """Fit an OLS model with statsmodels and return its key statistics.

        Args:
            add_constant: When true, an intercept column is added to ``X``
                before fitting.

        Returns:
            dict: model summary object, R², adjusted R², coefficients,
            coefficient p-values, AIC, BIC, F statistic and its p-value.
        """
        design = sm.add_constant(self.X) if add_constant else self.X
        model = sm.OLS(self.y, design).fit()
        return {
            '模型摘要': model.summary(),
            'R²': model.rsquared,
            '调整R²': model.rsquared_adj,
            '系数': model.params,
            'p值': model.pvalues,
            'AIC': model.aic,
            'BIC': model.bic,
            'F统计量': model.fvalue,
            'F_p值': model.f_pvalue
        }

    def predict_and_evaluate(self, X_test, y_test) -> dict:
        """Fit on the stored data, predict ``X_test`` and score the result.

        Args:
            X_test: Test features. A 1-D array is accepted when the model
                has a single feature, mirroring the constructor.
            y_test: True responses for ``X_test``.

        Returns:
            dict: R² score, MSE, RMSE, predictions and the ``y_test`` that
            was passed in.
        """
        X_eval = np.asarray(X_test)
        # Only reshape in the unambiguous single-feature case; with several
        # features a flat array is left for scikit-learn to reject.
        if X_eval.ndim == 1 and self.X.shape[1] == 1:
            X_eval = X_eval.reshape(-1, 1)
        model = LinearRegression()
        model.fit(self.X, self.y)
        y_pred = model.predict(X_eval)
        mse = mean_squared_error(y_test, y_pred)
        return {
            'R²分数': r2_score(y_test, y_pred),
            '均方误差': mse,
            '均方根误差': np.sqrt(mse),
            '预测值': y_pred,
            '实际值': y_test
        }

    def plot_regression(self, x_label: str = 'X', y_label: str = 'y'):
        """Draw the data points and the fitted line for a single feature.

        Args:
            x_label: Label for the horizontal axis.
            y_label: Label for the vertical axis.

        Returns:
            The matplotlib figure that was drawn on.

        Raises:
            ValueError: If the stored ``X`` has more than one feature; a
                2-D scatter with one regression line cannot represent it.
        """
        # Checked before any figure is created so no empty figure is left
        # behind when the call is rejected.
        if self.X.shape[1] != 1:
            raise ValueError(
                "plot_regression supports exactly one feature, "
                f"got {self.X.shape[1]}"
            )
        plt.figure(figsize=(10, 6))
        # Scatter of the observed data.
        plt.scatter(self.X[:, 0], self.y, alpha=0.7, label='实际数据')
        # Fitted line evaluated on an evenly spaced grid over the X range.
        model = LinearRegression()
        model.fit(self.X, self.y)
        x_range = np.linspace(self.X.min(), self.X.max(), 100).reshape(-1, 1)
        y_pred = model.predict(x_range)
        plt.plot(x_range, y_pred, 'r-', label='回归线', linewidth=2)
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.title(f'线性回归分析 (R² = {model.score(self.X, self.y):.3f})')
        plt.legend()
        plt.grid(True, alpha=0.3)
        return plt.gcf()
# 使用示例
if __name__ == "__main__":
    # Two-feature synthetic problem: y = 3 + 2*x0 + 1.5*x1 + noise.
    np.random.seed(42)
    X = np.random.randn(100, 2)
    signal = 3 + 2 * X[:, 0] + 1.5 * X[:, 1]
    noise = np.random.randn(100) * 0.5
    y = signal + noise

    analyzer = RegressionAnalyzer(X, y)

    print("=== 线性回归分析 ===")
    result = analyzer.linear_regression()
    r_squared = result['R²']
    adjusted_r_squared = result['调整R²']
    print(f"R²: {r_squared:.4f}")
    print(f"调整R²: {adjusted_r_squared:.4f}")
    coefficients = result['系数']
    p_values = result['p值']
    print(f"系数:\n{coefficients}")
    print(f"p值:\n{p_values}")

数据可视化统计工具

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Tuple
class StatisticalVisualizer:
    """Statistical plotting helper that remembers the figures it creates.

    Every plotting method returns the new figure and also appends it to
    ``self.figures``.
    """

    def __init__(self, style: str = 'seaborn-v0_8-darkgrid'):
        """Apply a matplotlib style and start with an empty figure list.

        Args:
            style: Style name passed to ``plt.style.use``.
        """
        plt.style.use(style)
        self.figures = []

    def distribution_plot(self, data, bins: int = 30,
                          title: str = '分布图'):
        """Draw a histogram next to a histogram with a KDE overlay.

        Args:
            data: Values to plot.
            bins: Number of histogram bins used in both panels.
            title: Prefix for both panel titles.

        Returns:
            The created figure.
        """
        fig, axes = plt.subplots(1, 2, figsize=(12, 5))
        left, right = axes
        # Left panel: plain normalised histogram.
        left.hist(data, bins=bins, density=True, alpha=0.7,
                  color='skyblue', edgecolor='black')
        left.set_title(f'{title} - 直方图')
        left.set_xlabel('值')
        left.set_ylabel('频率')
        # Right panel: pass the target axes explicitly instead of relying on
        # whichever axes happens to be current, and plot densities so the
        # data matches the density label on the y axis.
        sns.histplot(data, kde=True, bins=bins, stat='density',
                     color='salmon', alpha=0.6, ax=right)
        right.set_title(f'{title} - 核密度估计')
        right.set_xlabel('值')
        right.set_ylabel('密度')
        plt.tight_layout()
        self.figures.append(fig)
        return fig

    def boxplot_comparison(self, data_dict: dict,
                           title: str = '箱线图比较'):
        """Draw side-by-side box plots, one per entry of ``data_dict``.

        Args:
            data_dict: Maps a group label to its values.
            title: Axes title.

        Returns:
            The created figure.
        """
        fig, ax = plt.subplots(figsize=(10, 6))
        labels = list(data_dict.keys())
        data_list = list(data_dict.values())
        # Tick labels are set separately because the ``labels=`` keyword of
        # boxplot is deprecated in recent matplotlib releases.
        bp = ax.boxplot(data_list, patch_artist=True)
        ax.set_xticklabels(labels)
        # Cycle through the palette so groups beyond the fourth are coloured
        # too (zip used to stop at the end of the palette).
        colors = ['lightblue', 'lightgreen', 'salmon', 'lightyellow']
        for index, patch in enumerate(bp['boxes']):
            patch.set_facecolor(colors[index % len(colors)])
        # Annotate each box with its mean and median just above the plot.
        y_top = ax.get_ylim()[1]
        for position, values in enumerate(data_list, start=1):
            stats_text = f'均值: {np.mean(values):.2f}\n中位数: {np.median(values):.2f}'
            ax.text(position, y_top, stats_text,
                    ha='center', va='bottom', fontsize=8)
        ax.set_title(title)
        ax.set_ylabel('值')
        ax.grid(True, alpha=0.3)
        plt.tight_layout()
        self.figures.append(fig)
        return fig

    def qq_plot(self, data, dist='norm',
                title: str = 'Q-Q图'):
        """Draw a Q-Q plot and annotate it with the Shapiro-Wilk p-value.

        Args:
            data: Sample to compare against the distribution.
            dist: Distribution passed to ``scipy.stats.probplot``.
            title: Axes title.

        Returns:
            The created figure.
        """
        from scipy import stats
        fig, ax = plt.subplots(figsize=(8, 8))
        # probplot draws both the sample quantiles and the reference line.
        stats.probplot(data, dist=dist, plot=ax)
        ax.set_title(title)
        ax.set_xlabel('理论分位数')
        ax.set_ylabel('样本分位数')
        ax.grid(True, alpha=0.3)
        # Only the p-value of the Shapiro-Wilk test is shown.
        _, shapiro_p = stats.shapiro(data)
        ax.text(0.05, 0.95, f'Shapiro-Wilk p = {shapiro_p:.4f}',
                transform=ax.transAxes, fontsize=10,
                bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
        plt.tight_layout()
        self.figures.append(fig)
        return fig
# 使用示例
if __name__ == "__main__":
    # Three reproducible samples with clearly different shapes.
    np.random.seed(42)
    normal_data = np.random.normal(50, 15, 500)
    skewed_data = np.random.exponential(10, 500)
    uniform_data = np.random.uniform(0, 100, 500)

    visualizer = StatisticalVisualizer()

    # Histogram and KDE of the normal sample.
    visualizer.distribution_plot(normal_data, title='正态分布数据')

    # Box plots comparing the three samples side by side.
    group_names = ('正态分布', '偏态分布', '均匀分布')
    group_values = (normal_data, skewed_data, uniform_data)
    data_dict = dict(zip(group_names, group_values))
    visualizer.boxplot_comparison(data_dict, title='不同分布比较')

    # Q-Q plot as a visual normality check.
    visualizer.qq_plot(normal_data, title='正态性检验Q-Q图')

    plt.show()

完整统计分析流程案例

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
class CompleteStatisticalAnalysis:
    """End-to-end statistical workflow over a DataFrame.

    Each stage stores its output in ``self.results`` under its own key
    ('basic_info', 'descriptive', 'skew_kurt', 'hypothesis_tests',
    'correlation', 'regression'); ``generate_report`` renders whatever
    stages have been run so far.
    """

    def __init__(self, data: pd.DataFrame, target_col: str = None):
        """Keep references to the data and the optional target column.

        Args:
            data: Table to analyse (not copied).
            target_col: Name of the target column, required only for
                regression and used for group t-tests when binary.
        """
        self.data = data
        self.target_col = target_col
        self.results = {}

    @staticmethod
    def _print_banner(heading: str) -> None:
        """Print a stage heading framed by two separator lines."""
        print("=" * 50)
        print(heading)
        print("=" * 50)

    @staticmethod
    def _format_metric(value, spec: str = ".4f") -> str:
        """Format a numeric metric; pass placeholders such as 'N/A' through."""
        if isinstance(value, (int, float, np.integer, np.floating)):
            return format(value, spec)
        return str(value)

    def _numeric_columns(self):
        """Return the labels of all numeric columns, in table order."""
        return self.data.select_dtypes(include=[np.number]).columns

    def data_exploration(self) -> dict:
        """Collect shape, dtypes, missing-value counts and summary statistics.

        Returns:
            dict: The shared ``self.results`` dictionary (not a copy).
        """
        self._print_banner("数据探索")
        missing = self.data.isnull().sum()
        self.results['basic_info'] = {
            'shape': self.data.shape,
            'dtypes': self.data.dtypes,
            'missing_values': missing,
            'missing_percentage': missing / len(self.data) * 100
        }
        self.results['descriptive'] = self.data.describe()
        # Skewness and excess kurtosis per numeric column, ignoring NaN.
        skew_kurt = {}
        for col in self._numeric_columns():
            values = self.data[col].dropna()
            skew_kurt[col] = {
                'skewness': stats.skew(values),
                'kurtosis': stats.kurtosis(values)
            }
        self.results['skew_kurt'] = skew_kurt
        return self.results

    def hypothesis_testing(self, alpha: float = 0.05) -> dict:
        """Run normality tests and, for a binary target, group t-tests.

        Args:
            alpha: Significance level for both kinds of test.

        Returns:
            dict: 'normality' maps each tested numeric column to its
            Shapiro-Wilk result; 't_test_<col>' entries are added when the
            target column has exactly two distinct non-missing values.
        """
        self._print_banner("假设检验")
        test_results = {}
        numeric_cols = self._numeric_columns()

        normality_tests = {}
        for col in numeric_cols:
            values = self.data[col].dropna()
            # Shapiro-Wilk needs at least three observations; skip shorter
            # columns instead of aborting the whole stage.
            if len(values) < 3:
                continue
            _, p_value = stats.shapiro(values)
            normality_tests[col] = {
                'test': 'Shapiro-Wilk',
                'p_value': p_value,
                'is_normal': p_value > alpha
            }
        test_results['normality'] = normality_tests

        # Group comparison is only implemented for a two-level target.
        if self.target_col and self.target_col in self.data.columns:
            target = self.data[self.target_col]
            # Drop NaN first so a missing value is never picked as a level.
            levels = target.dropna().unique()
            if len(levels) == 2:
                group1 = self.data[target == levels[0]]
                group2 = self.data[target == levels[1]]
                for col in numeric_cols:
                    if col == self.target_col:
                        continue
                    t_stat, p_value = stats.ttest_ind(
                        group1[col].dropna(),
                        group2[col].dropna()
                    )
                    test_results[f't_test_{col}'] = {
                        't_statistic': t_stat,
                        'p_value': p_value,
                        'significant': p_value < alpha
                    }
        self.results['hypothesis_tests'] = test_results
        return test_results

    def correlation_analysis(self, method: str = 'pearson',
                             threshold: float = 0.7) -> pd.DataFrame:
        """Compute the correlation matrix and list strongly correlated pairs.

        Args:
            method: Correlation method passed to ``DataFrame.corr``.
            threshold: A pair is reported when the absolute correlation is
                strictly greater than this value.

        Returns:
            The correlation matrix of the numeric columns.
        """
        self._print_banner("相关性分析")
        numeric_data = self.data.select_dtypes(include=[np.number])
        corr_matrix = numeric_data.corr(method=method)
        names = corr_matrix.columns
        high_corr = []
        # Visit each unordered pair once (upper triangle of the matrix).
        for i, first in enumerate(names):
            for j in range(i + 1, len(names)):
                value = corr_matrix.iloc[i, j]
                if abs(value) > threshold:
                    high_corr.append({
                        'var1': first,
                        'var2': names[j],
                        'correlation': value
                    })
        self.results['correlation'] = {
            'matrix': corr_matrix,
            'high_correlations': high_corr
        }
        return corr_matrix

    def regression_modeling(self, features: list = None) -> dict:
        """Fit a standardised linear regression on an 80/20 split.

        Args:
            features: Column names to use as predictors. Defaults to every
                numeric column except the target.

        Returns:
            dict: train/test R² and RMSE, coefficients per feature, the
            intercept and a coefficient table sorted by absolute value.

        Raises:
            ValueError: If no target column was configured.
        """
        self._print_banner("回归建模")
        if not self.target_col:
            raise ValueError("需要指定目标变量")
        if features is None:
            # Select by numeric kind rather than by the dtype names
            # 'int64'/'float64', which silently drops int32/float32 columns.
            features = [col for col in self._numeric_columns()
                        if col != self.target_col]
        X = self.data[features]
        y = self.data[self.target_col]
        # Mean-impute missing values in predictors and target.
        X = X.fillna(X.mean())
        y = y.fillna(y.mean())
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        # The scaler is fitted on the training part only.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        model = LinearRegression()
        model.fit(X_train_scaled, y_train)
        y_pred_train = model.predict(X_train_scaled)
        y_pred_test = model.predict(X_test_scaled)
        model_results = {
            'train_r2': r2_score(y_train, y_pred_train),
            'test_r2': r2_score(y_test, y_pred_test),
            'train_rmse': np.sqrt(mean_squared_error(y_train, y_pred_train)),
            'test_rmse': np.sqrt(mean_squared_error(y_test, y_pred_test)),
            'coefficients': dict(zip(features, model.coef_)),
            'intercept': model.intercept_,
            'feature_importance': pd.DataFrame({
                'feature': features,
                'coefficient': model.coef_
            }).sort_values('coefficient', key=abs, ascending=False)
        }
        self.results['regression'] = model_results
        return model_results

    def generate_report(self) -> str:
        """Render the stored results as a plain-text report.

        Stages that have not been run are shown as 'N/A' or omitted; the
        method never requires a particular stage to have run first.

        Returns:
            str: The report, one section per available result.
        """
        report = ["=" * 60, "统计分析报告", "=" * 60, ""]

        basic_info = self.results.get('basic_info', {})
        missing = basic_info.get('missing_values')
        # Only call .sum() when the per-column counts actually exist.
        missing_total = 'N/A' if missing is None else missing.sum()
        report.append("1. 数据基本信息")
        report.append(f"   数据形状: {basic_info.get('shape', 'N/A')}")
        report.append(f"   缺失值数量: {missing_total}")
        report.append("")

        report.append("2. 描述性统计")
        report.append(str(self.results.get('descriptive', pd.DataFrame())))
        report.append("")

        # No method of this class writes the 'outliers' key; the section is
        # rendered only if a caller stored it in ``results``.
        if 'outliers' in self.results:
            outliers = self.results['outliers']
            report.append("3. 异常值检测结果")
            report.append(f"   异常值数量: {outliers['count']}")
            report.append(f"   异常值比例: {outliers['percentage']:.2f}%")
            report.append("")

        if 'hypothesis_tests' in self.results:
            report.append("4. 假设检验结果")
            norm_results = self.results['hypothesis_tests'].get('normality', {})
            for col, result in norm_results.items():
                verdict = '正态分布' if result['is_normal'] else '非正态分布'
                report.append(f"   {col}: {verdict}")
            report.append("")

        if 'regression' in self.results:
            reg = self.results['regression']
            fmt = self._format_metric
            report.append("5. 回归分析结果")
            report.append(f"   训练集 R²: {fmt(reg.get('train_r2', 'N/A'))}")
            report.append(f"   测试集 R²: {fmt(reg.get('test_r2', 'N/A'))}")
            report.append(f"   训练集 RMSE: {fmt(reg.get('train_rmse', 'N/A'))}")
            report.append(f"   测试集 RMSE: {fmt(reg.get('test_rmse', 'N/A'))}")
            report.append("")
            report.append("   重要特征(按系数绝对值排序):")
            if 'feature_importance' in reg:
                report.append(str(reg['feature_importance'].head()))
        return "\n".join(report)
# 使用示例
if __name__ == "__main__":
    # Synthetic customer table; the seed keeps the demo reproducible.
    np.random.seed(42)
    n_samples = 1000
    data = pd.DataFrame({
        'age': np.random.randint(18, 80, n_samples),
        'income': np.random.normal(50000, 20000, n_samples),
        'education': np.random.randint(8, 22, n_samples),
        'experience': np.random.randint(0, 40, n_samples),
        'spending': np.random.normal(1000, 500, n_samples)
    })

    # Income grows with education and experience; spending follows income.
    education_bonus = 2000 * data['education']
    experience_bonus = 1000 * data['experience']
    data['income'] = data['income'] + education_bonus + experience_bonus
    income_share = 0.3 * data['income']
    spending_noise = np.random.normal(0, 100, n_samples)
    data['spending'] = data['spending'] + income_share + spending_noise

    analyzer = CompleteStatisticalAnalysis(data, target_col='spending')

    # Run the stages in workflow order; each one stores its own results.
    pipeline = (
        analyzer.data_exploration,
        analyzer.hypothesis_testing,
        analyzer.correlation_analysis,
        analyzer.regression_modeling,
    )
    for stage in pipeline:
        stage()

    report = analyzer.generate_report()
    print(report)

实用统计函数库

import numpy as np
import pandas as pd
from typing import List, Tuple, Dict, Union
import warnings
# Suppress every warning category for the rest of the process so the demo
# output stays uncluttered.
# NOTE(review): this also hides deprecation and numerical warnings raised by
# any other code in the same process — confirm that is intended.
warnings.filterwarnings('ignore')
class StatisticsUtils:
    """Stateless statistical helper functions, exposed as static methods."""

    @staticmethod
    def calculate_confidence_interval(data: np.ndarray,
                                      confidence: float = 0.95) -> Tuple[float, float]:
        """Return the t-based confidence interval for the mean of ``data``.

        Args:
            data: Sample observations.
            confidence: Confidence level, e.g. 0.95.

        Returns:
            ``(lower, upper)`` bounds around the sample mean.
        """
        import scipy.stats as stats
        n = len(data)
        mean = np.mean(data)
        # Half-width = standard error times the two-sided t quantile.
        half_width = stats.sem(data) * stats.t.ppf((1 + confidence) / 2., n - 1)
        return (mean - half_width, mean + half_width)

    @staticmethod
    def effect_size_cohens_d(sample1: np.ndarray,
                             sample2: np.ndarray) -> float:
        """Return Cohen's d for two independent samples.

        The difference of means (sample1 - sample2) is divided by the pooled
        standard deviation computed from the unbiased sample variances.
        """
        n1, n2 = len(sample1), len(sample2)
        var1, var2 = np.var(sample1, ddof=1), np.var(sample2, ddof=1)
        pooled_sd = np.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2))
        return (np.mean(sample1) - np.mean(sample2)) / pooled_sd

    @staticmethod
    def bootstrap_ci(data: np.ndarray,
                     stat_function: callable = np.mean,
                     n_bootstrap: int = 1000,
                     ci: float = 0.95) -> Dict:
        """Estimate a percentile bootstrap confidence interval.

        Args:
            data: Sample to resample from (with replacement).
            stat_function: Statistic evaluated on every resample.
            n_bootstrap: Number of resamples.
            ci: Confidence level of the interval.

        Returns:
            Dict with the statistic on the original data, the mean and
            standard deviation of the bootstrap statistics, and the lower
            and upper percentile bounds.
        """
        values = np.asarray(data)
        # A private generator seeded with 42 yields the same resamples as
        # seeding the global generator would, but leaves the caller's global
        # random state untouched.
        rng = np.random.RandomState(42)
        size = len(values)
        boot_stats = np.array([
            stat_function(rng.choice(values, size, replace=True))
            for _ in range(n_bootstrap)
        ])
        lower_percentile = (1 - ci) / 2 * 100
        upper_percentile = (1 + ci) / 2 * 100
        return {
            'original_stat': stat_function(values),
            'bootstrap_mean': np.mean(boot_stats),
            'bootstrap_se': np.std(boot_stats),
            'ci_lower': np.percentile(boot_stats, lower_percentile),
            'ci_upper': np.percentile(boot_stats, upper_percentile)
        }

    @staticmethod
    def power_analysis(effect_size: float,
                       alpha: float = 0.05,
                       n_samples: int = None,
                       power: float = 0.8) -> Dict:
        """Solve for sample size or power of a two-sided independent t-test.

        Args:
            effect_size: Standardised effect size.
            alpha: Significance level.
            n_samples: Observations per group. When omitted, the required
                size for ``power`` is computed; otherwise the achieved power
                is computed and ``power`` is ignored.
            power: Target power, used only when ``n_samples`` is omitted.

        Returns:
            Dict containing either 'required_n_per_group' or 'power', plus
            the inputs that were used.
        """
        from statsmodels.stats.power import TTestIndPower
        analysis = TTestIndPower()
        if n_samples is None:
            required = analysis.solve_power(
                effect_size=effect_size,
                alpha=alpha,
                power=power,
                alternative='two-sided'
            )
            return {
                # Round up: a fractional participant is not possible.
                'required_n_per_group': int(np.ceil(required)),
                'effect_size': effect_size,
                'alpha': alpha,
                'power': power
            }
        achieved = analysis.solve_power(
            effect_size=effect_size,
            nobs1=n_samples,
            alpha=alpha,
            alternative='two-sided'
        )
        return {
            'power': achieved,
            'effect_size': effect_size,
            'alpha': alpha,
            'n_per_group': n_samples
        }

    @staticmethod
    def outlier_removal_iqr(data: np.ndarray,
                            multiplier: float = 1.5) -> Dict:
        """Split ``data`` into inliers and outliers using IQR fences.

        Args:
            data: Array-like of numbers (plain lists are accepted).
            multiplier: Fence distance in units of the IQR.

        Returns:
            Dict with the input as an array, the inliers, the outliers,
            their count and percentage, the ``(lower, upper)`` fences and
            the boolean inlier mask.
        """
        # Convert once so boolean masking also works for plain sequences.
        values = np.asarray(data)
        q1, q3 = np.percentile(values, [25, 75])
        iqr = q3 - q1
        lower_bound = q1 - multiplier * iqr
        upper_bound = q3 + multiplier * iqr
        mask = (values >= lower_bound) & (values <= upper_bound)
        n_outliers = int(np.sum(~mask))
        return {
            'original_data': values,
            'cleaned_data': values[mask],
            'outliers': values[~mask],
            'n_outliers': n_outliers,
            'outlier_percentage': n_outliers / len(values) * 100,
            'bounds': (lower_bound, upper_bound),
            'mask': mask
        }
# 使用示例
if __name__ == "__main__":
    # Two reproducible samples whose true means differ by 5.
    np.random.seed(42)
    data1 = np.random.normal(50, 10, 100)
    data2 = np.random.normal(55, 10, 100)

    # t-based confidence interval for the mean of the first sample.
    ci = StatisticsUtils.calculate_confidence_interval(data1)
    ci_low, ci_high = ci[0], ci[1]
    print(f"95%置信区间: ({ci_low:.2f}, {ci_high:.2f})")

    # Standardised mean difference between the two samples.
    d = StatisticsUtils.effect_size_cohens_d(data1, data2)
    print(f"Cohen's d效应量: {d:.3f}")

    # Percentile bootstrap interval for the mean.
    bootstrap_result = StatisticsUtils.bootstrap_ci(data1)
    boot_low = bootstrap_result['ci_lower']
    boot_high = bootstrap_result['ci_upper']
    print(f"自助法置信区间: ({boot_low:.2f}, {boot_high:.2f})")

    # Sample size needed per group to detect a medium effect.
    power_result = StatisticsUtils.power_analysis(effect_size=0.5)
    required_n = power_result['required_n_per_group']
    print(f"所需样本量: {required_n}")

    # Count of points outside the IQR fences.
    outlier_result = StatisticsUtils.outlier_removal_iqr(data1)
    outlier_count = outlier_result['n_outliers']
    print(f"异常值数量: {outlier_count}")

这些统计工具案例涵盖了:

  1. 描述性统计:基本统计量、百分位数、异常值检测
  2. 假设检验:t检验、卡方检验、ANOVA
  3. 相关性分析:皮尔逊、斯皮尔曼相关系数
  4. 回归分析:线性回归、模型评估
  5. 数据可视化:分布图、箱线图、Q-Q图
  6. 完整分析流程:数据探索到报告生成
  7. 实用函数库:置信区间、效应量、功效分析

你可以根据实际需求选择和使用这些工具,并在此基础上进行扩展和优化。

标签: 数据统计 自动化分析

抱歉,评论功能暂时关闭!