Python里做回归分析要经历哪些关键步骤？从数据准备到模型优化怎么一步步来？

## Python回归分析逐步实现教程

### 一、回归分析核心概念解构

回归分析是一种统计方法，用于研究自变量与因变量之间的相关关系，并建立数学模型进行预测[ref_5]。主要分为线性回归（预测连续值）和逻辑回归（预测分类概率）两大类[ref_3]。

| 回归类型 | 因变量类型 | 应用场景 | 核心函数 |
|---------|-----------|---------|---------|
| 线性回归 | 连续数值 | 房价预测、销量预测 | $y = \beta_0 + \beta_1 x_1 + \cdots + \beta_n x_n$ |
| 逻辑回归 | 分类概率 | 用户流失预测、疾病诊断 | $P = \frac{1}{1+e^{-z}}$ |

### 二、实验环境准备与数据加载

```python
# 导入必需库
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix

# 设置中文显示和图形样式
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
sns.set_style("whitegrid")

# 示例数据生成（实际应用时替换为真实数据）
np.random.seed(42)
n_samples = 200

# 生成3个特征和1个目标变量
X = np.random.randn(n_samples, 3) * 10  # 特征矩阵

# 创建线性关系：y = 2*x1 + 1.5*x2 - 3*x3 + 噪声
y = 2*X[:, 0] + 1.5*X[:, 1] - 3*X[:, 2] + np.random.randn(n_samples) * 5

# 转换为DataFrame便于处理
df = pd.DataFrame(X, columns=['特征1', '特征2', '特征3'])
df['目标变量'] = y

print("数据概览：")
print(df.head())
print(f"\n数据形状：{df.shape}")
print(f"描述性统计：\n{df.describe()}")
```

### 三、数据预处理与探索性分析

```python
# 1. 检查缺失值
print("缺失值检查：")
print(df.isnull().sum())

# 2. 数据可视化 - 散点图矩阵
sns.pairplot(df)
plt.suptitle("特征与目标变量关系散点图矩阵", y=1.02)
plt.show()

# 3. 相关性分析
correlation_matrix = df.corr()
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title("特征相关性热力图")
plt.show()

# 4. 
# 数据标准化（对于多元回归很重要）
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
df_scaled = pd.DataFrame(X_scaled, columns=['特征1_标准化', '特征2_标准化', '特征3_标准化'])
df_scaled['目标变量'] = y
```

### 四、一元线性回归实现

```python
# 选取第一个特征进行一元线性回归演示
X_single = df[['特征1']].values
y_single = df['目标变量'].values

# 使用statsmodels进行详细分析（提供统计检验）
X_single_sm = sm.add_constant(X_single)  # 添加常数项
model_sm = sm.OLS(y_single, X_single_sm)
results_sm = model_sm.fit()

print("一元线性回归统计摘要：")
print(results_sm.summary())

# 使用sklearn进行预测
from sklearn.linear_model import LinearRegression
model_sk = LinearRegression()
model_sk.fit(X_single, y_single)
y_pred = model_sk.predict(X_single)

# 模型评估
mse = mean_squared_error(y_single, y_pred)
r2 = r2_score(y_single, y_pred)

print(f"\n模型性能指标：")
print(f"均方误差(MSE): {mse:.4f}")
print(f"决定系数(R²): {r2:.4f}")
print(f"截距: {model_sk.intercept_:.4f}")
print(f"系数: {model_sk.coef_[0]:.4f}")

# 可视化回归线
plt.figure(figsize=(10, 6))
plt.scatter(X_single, y_single, alpha=0.6, label='实际值')
plt.plot(X_single, y_pred, color='red', linewidth=2, label='回归线')
plt.xlabel('特征1')
plt.ylabel('目标变量')
plt.title('一元线性回归拟合效果')
plt.legend()
plt.grid(True)
plt.show()
```

### 五、多元线性回归完整实现

```python
# 1. 划分训练集和测试集（70%训练，30%测试）
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42
)

# 2. 使用statsmodels进行多元回归分析（提供详细统计信息）
X_train_sm = sm.add_constant(X_train)  # 添加常数项
model_multi_sm = sm.OLS(y_train, X_train_sm)
results_multi = model_multi_sm.fit()

print("多元线性回归统计摘要：")
print(results_multi.summary())

# 3. 使用sklearn进行预测
model_multi_sk = LinearRegression()
model_multi_sk.fit(X_train, y_train)

# 4. 模型预测
y_train_pred = model_multi_sk.predict(X_train)
y_test_pred = model_multi_sk.predict(X_test)

# 5. 
# 模型评估
train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("\n多元线性回归性能评估：")
print("训练集性能：")
print(f" 均方误差(MSE): {train_mse:.4f}")
print(f" 决定系数(R²): {train_r2:.4f}")
print("\n测试集性能：")
print(f" 均方误差(MSE): {test_mse:.4f}")
print(f" 决定系数(R²): {test_r2:.4f}")

print("\n回归系数：")
for i, (col, coef) in enumerate(zip(['特征1', '特征2', '特征3'], model_multi_sk.coef_)):
    print(f" {col}: {coef:.4f}")
print(f"截距: {model_multi_sk.intercept_:.4f}")

# 6. 残差分析（检验模型假设）
residuals = y_test - y_test_pred

plt.figure(figsize=(12, 4))

# 残差分布图
plt.subplot(1, 3, 1)
sns.histplot(residuals, kde=True)
plt.xlabel('残差')
plt.ylabel('频数')
plt.title('残差分布')

# 残差与预测值散点图
plt.subplot(1, 3, 2)
plt.scatter(y_test_pred, residuals, alpha=0.6)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('预测值')
plt.ylabel('残差')
plt.title('残差vs预测值')

# Q-Q图检验正态性
plt.subplot(1, 3, 3)
sm.qqplot(residuals, line='s', ax=plt.gca())
plt.title('Q-Q图')

plt.tight_layout()
plt.show()
```

### 六、逐步回归实现（变量选择）

逐步回归通过假设检验筛选对目标变量影响显著的特征[ref_6]，包含三种主要方法：

| 方法 | 原理 | 适用场景 |
|------|------|---------|
| 向前选择 | 从空模型开始，逐步加入最显著变量 | 特征较多时 |
| 向后剔除 | 从全模型开始，逐步剔除最不显著变量 | 特征较少时 |
| 双向选择 | 结合向前和向后，每步重新评估 | 复杂特征关系 |

```python
# 使用toad库实现逐步回归（需先安装：pip install toad）
try:
    import toad

    # 准备数据
    df_reg = pd.DataFrame(X_scaled, columns=['特征1', '特征2', '特征3'])
    df_reg['目标变量'] = y

    # 向前逐步回归
    print("向前逐步回归结果：")
    selected_data, remaining_vars = toad.selection.stepwise(
        df_reg,
        target='目标变量',
        direction='forward',
        criterion='aic',  # 使用AIC准则
        estimator='ols'
    )
    print(f"选择的特征: {list(selected_data.columns)}")

    # 也可以使用statsmodels手动实现
    def forward_selection(X, y, significance_level=0.05):
        """手动实现向前选择"""
        initial_features = []
        selected_features = list(initial_features)
        while len(selected_features) < X.shape[1]:
            remaining_features = list(set(X.columns) - set(selected_features))
            new_pval = pd.Series(index=remaining_features)
            for new_column in remaining_features:
                model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[selected_features + [new_column]]))).fit()
                new_pval[new_column] = model.pvalues[new_column]
            min_p_value = new_pval.min()
            if min_p_value < significance_level:
                best_feature = new_pval.idxmin()
                selected_features.append(best_feature)
            else:
                break
        return selected_features

    # 应用向前选择
    X_df = pd.DataFrame(X_scaled, columns=['特征1', '特征2', '特征3'])
    selected = forward_selection(X_df, y)
    print(f"手动向前选择特征: {selected}")

except ImportError:
    print("toad库未安装，使用statsmodels实现逐步回归")
    # 使用statsmodels的OLS进行变量选择
    X_with_const = sm.add_constant(pd.DataFrame(X_scaled, columns=['特征1', '特征2', '特征3']))
    model = sm.OLS(y, X_with_const).fit()
    print("全模型回归结果：")
    print(model.summary())

    # 根据p值筛选变量（p<0.05认为显著）
    significant_vars = model.pvalues[model.pvalues < 0.05].index.tolist()
    if 'const' in significant_vars:
        significant_vars.remove('const')
    print(f"\n显著特征(p<0.05): {significant_vars}")
```

### 七、逻辑回归实现（分类问题）

```python
# 生成分类数据示例
np.random.seed(42)
n_samples_class = 300
X_class = np.random.randn(n_samples_class, 2) * 2

# 创建分类边界：y = 1 if 2*x1 + x2 > 0 else 0
y_class = (2*X_class[:, 0] + X_class[:, 1] + np.random.randn(n_samples_class) > 0).astype(int)

# 划分数据集
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X_class, y_class, test_size=0.3, random_state=42
)

# 逻辑回归模型
logreg = LogisticRegression(random_state=42)
logreg.fit(X_train_clf, y_train_clf)

# 预测与评估
y_pred_clf = logreg.predict(X_test_clf)
y_pred_proba = logreg.predict_proba(X_test_clf)[:, 1]
accuracy = accuracy_score(y_test_clf, y_pred_clf)
conf_matrix = confusion_matrix(y_test_clf, y_pred_clf)

print("逻辑回归性能：")
print(f"准确率: {accuracy:.4f}")
print(f"混淆矩阵:\n{conf_matrix}")
print(f"系数: {logreg.coef_}")
print(f"截距: {logreg.intercept_}")

# 可视化决策边界
plt.figure(figsize=(10, 6))
xx, yy = np.meshgrid(
    np.linspace(X_class[:, 0].min()-1, X_class[:, 0].max()+1, 100),
    np.linspace(X_class[:, 1].min()-1, X_class[:, 1].max()+1, 100)
)
Z = logreg.predict(np.c_[xx.ravel(), 
yy.ravel()])
Z = Z.reshape(xx.shape)

plt.contourf(xx, yy, Z, alpha=0.3, cmap=plt.cm.coolwarm)
plt.scatter(X_class[:, 0], X_class[:, 1], c=y_class, edgecolors='k', cmap=plt.cm.coolwarm)
plt.xlabel('特征1')
plt.ylabel('特征2')
plt.title('逻辑回归决策边界')
plt.colorbar(label='类别')
plt.show()
```

### 八、模型优化与验证

```python
# 1. 交叉验证评估模型稳定性
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(
    LinearRegression(), X_scaled, y, cv=5, scoring='r2'
)
print(f"5折交叉验证R²得分: {cv_scores}")
print(f"平均R²: {cv_scores.mean():.4f} (±{cv_scores.std():.4f})")

# 2. 多项式回归（处理非线性关系）
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# 创建多项式特征（2次）
poly_features = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly_features.fit_transform(X_scaled[:, :2])  # 只使用前两个特征演示

# 多项式回归模型
poly_model = LinearRegression()
poly_model.fit(X_poly, y)
print(f"\n多项式回归R²: {poly_model.score(X_poly, y):.4f}")

# 3. 正则化（防止过拟合）
from sklearn.linear_model import Ridge, Lasso

# 岭回归(L2正则化)
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)
ridge_score = ridge.score(X_test, y_test)
print(f"岭回归测试集R²: {ridge_score:.4f}")

# LASSO回归(L1正则化，可用于特征选择)
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
lasso_score = lasso.score(X_test, y_test)
print(f"LASSO回归测试集R²: {lasso_score:.4f}")
print(f"LASSO选择的特征数: {np.sum(lasso.coef_ != 0)}")
```

### 九、完整实验流程总结

1. **数据准备阶段**：数据加载、清洗、探索性分析
2. **预处理阶段**：特征标准化、数据集划分
3. **模型构建阶段**：选择回归类型、训练模型
4. **变量选择阶段**：使用逐步回归筛选重要特征[ref_1]
5. **模型评估阶段**：计算性能指标、残差分析
6. 
**优化阶段**：交叉验证、正则化、多项式扩展

### 十、常见问题与解决方案

| 问题 | 症状 | 解决方案 |
|------|------|---------|
| 多重共线性 | 特征高度相关，系数不稳定 | 使用VIF检验、岭回归、主成分分析 |
| 异方差性 | 残差方差随预测值变化 | 数据变换、加权最小二乘法 |
| 非线性关系 | R²值低，残差模式明显 | 多项式回归、添加交互项 |
| 过拟合 | 训练集表现好，测试集差 | 正则化、交叉验证、减少特征 |

```python
# 多重共线性检测示例
from statsmodels.stats.outliers_influence import variance_inflation_factor

X_with_const = sm.add_constant(pd.DataFrame(X_scaled, columns=['特征1', '特征2', '特征3']))
vif_data = pd.DataFrame()
vif_data["特征"] = X_with_const.columns
vif_data["VIF"] = [variance_inflation_factor(X_with_const.values, i) for i in range(X_with_const.shape[1])]

print("方差膨胀因子(VIF)检测：")
print(vif_data)
# VIF>10表示存在严重多重共线性
```

通过以上完整实现，您可以系统掌握Python中回归分析的各个步骤。关键是根据具体问题选择合适的回归方法，并通过残差分析、交叉验证等手段确保模型质量[ref_2]。实际应用中需结合业务理解调整特征工程和模型参数。

创作声明：本文部分内容由AI辅助生成（AIGC），仅供参考

下一篇 Python里类名后面加括号，是继承父类还是创建对象？有什么区别？