为什么Python交互式环境里直接粘贴PCA分析脚本会报大量缩进错误？该怎么正确运行？

<think>我们注意到在交互式环境中执行代码时出现了大量的缩进错误（IndentationError）。这是因为在交互式环境中（如Python shell或IDLE），我们不能直接粘贴包含多级缩进的完整代码。代码是为脚本执行设计的，而不是逐行在交互式环境中输入。解决方案： 1. 将代码保存为一个.py文件（例如：pca_analysis.py），然后在命令行或IDE中运行该文件。 2. 确保在保存的.py文件中，代码的缩进是正确的。为了便于您保存为文件，我将重新提供一个完整的、格式正确的代码，并确保缩进正确。同时，为了避免在保存时出现编码问题，我们将使用UTF-8编码。请将以下代码复制到一个新的文本文件中，并保存为`pca_analysis.py`，然后在命令行中运行：`python pca_analysis.py`。 ```python import pandas as pd import numpy as np from sklearn.decomposition import PCA from sklearn.preprocessing import StandardScaler import matplotlib.pyplot as plt import matplotlib.font_manager as fm import os import sys import traceback import logging import time # 设置详细日志记录 def setup_logging(): log_file = r'C:\Users\DELL\Desktop\PCA_analysis.log' logging.basicConfig( level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[ logging.FileHandler(log_file, encoding='utf-8'), logging.StreamHandler(sys.stdout) ] ) return logging.getLogger('PCA_Analysis') logger = setup_logging() def main(): try: logger.info("="*50) logger.info("PCA分析程序启动") logger.info("="*50) # 1. 设置中文字体支持 try: # 尝试多种常见中文字体 plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'KaiTi', 'Arial Unicode MS'] plt.rcParams['axes.unicode_minus'] = False logger.info("中文字体设置成功") except Exception as e: logger.error(f"字体设置错误: {e}") # 尝试手动指定字体路径 try: font_path = 'C:/Windows/Fonts/simhei.ttf' if os.path.exists(font_path): font_prop = fm.FontProperties(fname=font_path) plt.rcParams['font.family'] = font_prop.get_name() plt.rcParams['axes.unicode_minus'] = False logger.info(f"已手动设置字体: {font_path}") else: logger.warning("未找到simhei.ttf字体文件，请确保字体存在。") except Exception as font_error: logger.error(f"手动字体设置失败: {font_error}") # 2. 创建输出目录 output_dir = r'C:\Users\DELL\Desktop\PCA_Results' try: os.makedirs(output_dir, exist_ok=True) logger.info(f"输出目录已创建: {output_dir}") except Exception as e: logger.error(f"创建输出目录失败: {e}") return # 3. 
安全读取Excel文件 file_path = r'C:\Users\DELL\Desktop\underdata05.xlsx' try: logger.info(f"正在读取Excel文件: {file_path}") # 检查文件是否存在 if not os.path.exists(file_path): raise FileNotFoundError(f"文件不存在: {file_path}") # 读取Excel文件 df = pd.read_excel(file_path, sheet_name='underdata05', header=0, index_col=0) # 验证数据格式 if df.shape[0] == 0 or df.shape[1] == 0: raise ValueError("数据为空或格式不正确") logger.info(f"数据读取成功! 形状: {df.shape}") # 打印前几行和前几列信息用于调试 logger.info(f"前5个指标: {df.index.tolist()[:5]}") logger.info(f"前5个年份: {df.columns.tolist()[:5]}") logger.info(f"前5行前5列数据示例:\n{df.iloc[:5, :5]}") except Exception as e: logger.error(f"文件读取失败: {str(e)}") logger.error(traceback.format_exc()) return # 4. 定义时间段划分 time_periods = { "1952-1977": (1952, 1977), "1978-1991": (1978, 1991), "1992-2012": (1992, 2012), "2013-2025": (2013, 2025) } # 5. 增强的PCA分析函数 def perform_pca_analysis(data, period_name, output_dir): """执行PCA分析并生成所有结果""" result = {'success': False} try: logger.info(f"开始分析时间段: {period_name}") # 检查数据有效性 if data.empty: logger.warning(f"{period_name}时间段数据为空，跳过") return result # 处理缺失值 if data.isnull().any().any(): logger.warning(f"数据包含缺失值，将使用列均值填充") data = data.fillna(data.mean()) # 数据标准化 scaler = StandardScaler() scaled_data = scaler.fit_transform(data.T) # 转置：年份为样本，指标为特征 # 执行PCA pca = PCA(n_components=2) principal_components = pca.fit_transform(scaled_data) # 创建得分图DataFrame score_df = pd.DataFrame( principal_components, columns=['主成分1', '主成分2'], index=data.columns # 年份作为索引 ) # 创建载荷图DataFrame loadings_df = pd.DataFrame( pca.components_.T, columns=['PC1', 'PC2'], index=data.index # 指标名称作为索引 ) # 计算方差贡献率 explained_variance = pca.explained_variance_ratio_ # ===== 生成得分图 ===== plt.figure(figsize=(10, 8)) plt.scatter(score_df['主成分1'], score_df['主成分2'], s=50, alpha=0.7) # 标记年份点 for year, row in score_df.iterrows(): plt.annotate(str(year), (row['主成分1'] + 0.02, row['主成分2'] + 0.02), fontsize=9) # 添加标签和标题 plt.title(f'PCA分析 - {period_name}时间段', fontsize=16) plt.xlabel(f'主成分1 (方差贡献率: 
{explained_variance[0]:.2%})', fontsize=12) plt.ylabel(f'主成分2 (方差贡献率: {explained_variance[1]:.2%})', fontsize=12) plt.grid(True, linestyle='--', alpha=0.6) # 保存得分图 score_path = os.path.join(output_dir, f'PCA_得分图_{period_name}.png') plt.savefig(score_path, dpi=300, bbox_inches='tight') plt.close() # ===== 生成载荷图 ===== plt.figure(figsize=(12, 10)) ax = plt.gca() # 绘制载荷箭头 for i, indicator in enumerate(loadings_df.index): x = loadings_df.loc[indicator, 'PC1'] y = loadings_df.loc[indicator, 'PC2'] # 绘制箭头 ax.arrow(0, 0, x, y, head_width=0.05, head_length=0.05, fc='red', ec='red', alpha=0.7) # 添加指标标签 ax.text(x * 1.15, y * 1.15, indicator, color='blue', fontsize=10, bbox=dict(facecolor='white', alpha=0.5, edgecolor='none')) # 添加参考元素 circle = plt.Circle((0,0), 1.0, color='gray', fill=False, linestyle='--', alpha=0.3) ax.add_artist(circle) plt.axhline(y=0, color='k', linestyle='--', alpha=0.3) plt.axvline(x=0, color='k', linestyle='--', alpha=0.3) # 设置坐标轴范围 max_val = max(loadings_df.abs().max().max() * 1.3, 1.0) plt.xlim(-max_val, max_val) plt.ylim(-max_val, max_val) # 添加标签和标题 plt.title(f'指标因子载荷图 - {period_name}时间段', fontsize=16) plt.xlabel(f'主成分1 (方差贡献率: {explained_variance[0]:.2%})', fontsize=12) plt.ylabel(f'主成分2 (方差贡献率: {explained_variance[1]:.2%})', fontsize=12) plt.grid(True, linestyle='--', alpha=0.4) # 保存载荷图 loading_path = os.path.join(output_dir, f'PCA_载荷图_{period_name}.png') plt.savefig(loading_path, dpi=300, bbox_inches='tight') plt.close() # ===== 保存数据结果 ===== result_path = os.path.join(output_dir, f'PCA_结果_{period_name}.xlsx') # 创建Excel文件 with pd.ExcelWriter(result_path) as writer: score_df.to_excel(writer, sheet_name='得分矩阵') loadings_df.to_excel(writer, sheet_name='载荷矩阵') pd.DataFrame({ '主成分': ['PC1', 'PC2'], '方差贡献率': explained_variance, '累计贡献率': [explained_variance[0], sum(explained_variance[:2])] }).to_excel(writer, sheet_name='方差解释', index=False) result.update({ 'success': True, 'score_path': score_path, 'loading_path': loading_path, 'result_path': 
result_path, 'explained_variance': explained_variance }) logger.info(f"时间段 {period_name} 分析成功!") return result except Exception as e: logger.error(f"分析时间段 {period_name} 时出错: {str(e)}") logger.error(traceback.format_exc()) return result # 6. 分时段处理数据并执行PCA all_results = {} for name, (start_year, end_year) in time_periods.items(): try: logger.info(f"\n{'='*50}") logger.info(f"处理时间段: {name} ({start_year}-{end_year})") # 筛选当前时间段的列（年份） period_cols = [col for col in df.columns if start_year <= col <= end_year] if not period_cols: logger.warning(f"警告: {name}时间段无数据，跳过") continue period_data = df[period_cols] # 执行PCA分析 result = perform_pca_analysis(period_data, name, output_dir) all_results[name] = result except Exception as e: logger.error(f"处理时间段 {name} 时出错: {str(e)}") logger.error(traceback.format_exc()) # 7. 生成汇总报告 try: summary_path = os.path.join(output_dir, 'PCA_分析汇总报告.txt') with open(summary_path, 'w', encoding='utf-8') as f: f.write("PCA分析结果汇总报告\n") f.write("="*50 + "\n\n") f.write(f"分析时间: {time.strftime('%Y-%m-%d %H:%M:%S')}\n") f.write(f"源文件: {file_path}\n") f.write(f"输出目录: {output_dir}\n\n") for name, result in all_results.items(): f.write(f"时间段: {name}\n") if result.get('success', False): f.write(f"状态: 成功\n") f.write(f"主成分1方差贡献率: {result['explained_variance'][0]:.2%}\n") f.write(f"主成分2方差贡献率: {result['explained_variance'][1]:.2%}\n") f.write(f"累计方差贡献率: {sum(result['explained_variance'][:2]):.2%}\n") f.write(f"得分图路径: {result['score_path']}\n") f.write(f"载荷图路径: {result['loading_path']}\n") f.write(f"详细结果路径: {result['result_path']}\n") else: f.write(f"状态: 失败\n") f.write("-"*50 + "\n\n") # 添加文件列表 f.write("\n生成文件列表:\n") files = [f for f in os.listdir(output_dir) if os.path.isfile(os.path.join(output_dir, f))] for file in files: f.write(f"- {file}\n") logger.info(f"汇总报告已保存: {summary_path}") except Exception as e: logger.error(f"生成汇总报告时出错: {str(e)}") logger.error(traceback.format_exc()) logger.info("\n" + "="*50) logger.info(f"PCA分析完成! 
结果保存至: {output_dir}") logger.info("="*50) except Exception as e: logger.critical(f"程序发生未捕获的异常: {str(e)}") logger.critical(traceback.format_exc()) finally: # 防止窗口立即关闭 input("按Enter键退出程序...") if __name__ == "__main__": main() ``` ### 重要说明： 1. **文件编码**：在保存文件时，请确保使用UTF-8编码，以避免中文字符问题。 2. **运行方式**：在命令行中运行，而不是在交互式环境中运行。 3. **依赖安装**：确保已安装所有必要的库（pandas, numpy, scikit-learn, matplotlib, openpyxl）。可以使用以下命令安装： ``` pip install pandas numpy scikit-learn matplotlib openpyxl ``` 4. **错误排查**：如果程序运行中出错，请查看桌面上的`PCA_analysis.log`日志文件，其中会有详细错误信息。 ### 常见问题及解决： - **中文字体问题**：如果图中文字显示为方框，请检查系统中是否存在`simhei.ttf`字体，或替换为其他中文字体。 - **文件路径问题**：确保输入文件路径正确（在代码中为`C:\Users\DELL\Desktop\underdata05.xlsx`），如果文件在别处，请修改代码中的路径。 - **Excel文件格式**：确保Excel文件的第一行是列标题（年份），第一列是指标名称。年份必须是数字格式（如 1952），因为代码用 `start_year <= col <= end_year` 按数值筛选列；如果年份是文本，这个比较会抛出 TypeError，对应时间段会被记录为处理出错。 §§相关问题§§: 1. 如何在Python中处理中文字体显示问题？ 2. 除了PCA，还有哪些常用的降维方法？ 3. 如何优化PCA分析结果的可视化效果？ 4. 当数据量非常大时，如何提高PCA的计算效率？ </think> 从错误信息来看，问题在于您在Python交互式解释器中直接粘贴了整个脚本，这会导致缩进错误。传统的交互式解释器会把代码块内部的空行当作该代码块的结束，因此粘贴含有空行和多层缩进的完整脚本时，空行之后仍带缩进的行会被当作新的顶层语句而报 IndentationError。以下是解决方案： ### 正确运行Python脚本的方法 1. **将代码保存为.py文件** 2. **在命令行或IDE中执行** ```python # 文件名: pca_analysis.py import pandas as pd import numpy as np from sklearn.decomposition import PCA from sklearn.preprocessing import StandardScaler import matplotlib.pyplot as plt import matplotlib.font_manager as fm import os import sys import traceback import logging import time def setup_logging(): log_file = r'C:\Users\DELL\Desktop\PCA_analysis.log' logging.basicConfig( level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[ logging.FileHandler(log_file, encoding='utf-8'), logging.StreamHandler(sys.stdout) ] ) return logging.getLogger('PCA_Analysis') logger = setup_logging() def main(): try: logger.info("="*50) logger.info("PCA分析程序启动") logger.info("="*50) # 1. 
设置中文字体支持 try: plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'KaiTi', 'Arial Unicode MS'] plt.rcParams['axes.unicode_minus'] = False logger.info("中文字体设置成功") except Exception as e: logger.error(f"字体设置错误: {e}") try: font_path = 'C:/Windows/Fonts/simhei.ttf' if os.path.exists(font_path): font_prop = fm.FontProperties(fname=font_path) plt.rcParams['font.family'] = font_prop.get_name() plt.rcParams['axes.unicode_minus'] = False logger.info(f"已手动设置字体: {font_path}") else: logger.warning("未找到simhei.ttf字体文件") except Exception as font_error: logger.error(f"手动字体设置失败: {font_error}") # 2. 创建输出目录 output_dir = r'C:\Users\DELL\Desktop\PCA_Results' try: os.makedirs(output_dir, exist_ok=True) logger.info(f"输出目录已创建: {output_dir}") except Exception as e: logger.error(f"创建输出目录失败: {e}") return # 3. 安全读取Excel文件 file_path = r'C:\Users\DELL\Desktop\underdata05.xlsx' try: logger.info(f"正在读取Excel文件: {file_path}") if not os.path.exists(file_path): raise FileNotFoundError(f"文件不存在: {file_path}") df = pd.read_excel(file_path, sheet_name='underdata05', header=0, index_col=0) if df.shape[0] == 0 or df.shape[1] == 0: raise ValueError("数据为空或格式不正确") logger.info(f"数据读取成功! 形状: {df.shape}") logger.info(f"前5个指标: {df.index.tolist()[:5]}") logger.info(f"前5个年份: {df.columns.tolist()[:5]}") except Exception as e: logger.error(f"文件读取失败: {str(e)}") logger.error(traceback.format_exc()) return # 4. 定义时间段划分 time_periods = { "1952-1977": (1952, 1977), "1978-1991": (1978, 1991), "1992-2012": (1992, 2012), "2013-2025": (2013, 2025) } # 5. 
PCA分析函数 def perform_pca_analysis(data, period_name, output_dir): result = {'success': False} try: logger.info(f"开始分析时间段: {period_name}") if data.empty: logger.warning(f"{period_name}时间段数据为空，跳过") return result if data.isnull().any().any(): logger.warning("数据包含缺失值，将使用列均值填充") data = data.fillna(data.mean()) scaler = StandardScaler() scaled_data = scaler.fit_transform(data.T) pca = PCA(n_components=2) principal_components = pca.fit_transform(scaled_data) # 创建得分图DataFrame score_df = pd.DataFrame( principal_components, columns=['主成分1', '主成分2'], index=data.columns ) # 创建载荷图DataFrame loadings_df = pd.DataFrame( pca.components_.T, columns=['PC1', 'PC2'], index=data.index ) explained_variance = pca.explained_variance_ratio_ # 生成得分图 plt.figure(figsize=(10, 8)) plt.scatter(score_df['主成分1'], score_df['主成分2'], s=50, alpha=0.7) for year, row in score_df.iterrows(): plt.annotate(str(year), (row['主成分1'] + 0.02, row['主成分2'] + 0.02), fontsize=9) plt.title(f'PCA分析 - {period_name}时间段', fontsize=16) plt.xlabel(f'主成分1 (方差贡献率: {explained_variance[0]:.2%})', fontsize=12) plt.ylabel(f'主成分2 (方差贡献率: {explained_variance[1]:.2%})', fontsize=12) plt.grid(True, linestyle='--', alpha=0.6) score_path = os.path.join(output_dir, f'PCA_得分图_{period_name}.png') plt.savefig(score_path, dpi=300, bbox_inches='tight') plt.close() # 生成载荷图 plt.figure(figsize=(12, 10)) ax = plt.gca() for i, indicator in enumerate(loadings_df.index): x = loadings_df.loc[indicator, 'PC1'] y = loadings_df.loc[indicator, 'PC2'] ax.arrow(0, 0, x, y, head_width=0.05, head_length=0.05, fc='red', ec='red', alpha=0.7) ax.text(x * 1.15, y * 1.15, indicator, color='blue', fontsize=10, bbox=dict(facecolor='white', alpha=0.5, edgecolor='none')) circle = plt.Circle((0,0), 1.0, color='gray', fill=False, linestyle='--', alpha=0.3) ax.add_artist(circle) plt.axhline(y=0, color='k', linestyle='--', alpha=0.3) plt.axvline(x=0, color='k', linestyle='--', alpha=0.3) max_val = max(loadings_df.abs().max().max() * 1.3, 1.0) plt.xlim(-max_val, max_val) 
plt.ylim(-max_val, max_val) plt.title(f'指标因子载荷图 - {period_name}时间段', fontsize=16) plt.xlabel(f'主成分1 (方差贡献率: {explained_variance[0]:.2%})', fontsize=12) plt.ylabel(f'主成分2 (方差贡献率: {explained_variance[1]:.2%})', fontsize=12) plt.grid(True, linestyle='--', alpha=0.4) loading_path = os.path.join(output_dir, f'PCA_载荷图_{period_name}.png') plt.savefig(loading_path, dpi=300, bbox_inches='tight') plt.close() # 保存数据结果 result_path = os.path.join(output_dir, f'PCA_结果_{period_name}.xlsx') with pd.ExcelWriter(result_path) as writer: score_df.to_excel(writer, sheet_name='得分矩阵') loadings_df.to_excel(writer, sheet_name='载荷矩阵') variance_df = pd.DataFrame({ '主成分': ['PC1', 'PC2'], '方差贡献率': explained_variance, '累计贡献率': [explained_variance[0], sum(explained_variance[:2])] }) variance_df.to_excel(writer, sheet_name='方差解释', index=False) result.update({ 'success': True, 'score_path': score_path, 'loading_path': loading_path, 'result_path': result_path, 'explained_variance': explained_variance }) logger.info(f"时间段 {period_name} 分析成功!") return result except Exception as e: logger.error(f"分析时间段 {period_name} 时出错: {str(e)}") logger.error(traceback.format_exc()) return result # 6. 分时段执行PCA all_results = {} for name, (start_year, end_year) in time_periods.items(): try: logger.info(f"\n{'='*50}") logger.info(f"处理时间段: {name} ({start_year}-{end_year})") period_cols = [col for col in df.columns if start_year <= col <= end_year] if not period_cols: logger.warning(f"警告: {name}时间段无数据，跳过") continue period_data = df[period_cols] result = perform_pca_analysis(period_data, name, output_dir) all_results[name] = result except Exception as e: logger.error(f"处理时间段 {name} 时出错: {str(e)}") logger.error(traceback.format_exc()) # 7. 
生成汇总报告 try: summary_path = os.path.join(output_dir, 'PCA_分析汇总报告.txt') with open(summary_path, 'w', encoding='utf-8') as f: f.write("PCA分析结果汇总报告\n") f.write("="*50 + "\n\n") f.write(f"分析时间: {time.strftime('%Y-%m-%d %H:%M:%S')}\n") f.write(f"源文件: {file_path}\n") f.write(f"输出目录: {output_dir}\n\n") for name, result in all_results.items(): f.write(f"时间段: {name}\n") if result.get('success', False): f.write(f"状态: 成功\n") f.write(f"主成分1方差贡献率: {result['explained_variance'][0]:.2%}\n") f.write(f"主成分2方差贡献率: {result['explained_variance'][1]:.2%}\n") f.write(f"累计方差贡献率: {sum(result['explained_variance'][:2]):.2%}\n") f.write(f"得分图路径: {result['score_path']}\n") f.write(f"载荷图路径: {result['loading_path']}\n") f.write(f"详细结果路径: {result['result_path']}\n") else: f.write(f"状态: 失败\n") f.write("-"*50 + "\n\n") f.write("\n生成文件列表:\n") files = os.listdir(output_dir) for file in files: f.write(f"- {file}\n") logger.info(f"汇总报告已保存: {summary_path}") except Exception as e: logger.error(f"生成汇总报告时出错: {str(e)}") logger.error(traceback.format_exc()) logger.info("\n" + "="*50) logger.info(f"PCA分析完成! 结果保存至: {output_dir}") logger.info("="*50) except Exception as e: logger.critical(f"程序发生未捕获的异常: {str(e)}") logger.critical(traceback.format_exc()) finally: input("按Enter键退出程序...") if __name__ == "__main__": main() ``` ### 执行步骤： 1. **保存文件**： - 将上面的代码复制到文本编辑器中 - 保存为 `pca_analysis.py`（注意扩展名必须是.py） 2. **打开命令提示符**： - 按 `Win + R` 输入 `cmd` 回车 - 导航到保存文件的目录： ```cmd cd C:\Users\DELL\Desktop ``` 3. **运行脚本**： ```cmd python pca_analysis.py ``` ### 为什么不能在交互式解释器中直接粘贴？ 1. **缩进问题**： - Python严格依赖缩进来定义代码块 - 传统的交互式解释器逐条语句读取输入，并把代码块内部的空行当作该代码块的结束；空行之后仍带缩进的行会被当作新的顶层语句，于是报 `IndentationError: unexpected indent` 2. **语法限制**： - 函数和类定义必须作为一个完整的代码块输入 - 函数体或try-except结构一旦被空行截断，剩下的 `except`、`finally` 等部分就不再是合法语句，在交互式环境中很难原样粘贴成功 3. **错误处理**： - 粘贴的内容出错后，解释器不会停下来，而是继续逐行执行后面的内容，因此一次粘贴会连锁产生大量报错 - 以脚本方式运行时，整个文件会先被完整解析再执行，脚本中的try-except也才能按预期捕获并记录异常 ### 替代方案：使用Jupyter Notebook 如果您更喜欢交互式环境，可以使用Jupyter Notebook： ```python # 在Jupyter Notebook中运行 !pip install pandas numpy scikit-learn matplotlib openpyxl # 然后逐个代码块执行 import pandas as pd # ...（复制上面的代码，按逻辑分成多个cell执行） ```

创作声明：本文部分内容由AI辅助生成（AIGC），仅供参考

下一篇 Python脚本导入yaml失败后手动加路径能成功，这说明了什么问题？