# 代码文件拆分说明
我理解您的需求!您希望将每个带序号的部分都拆分成单独的Python文件。以下是完整的文件结构和每个文件的代码内容:
## 📁 项目文件结构
```
ai_basic_project/
├── 01_environment_check.py
├── 02_data_preparation.py
├── 03_model_training.py
├── 04_inference_engine.py
├── 05_interactive_interface.py
├── 06_model_optimization.py
├── 07_model_persistence.py
└── requirements.txt
```
## 📄 各文件详细代码
### 文件 1: 环境检查
```python
# 01_environment_check.py
import sys
import subprocess
def check_environment(required_packages=None):
    """Report which required packages can be imported.

    Args:
        required_packages: Optional iterable of pip distribution names to
            check. When omitted, the project's default list is used.

    Returns:
        True when every package could be imported, False otherwise.
    """
    import importlib

    # pip distribution name -> importable module name, where the two differ.
    # 'scikit-learn' is installed under that name but imported as 'sklearn';
    # importing the pip name directly can never succeed.
    import_names = {'scikit-learn': 'sklearn'}

    if required_packages is None:
        required_packages = [
            'numpy', 'pandas', 'scikit-learn',
            'matplotlib', 'jupyter', 'joblib',
        ]

    missing_packages = []
    for package in required_packages:
        module_name = import_names.get(package, package)
        try:
            importlib.import_module(module_name)
        except ImportError:
            # Keep the pip name so the install hint below is copy-pasteable.
            missing_packages.append(package)
            print(f"❌ {package} 未安装")
        else:
            print(f"✅ {package} 已安装")

    if missing_packages:
        print(f"\n请安装缺失的包: pip install {' '.join(missing_packages)}")
        return False

    print("\n🎉 环境配置完成,可以开始AI开发!")
    return True
if __name__ == "__main__":
    # Script entry point: run the dependency check when executed directly.
    check_environment()
```
### 文件 2: 数据准备
```python
# 02_data_preparation.py
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
def load_and_explore_data():
    """Load the iris dataset, print a short summary and return it.

    Returns:
        A 4-tuple ``(iris, X, y, iris_df)``: the object returned by
        ``load_iris()``, its ``data`` and ``target`` attributes, and a
        DataFrame holding the feature columns plus ``target`` and
        ``species`` columns.
    """
    dataset = load_iris()
    features, labels = dataset.data, dataset.target

    # Numeric label -> species name used for the 'species' column.
    species_by_label = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
    frame = pd.DataFrame(features, columns=dataset.feature_names)
    frame['target'] = labels
    frame['species'] = frame['target'].map(species_by_label)

    sample_count = len(frame)
    feature_count = features.shape[1]
    print("数据集基本信息:")
    print(f"样本数量: {sample_count}")
    print(f"特征数量: {feature_count}")
    print(f"类别分布:\n{frame['species'].value_counts()}")

    return dataset, features, labels, frame
def visualize_data(iris_df):
    """Draw sepal and petal scatter plots per species, then save and show them.

    The figure is written to 'data_visualization.png' before being shown.

    Args:
        iris_df: DataFrame with the four iris measurement columns and a
            'species' column.
    """
    # One entry per subplot:
    # (subplot index, x column, y column, x label, y label, title)
    panels = (
        (1, 'sepal length (cm)', 'sepal width (cm)',
         '花萼长度 (cm)', '花萼宽度 (cm)', '花萼尺寸分布'),
        (2, 'petal length (cm)', 'petal width (cm)',
         '花瓣长度 (cm)', '花瓣宽度 (cm)', '花瓣尺寸分布'),
    )

    plt.figure(figsize=(12, 4))
    for position, x_column, y_column, x_label, y_label, title in panels:
        plt.subplot(1, 2, position)
        for name in iris_df['species'].unique():
            rows = iris_df[iris_df['species'] == name]
            plt.scatter(rows[x_column], rows[y_column],
                        label=name, alpha=0.7)
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.legend()
        plt.title(title)

    plt.tight_layout()
    plt.savefig('data_visualization.png')
    plt.show()
def prepare_training_data(X, y):
    """Split the data into train/test sets and standardize the features.

    The split is done first and the scaler is fitted on the training rows
    only, so no statistics of the held-out rows leak into preprocessing.
    The test rows are transformed with the training-set statistics.

    Args:
        X: Feature matrix.
        y: Class labels, also used to stratify the split.

    Returns:
        A 5-tuple ``(X_train, X_test, y_train, y_test, scaler)`` where the
        feature arrays are standardized and ``scaler`` is the fitted
        ``StandardScaler``.
    """
    # 80/20 stratified split with a fixed seed for reproducibility.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Fit on the training portion only, then reuse those statistics.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    print(f"训练集大小: {X_train.shape[0]}")
    print(f"测试集大小: {X_test.shape[0]}")
    return X_train, X_test, y_train, y_test, scaler
if __name__ == "__main__":
    # Script entry point: load and summarize the data, plot it, then build
    # the train/test split together with the fitted scaler.
    iris, X, y, iris_df = load_and_explore_data()
    visualize_data(iris_df)
    X_train, X_test, y_train, y_test, scaler = prepare_training_data(X, y)
```
### 文件 3: 模型训练
```python
# 03_model_training.py
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import joblib
def train_model(X_train, y_train):
    """Fit a random-forest classifier on the training data.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    # Fixed seed and shallow trees (depth 3), 100 trees in total.
    forest_settings = {
        'n_estimators': 100,
        'random_state': 42,
        'max_depth': 3,
    }
    classifier = RandomForestClassifier(**forest_settings)
    classifier.fit(X_train, y_train)
    print("✅ 模型训练完成!")
    return classifier
def evaluate_model(model, X_test, y_test, iris):
    """Score the model on the test set and print/plot the results.

    Prints the accuracy and a classification report, then draws the
    confusion matrix via ``plot_confusion_matrix``.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test feature matrix.
        y_test: True test labels.
        iris: Dataset object whose ``target_names`` label the classes.

    Returns:
        A tuple ``(accuracy, y_pred)``.
    """
    predictions = model.predict(X_test)

    score = accuracy_score(y_test, predictions)
    print(f"🎯 模型准确率: {score:.4f}")

    print("\n详细分类报告:")
    report = classification_report(
        y_test, predictions, target_names=iris.target_names
    )
    print(report)

    plot_confusion_matrix(y_test, predictions, iris)
    return score, predictions
def plot_confusion_matrix(y_test, y_pred, iris):
    """Plot the confusion matrix as a heat map, then save and show it.

    The figure is written to 'confusion_matrix.png' before being shown.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.
        iris: Dataset object whose ``target_names`` label both axes.
    """
    matrix = confusion_matrix(y_test, y_pred)
    class_names = iris.target_names
    tick_positions = np.arange(len(class_names))

    plt.figure(figsize=(8, 6))
    plt.imshow(matrix, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('混淆矩阵')
    plt.colorbar()
    plt.xticks(tick_positions, class_names, rotation=45)
    plt.yticks(tick_positions, class_names)
    plt.xlabel('预测标签')
    plt.ylabel('真实标签')

    # Write each count into its cell; use white text on the darker cells
    # (count above half the maximum) so the number stays readable.
    half_of_max = matrix.max() / 2.
    for (row, col), count in np.ndenumerate(matrix):
        text_color = "white" if count > half_of_max else "black"
        plt.text(col, row, format(count, 'd'),
                 horizontalalignment="center",
                 color=text_color)

    plt.tight_layout()
    plt.savefig('confusion_matrix.png')
    plt.show()
if __name__ == "__main__":
    # Load the dataset.
    iris = load_iris()
    X = iris.data
    y = iris.target

    # Split first, then fit the scaler on the training rows only so that
    # test-set statistics do not leak into preprocessing (and therefore
    # into the reported accuracy).
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    # Train and evaluate.
    model = train_model(X_train, y_train)
    accuracy, y_pred = evaluate_model(model, X_test, y_test, iris)

    # Persist the model together with the scaler it was trained with.
    joblib.dump({'model': model, 'scaler': scaler}, 'trained_model.pkl')
    print("✅ 模型已保存为 'trained_model.pkl'")
```
### 文件 4: 推理引擎
```python
# 04_inference_engine.py
import numpy as np
import joblib
class IrisClassifier:
    """Inference wrapper around a saved iris model and its fitted scaler."""

    def __init__(self, model_path=None):
        """Create the classifier, optionally loading a saved model.

        Args:
            model_path: Path of a joblib file to load. When falsy, the
                instance starts empty and ``load_model`` must be called
                before ``predict``.
        """
        self.model = None
        self.scaler = None
        self.feature_names = None
        self.target_names = None
        if model_path:
            self.load_model(model_path)

    def load_model(self, model_path):
        """Load a model/scaler pair saved with joblib.

        Feature and class names are taken from the file's 'metadata' entry
        when it provides them, otherwise from the iris dataset.

        Args:
            model_path: Path of a joblib file holding a dict with at least
                the keys 'model' and 'scaler'.
        """
        # NOTE: joblib.load unpickles the file, which can execute arbitrary
        # code. Only load model files from a trusted source.
        model_data = joblib.load(model_path)
        self.model = model_data['model']
        self.scaler = model_data['scaler']

        metadata = model_data.get('metadata') or {}
        if 'feature_names' in metadata and 'target_names' in metadata:
            self.feature_names = metadata['feature_names']
            self.target_names = metadata['target_names']
        else:
            # Fall back to the names shipped with the iris dataset.
            from sklearn.datasets import load_iris
            iris = load_iris()
            self.feature_names = iris.feature_names
            self.target_names = iris.target_names
        print("✅ 模型加载成功!")

    def predict(self, features):
        """Predict the iris species for one sample.

        Args:
            features: Four numeric values (sepal length, sepal width,
                petal length, petal width).

        Returns:
            A dict with 'predicted_class', 'confidence' (probability of
            the predicted class) and 'all_probabilities' (class name ->
            probability).

        Raises:
            ValueError: If no model is loaded, or the input is not four
                finite numbers.
        """
        if self.model is None:
            raise ValueError("请先加载模型!")

        size_error = "需要提供4个特征值: 花萼长度、花萼宽度、花瓣长度、花瓣宽度"
        try:
            features_array = np.asarray(features, dtype=float)
        except (TypeError, ValueError):
            raise ValueError(size_error) from None
        # Exactly four values in total, along the first axis.
        if (features_array.ndim == 0
                or features_array.shape[0] != 4
                or features_array.size != 4):
            raise ValueError(size_error)
        if not np.isfinite(features_array).all():
            raise ValueError("特征值必须是有限的数值")

        features_scaled = self.scaler.transform(features_array.reshape(1, -1))

        label = self.model.predict(features_scaled)[0]
        probabilities = self.model.predict_proba(features_scaled)[0]

        # predict_proba columns follow model.classes_, while target_names is
        # indexed by label value. Map between the two explicitly instead of
        # assuming that labels and column positions coincide.
        classes = list(getattr(self.model, 'classes_',
                               range(len(probabilities))))
        position = classes.index(label)

        return {
            'predicted_class': self.target_names[label],
            'confidence': probabilities[position],
            'all_probabilities': {
                self.target_names[cls]: prob
                for cls, prob in zip(classes, probabilities)
            },
        }
def test_inference():
    """Load 'trained_model.pkl' and print predictions for three samples.

    Returns:
        The ``IrisClassifier`` instance that was loaded.
    """
    classifier = IrisClassifier('trained_model.pkl')

    # Hand-picked measurements:
    # (sepal length, sepal width, petal length, petal width).
    samples = (
        [5.1, 3.5, 1.4, 0.2],  # small petals
        [6.0, 2.7, 5.1, 1.6],  # large petals
        [5.5, 2.4, 3.8, 1.1],  # mid-sized petals
    )

    print("🧪 模型推理测试:")
    for number, measurements in enumerate(samples, start=1):
        outcome = classifier.predict(measurements)
        predicted = outcome['predicted_class']
        confidence = outcome['confidence']

        print(f"\n样本 {number}: {measurements}")
        print(f"预测种类: {predicted}")
        print(f"置信度: {confidence:.4f}")
        print("各类别概率:")
        for name, probability in outcome['all_probabilities'].items():
            print(f" {name}: {probability:.4f}")

    return classifier
if __name__ == "__main__":
    # Script entry point: run the smoke test and keep the loaded
    # classifier bound at module level.
    classifier = test_inference()
```
### 文件 5: 交互界面
```python
# 05_interactive_interface.py
from inference_engine import IrisClassifier
import joblib
def interactive_classifier():
    """Run a console loop that classifies iris measurements typed by the user.

    Loads 'trained_model.pkl', then repeatedly reads four numbers and
    prints the predicted species with its probability distribution. The
    loop ends on 'quit'/'exit'/'q', on end-of-input, or on Ctrl-C.
    """
    print("🌷 鸢尾花分类器 v1.0")
    print("=" * 40)
    print("请输入鸢尾花的4个特征值:")
    print("格式: 花萼长度 花萼宽度 花瓣长度 花瓣宽度")
    print("示例: 5.1 3.5 1.4 0.2")
    print("输入 'quit' 退出程序")
    print("=" * 40)

    # Load the model. Catch Exception (not a bare except) so Ctrl-C and
    # SystemExit still propagate, and show why loading failed.
    try:
        classifier = IrisClassifier('trained_model.pkl')
    except Exception as e:
        print("❌ 无法加载模型,请先运行训练脚本")
        print(f"   原因: {e}")
        return

    while True:
        # Read input outside the generic handler below: EOFError is an
        # Exception, so catching it there would loop forever once stdin
        # is closed.
        try:
            user_input = input("\n请输入特征值: ").strip()
        except (EOFError, KeyboardInterrupt):
            print("\n感谢使用鸢尾花分类器!")
            break

        if user_input.lower() in ('quit', 'exit', 'q'):
            print("感谢使用鸢尾花分类器!")
            break

        try:
            # Parse the whitespace-separated numbers.
            features = [float(x) for x in user_input.split()]
            if len(features) != 4:
                print("❌ 错误:请输入4个数值")
                continue

            result = classifier.predict(features)

            print(f"\n🎯 预测结果: {result['predicted_class']}")
            print(f"📊 置信度: {result['confidence']:.2%}")
            print("\n详细概率分布:")
            for species, prob in result['all_probabilities'].items():
                # Text bar: 20 characters correspond to 100%.
                bar = "█" * int(prob * 20)
                print(f" {species:12}: {prob:6.2%} {bar}")
        except ValueError:
            print("❌ 错误:请输入有效的数值")
        except Exception as e:
            print(f"❌ 发生错误: {e}")
if __name__ == "__main__":
    # Script entry point: start the interactive console session.
    interactive_classifier()
```
### 文件 6: 模型优化
```python
# 06_model_optimization.py
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib
def optimize_model(X_train, y_train):
    """Tune random-forest hyperparameters with a cross-validated grid search.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        The best estimator found by the search (``best_estimator_``).
    """
    # 3 x 4 x 3 = 36 parameter combinations, each scored with 5-fold CV.
    search_space = {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7, None],
        'min_samples_split': [2, 5, 10],
    }
    base_forest = RandomForestClassifier(random_state=42)
    searcher = GridSearchCV(
        estimator=base_forest,
        param_grid=search_space,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
    )

    print("🔍 开始网格搜索优化...")
    searcher.fit(X_train, y_train)

    print("🎯 最佳参数:", searcher.best_params_)
    print("🏆 最佳准确率:", searcher.best_score_)
    return searcher.best_estimator_
if __name__ == "__main__":
    # Load the dataset.
    iris = load_iris()
    X = iris.data
    y = iris.target

    # Split first, then fit the scaler on the training rows only so that
    # test-set statistics do not leak into preprocessing or tuning.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    # Tune on the training set only.
    best_model = optimize_model(X_train, y_train)

    # Report accuracy on the held-out split, which the search never saw.
    test_accuracy = best_model.score(X_test, y_test)
    print(f"📊 测试集准确率: {test_accuracy:.4f}")

    # Persist the tuned model together with its scaler.
    model_data = {
        'model': best_model,
        'scaler': scaler,
        'optimized': True
    }
    joblib.dump(model_data, 'optimized_model.pkl')
    print("✅ 优化后的模型已保存为 'optimized_model.pkl'")
```
### 文件 7: 模型持久化
```python
# 07_model_persistence.py
import joblib
import datetime
from sklearn.datasets import load_iris
def save_model_with_metadata(model, scaler, accuracy, filename=None):
    """Save a model, its scaler and descriptive metadata with joblib.

    Args:
        model: Trained estimator to persist.
        scaler: Fitted scaler to persist alongside the model.
        accuracy: Accuracy value recorded in the metadata.
        filename: Target path. When None, a name of the form
            'iris_classifier_<YYYYmmdd_HHMMSS>.pkl' is generated.

    Returns:
        The filename the data was written to.
    """
    # Take one timestamp so the generated filename and 'created_at' agree.
    created = datetime.datetime.now()
    if filename is None:
        timestamp = created.strftime("%Y%m%d_%H%M%S")
        filename = f"iris_classifier_{timestamp}.pkl"

    # Dataset information stored next to the model.
    iris = load_iris()
    model_data = {
        'model': model,
        'scaler': scaler,
        'metadata': {
            # feature_names is already a plain list (it has no .tolist()),
            # while target_names is a NumPy array; normalize both to lists
            # of built-in str.
            'feature_names': [str(name) for name in iris.feature_names],
            'target_names': [str(name) for name in iris.target_names],
            'accuracy': accuracy,
            'dataset': 'iris',
            'created_at': created.isoformat(),
            'version': '1.0'
        }
    }
    joblib.dump(model_data, filename)
    print(f"✅ 模型已保存: {filename}")
    return filename
def load_model_with_metadata(filename):
    """Load a saved model file and print whatever metadata it carries.

    Files without a 'metadata' entry (for example a plain
    ``{'model', 'scaler'}`` dict) are loaded as well; in that case only a
    short notice is printed.

    Args:
        filename: Path of the joblib file to load.

    Returns:
        The object stored in the file.
    """
    # NOTE: joblib.load unpickles the file, which can execute arbitrary
    # code. Only load model files from a trusted source.
    model_data = joblib.load(filename)
    print("✅ 模型加载成功")

    metadata = model_data.get('metadata') if isinstance(model_data, dict) else None
    if not metadata:
        print("ℹ️ 该模型文件不包含元数据")
        return model_data

    print(f"📅 创建时间: {metadata.get('created_at')}")
    accuracy = metadata.get('accuracy')
    if isinstance(accuracy, (int, float)):
        print(f"🎯 准确率: {accuracy:.4f}")
    else:
        # Accuracy missing or not numeric: print it as-is rather than crash.
        print(f"🎯 准确率: {accuracy}")
    print(f"🔢 特征: {metadata.get('feature_names')}")
    print(f"🏷️ 类别: {metadata.get('target_names')}")
    return model_data
def list_saved_models():
    """Print the files matching 'iris_classifier_*.pkl' in the working directory."""
    from glob import glob as find_files

    matches = find_files("iris_classifier_*.pkl")
    if not matches:
        print("📁 没有找到保存的模型文件")
        return

    print("📁 已保存的模型文件:")
    for path in matches:
        print(f" - {path}")
if __name__ == "__main__":
    # list_saved_models imports glob only inside its own scope, so the
    # module must be imported here before glob.glob can be used.
    import glob

    list_saved_models()

    # Example: load one saved model (if any) and print its information.
    model_files = glob.glob("iris_classifier_*.pkl")
    if model_files:
        load_model_with_metadata(model_files[0])
```
### 依赖文件
```txt
# requirements.txt
numpy>=1.21.0
pandas>=1.3.0
scikit-learn>=1.0.0
matplotlib>=3.5.0
jupyter>=1.0.0
joblib>=1.1.0
```
## 🚀 使用说明
1. **环境准备**: 运行 `01_environment_check.py`
2. **数据探索**: 运行 `02_data_preparation.py`
3. **模型训练**: 运行 `03_model_training.py`
4. **模型测试**: 运行 `04_inference_engine.py`
5. **交互使用**: 运行 `05_interactive_interface.py`
6. **模型优化**: 运行 `06_model_optimization.py`(可选)
7. **模型管理**: 运行 `07_model_persistence.py`
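这些文件相互关联,构成了完整的AI项目开发流程!请按上述顺序运行:`04_inference_engine.py` 和 `05_interactive_interface.py` 依赖 `03_model_training.py` 生成的 `trained_model.pkl`,因此并非每个文件都能完全独立运行。另外,`05_interactive_interface.py` 通过 `from inference_engine import IrisClassifier` 导入推理引擎,而以数字开头的文件名无法直接用 `import` 语句导入,所以需要先将 `04_inference_engine.py` 复制或重命名为 `inference_engine.py`。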