根据您提供的文件内容,我将为您生成两个独立的Python程序。
## 一、决策树程序(基于打球数据)
```python
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

# Register CJK-capable fonts so the Chinese figure titles below render
# as text instead of empty glyph boxes (harmless if fonts are missing).
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'Arial Unicode MS']
plt.rcParams['axes.unicode_minus'] = False

# Classic 14-row "play tennis" dataset: four categorical weather
# features and a binary yes/no target.
data = {
    'outlook': ['sunny', 'sunny', 'overcast', 'rainy', 'rainy', 'rainy',
                'overcast', 'sunny', 'sunny', 'rainy', 'sunny', 'overcast',
                'overcast', 'rainy'],
    'temperature': ['hot', 'hot', 'hot', 'mild', 'cool', 'cool', 'cool',
                    'mild', 'cool', 'mild', 'mild', 'mild', 'hot', 'mild'],
    'humidity': ['high', 'high', 'high', 'high', 'normal', 'normal', 'normal',
                 'high', 'normal', 'normal', 'normal', 'high', 'normal', 'high'],
    'windy': ['false', 'true', 'false', 'false', 'false', 'true', 'true',
              'false', 'false', 'false', 'true', 'true', 'false', 'true'],
    'play': ['no', 'no', 'yes', 'yes', 'yes', 'no', 'yes',
             'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'no']
}

df = pd.DataFrame(data)

print("打球数据集:")
print(df)
print("\n" + "="*60 + "\n")

# Single source of truth for the feature column order (used for
# encoding, plotting, rule export, and test-case encoding below).
feature_names = ['outlook', 'temperature', 'humidity', 'windy']
class_names = ['no', 'yes']

# One LabelEncoder per categorical feature, kept in a dict so the same
# fitted mapping can be reused when encoding the prediction examples.
label_encoders = {}
X_encoded = pd.DataFrame()
for column in feature_names:
    le = LabelEncoder()
    X_encoded[column] = le.fit_transform(df[column])
    label_encoders[column] = le
    print(f"{column} 编码映射: {dict(zip(le.classes_, le.transform(le.classes_)))}")

# Encode the target; LabelEncoder sorts classes, so 'no' -> 0, 'yes' -> 1.
le_target = LabelEncoder()
y = le_target.fit_transform(df['play'])
print(f"play 编码映射: {dict(zip(le_target.classes_, le_target.transform(le_target.classes_)))}")

X = X_encoded
print(f"\n特征矩阵形状: {X.shape}")
print(f"目标变量形状: {y.shape}")

# Entropy criterion = ID3-style information-gain splits; the depth cap
# keeps the tree readable and limits overfitting on only 14 samples.
clf = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=4,
    random_state=42
)
clf.fit(X, y)

# Visualize the fitted tree.
plt.figure(figsize=(15, 10))
plot_tree(clf,
          feature_names=feature_names,
          class_names=class_names,
          filled=True,
          rounded=True,
          fontsize=10)
plt.title("打球决策树", fontsize=16)
plt.show()

# Plain-text dump of the learned decision rules.
print("\n决策树文本表示:")
print("="*60)
tree_rules = export_text(clf, feature_names=feature_names)
print(tree_rules)

print("\n特征重要性:")
print("="*60)
for feature, importance in zip(feature_names, clf.feature_importances_):
    print(f"{feature}: {importance:.4f}")

print("\n预测示例:")
print("="*60)
test_cases = [
    ['sunny', 'hot', 'high', 'false'],       # expected: no (don't play)
    ['overcast', 'mild', 'normal', 'true'],  # expected: yes (play)
    ['rainy', 'cool', 'normal', 'false']     # expected: yes (play)
]
for i, test in enumerate(test_cases):
    # Re-encode the raw strings with the encoders fitted on the training
    # data; categories never seen in training get the sentinel -1, which
    # cannot collide with any real encoded value.
    test_encoded = []
    for j, col in enumerate(feature_names):
        if test[j] in label_encoders[col].classes_:
            test_encoded.append(label_encoders[col].transform([test[j]])[0])
        else:
            test_encoded.append(-1)  # unknown-category sentinel
    # Wrap in a DataFrame with the training column names so sklearn does
    # not emit a "feature names" warning (the model was fit on a DataFrame).
    test_df = pd.DataFrame([test_encoded], columns=feature_names)
    prediction = clf.predict(test_df)[0]
    prediction_label = le_target.inverse_transform([prediction])[0]
    print(f"测试案例 {i+1}: {test}")
    print(f" 预测结果: {'打球' if prediction_label == 'yes' else '不打球'}")
    print(f" 预测概率: {clf.predict_proba(test_df)[0]}")
    print()

# Resubstitution accuracy only — a real evaluation would need a held-out
# test set, which 14 samples cannot reasonably afford.
accuracy = clf.score(X, y)
print(f"模型准确率(训练集): {accuracy:.4f}")
```
## 二、DBSCAN聚类算法(鸢尾花数据集)
```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, adjusted_rand_score

# Register CJK-capable fonts so the Chinese titles/legends render as
# text instead of empty glyph boxes (harmless if fonts are missing).
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'Arial Unicode MS']
plt.rcParams['axes.unicode_minus'] = False

# Load the iris dataset: 150 samples, 4 features, 3 species.
iris = load_iris()
X = iris.data
y_true = iris.target
feature_names = iris.feature_names
target_names = iris.target_names

print("鸢尾花数据集信息:")
print(f"数据形状: {X.shape}")
print(f"特征名称: {feature_names}")
print(f"类别名称: {target_names}")
print(f"真实标签分布: {np.bincount(y_true)}")
print("\n" + "="*60 + "\n")

# Standardize first: DBSCAN's eps is a Euclidean radius, so features
# must be on a comparable scale for the neighborhoods to make sense.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("DBSCAN聚类参数调优:")
print("-" * 40)

# Small grid of (eps, min_samples) candidates to compare.
param_combinations = [
    {'eps': 0.5, 'min_samples': 5},
    {'eps': 0.6, 'min_samples': 5},
    {'eps': 0.7, 'min_samples': 5},
    {'eps': 0.8, 'min_samples': 5},
    {'eps': 0.5, 'min_samples': 10},
]

best_score = -1
best_params = None
best_labels = None
for params in param_combinations:
    dbscan = DBSCAN(eps=params['eps'], min_samples=params['min_samples'])
    labels = dbscan.fit_predict(X_scaled)
    # Score on non-noise points only. Silhouette is defined only for
    # >= 2 distinct clusters, so we check the *masked* label set — the
    # original check on the full set (which included the -1 noise label)
    # could pass while all real points sat in one cluster, raising a
    # ValueError inside silhouette_score.
    valid_mask = labels != -1
    valid_labels = labels[valid_mask]
    if len(set(valid_labels)) > 1:
        score = silhouette_score(X_scaled[valid_mask], valid_labels)
    else:
        score = -1
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise = list(labels).count(-1)
    print(f"eps={params['eps']}, min_samples={params['min_samples']}")
    print(f" 聚类数量: {n_clusters}, 噪声点数量: {n_noise}")
    print(f" 轮廓系数: {score:.4f}")
    if score > best_score:
        best_score = score
        best_params = params
        best_labels = labels

# Fallback so the report below cannot crash on best_params being None
# if every candidate degenerated to a single cluster (score == -1).
if best_params is None:
    best_params = param_combinations[0]

print("\n" + "="*60)
print(f"最佳参数: eps={best_params['eps']}, min_samples={best_params['min_samples']}")
print(f"最佳轮廓系数: {best_score:.4f}")

# Refit with the winning parameters; DBSCAN is deterministic for a
# fixed dataset, so this reproduces the best run's labels.
dbscan = DBSCAN(eps=best_params['eps'], min_samples=best_params['min_samples'])
y_pred = dbscan.fit_predict(X_scaled)

# Summarize the final clustering.
unique_labels = set(y_pred)
n_clusters = len(unique_labels) - (1 if -1 in unique_labels else 0)
n_noise = list(y_pred).count(-1)
print(f"\n聚类结果统计:")
print(f"聚类数量: {n_clusters}")
print(f"噪声点数量: {n_noise}")
# Shift labels by +1 so the noise label (-1) lands in bin 0 of bincount.
print(f"标签分布: {np.bincount(y_pred + 1)}")

# Adjusted Rand Index against the true species, non-noise points only.
valid_mask = y_pred != -1
if sum(valid_mask) > 0:
    ari = adjusted_rand_score(y_true[valid_mask], y_pred[valid_mask])
    print(f"调整兰德指数(ARI): {ari:.4f}")

# 2-D PCA projection for plotting only; clustering was done in 4-D.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
print(f"\nPCA解释方差比: {pca.explained_variance_ratio_}")
print(f"累计解释方差: {sum(pca.explained_variance_ratio_):.4f}")

fig, axes = plt.subplots(2, 2, figsize=(14, 12))

# Panel 1: ground-truth species in PCA space.
scatter1 = axes[0, 0].scatter(X_pca[:, 0], X_pca[:, 1], c=y_true,
                              cmap='viridis', edgecolor='k', s=50)
axes[0, 0].set_title('真实类别分布', fontsize=14, fontweight='bold')
axes[0, 0].set_xlabel('PCA Component 1')
axes[0, 0].set_ylabel('PCA Component 2')
plt.colorbar(scatter1, ax=axes[0, 0], ticks=[0, 1, 2])
axes[0, 0].grid(True, alpha=0.3)

# Panel 2: DBSCAN labels in PCA space; noise points drawn in gray.
colors = plt.cm.Set1(np.linspace(0, 1, len(unique_labels)))
for k, col in zip(unique_labels, colors):
    if k == -1:
        col = 'gray'  # noise
    class_member_mask = (y_pred == k)
    xy = X_pca[class_member_mask]
    axes[0, 1].scatter(xy[:, 0], xy[:, 1], c=[col],
                       label='噪声点' if k == -1 else f'聚类{k}',
                       edgecolor='k', s=50, alpha=0.8)
axes[0, 1].set_title('DBSCAN聚类结果', fontsize=14, fontweight='bold')
axes[0, 1].set_xlabel('PCA Component 1')
axes[0, 1].set_ylabel('PCA Component 2')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# Panel 3: raw sepal features colored by cluster.
scatter3 = axes[1, 0].scatter(X[:, 0], X[:, 1], c=y_pred,
                              cmap='Set1', edgecolor='k', s=50)
axes[1, 0].set_title('特征关系(花萼)', fontsize=14, fontweight='bold')
axes[1, 0].set_xlabel(feature_names[0])
axes[1, 0].set_ylabel(feature_names[1])
plt.colorbar(scatter3, ax=axes[1, 0])
axes[1, 0].grid(True, alpha=0.3)

# Panel 4: raw petal features colored by cluster.
scatter4 = axes[1, 1].scatter(X[:, 2], X[:, 3], c=y_pred,
                              cmap='Set1', edgecolor='k', s=50)
axes[1, 1].set_title('特征关系(花瓣)', fontsize=14, fontweight='bold')
axes[1, 1].set_xlabel(feature_names[2])
axes[1, 1].set_ylabel(feature_names[3])
plt.colorbar(scatter4, ax=axes[1, 1])
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.suptitle('鸢尾花数据集DBSCAN聚类分析', fontsize=16, fontweight='bold', y=1.02)
plt.show()

# DBSCAN has no centroids, so compute per-cluster feature means by hand
# (on the original, unscaled features for interpretability).
print("\n聚类中心(非噪声点):")
print("-" * 40)
for cluster_id in range(n_clusters):
    cluster_mask = y_pred == cluster_id
    if np.any(cluster_mask):
        cluster_center = X[cluster_mask].mean(axis=0)
        cluster_size = np.sum(cluster_mask)
        print(f"\n聚类 {cluster_id}:")
        print(f" 样本数量: {cluster_size}")
        print(f" 中心特征值:")
        for feature, value in zip(feature_names, cluster_center):
            print(f" {feature}: {value:.2f}")

# Basic statistics for the points DBSCAN marked as noise.
if n_noise > 0:
    print(f"\n噪声点分析:")
    print(f" 噪声点数量: {n_noise}")
    print(f" 噪声点比例: {n_noise/len(X):.2%}")
    noise_mask = y_pred == -1
    noise_data = X[noise_mask]
    print(f"\n 噪声点特征统计:")
    for i, feature in enumerate(feature_names):
        print(f" {feature}: 均值={noise_data[:, i].mean():.2f}, "
              f"标准差={noise_data[:, i].std():.2f}")

# Assemble a per-sample results table: features + true labels + cluster.
results_df = pd.DataFrame(X, columns=feature_names)
results_df['真实类别'] = y_true
results_df['真实类别名称'] = [target_names[i] for i in y_true]
results_df['聚类标签'] = y_pred
results_df['是否为噪声点'] = results_df['聚类标签'] == -1
print("\n聚类结果数据框(前10行):")
print(results_df.head(10))
```
## 程序说明:
### 1. 决策树程序特点:
- 使用信息熵(entropy)作为分裂标准
- 限制最大深度防止过拟合
- 显示完整的决策树结构和特征重要性
- 包含预测示例和模型评估
### 2. DBSCAN聚类程序特点:
- 使用DBSCAN算法(基于密度的聚类)
- 自动调优eps和min_samples参数
- 提供多种可视化:PCA降维、特征关系图
- 计算轮廓系数和调整兰德指数进行评估
- 分析噪声点并显示聚类中心
这两个程序都包含了详细的数据处理、模型训练/聚类、结果可视化和性能评估,可以直接运行。