# 继续完成机器学习任务
import os
import sys

import numpy as np
import pandas as pd
import requests
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, KFold, cross_validate
# 任务1: 使用pandas从本地读取iris数据集
def load_iris_with_pandas():
    """Load the iris dataset from a local ``iris.csv`` file using pandas.

    If ``iris.csv`` does not exist in the current working directory, it is
    downloaded from the UCI repository first. The shape, first rows, dtypes
    and class distribution of the loaded frame are printed.

    Returns:
        tuple: ``(X, y)`` where ``X`` holds the values of every column except
        ``class`` and ``y`` holds the values of the ``class`` column, or
        ``(None, None)`` when the file had to be downloaded and that failed.
    """
    # Download the file only when there is no local copy yet.
    if not os.path.exists('iris.csv'):
        print("下载iris数据集...")
        try:
            url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            with open('iris.csv', 'wb') as f:
                f.write(response.content)
            print("数据集下载完成")
        except (requests.RequestException, OSError) as e:
            # Network or file-system failure: report it and let the caller
            # continue with the scikit-learn copy of the dataset.
            print(f"下载数据集失败: {e}")
            print("将使用scikit-learn的数据集进行后续操作")
            return None, None

    # The file is read with header=None, so column names are supplied here.
    column_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']

    df = pd.read_csv('iris.csv', header=None, names=column_names)

    print("\n1. 使用pandas读取的iris数据集:")
    print("数据集形状:", df.shape)
    print("前5行数据:")
    print(df.head())
    print("数据类型:")
    print(df.dtypes)
    print("\n类别分布:")
    print(df['class'].value_counts())

    # Split into features (all columns but 'class') and labels.
    X = df.drop('class', axis=1).values
    y = df['class'].values
    return X, y
# 任务2: 从scikit-learn直接加载iris数据集
def load_iris_with_sklearn():
    """Load the iris dataset bundled with scikit-learn and print a summary.

    Prints the feature-array shape, feature names, target names, the first
    five samples with their labels, and the number of samples per class.

    Returns:
        tuple: ``(X, y)`` taken from the ``data`` and ``target`` attributes
        of the object returned by ``load_iris()``.
    """
    iris = load_iris()
    X, y = iris.data, iris.target

    print("\n2. 从scikit-learn加载的iris数据集:")
    print("数据集形状:", X.shape)
    print("特征名称:", iris.feature_names)
    print("目标类别:", iris.target_names)
    print("前5个样本特征:")
    print(X[:5])
    print("前5个样本标签:")
    print(y[:5])

    print("\n类别分布:")
    unique, counts = np.unique(y, return_counts=True)
    # Each unique label value is used as an index into target_names.
    for u, c in zip(unique, counts):
        print(f"{iris.target_names[u]}: {c}个样本")

    return X, y
# 任务3: 实现五折交叉验证进行模型训练
def perform_cross_validation(X, y):
    """Evaluate a random forest on ``(X, y)`` with five-fold cross-validation.

    First runs a manual loop over a shuffled ``KFold`` split, printing
    accuracy and macro-averaged precision, recall and F1 for every fold and
    their means. Then prints the mean and standard deviation of the same
    four metrics as computed by scikit-learn's own cross-validation helper.

    Args:
        X: Feature array; indexed with the integer index arrays produced by
            ``KFold.split``.
        y: Label array, indexed the same way as ``X``.

    Returns:
        None. All results are printed.
    """
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

    # Shuffled five-fold split with a fixed seed so runs are repeatable.
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    print("\n3. 五折交叉验证结果:")

    # Per-fold metric values, averaged after the loop.
    all_accuracies = []
    all_precisions = []
    all_recalls = []
    all_f1s = []

    for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Train on this fold's training part and predict its held-out part.
        rf_classifier.fit(X_train, y_train)
        y_pred = rf_classifier.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='macro')
        recall = recall_score(y_test, y_pred, average='macro')
        f1 = f1_score(y_test, y_pred, average='macro')

        all_accuracies.append(accuracy)
        all_precisions.append(precision)
        all_recalls.append(recall)
        all_f1s.append(f1)

        print(f" 折 {fold}:")
        print(f" 准确度: {accuracy:.4f}")
        print(f" 精度: {precision:.4f}")
        print(f" 召回率: {recall:.4f}")
        print(f" F1值: {f1:.4f}")

    avg_accuracy = np.mean(all_accuracies)
    avg_precision = np.mean(all_precisions)
    avg_recall = np.mean(all_recalls)
    avg_f1 = np.mean(all_f1s)

    print("\n4. 平均评估指标:")
    print(f" 平均准确度: {avg_accuracy:.4f}")
    print(f" 平均精度: {avg_precision:.4f}")
    print(f" 平均召回率: {avg_recall:.4f}")
    print(f" 平均F1值: {avg_f1:.4f}")

    print("\n5. 使用sklearn的cross_val_score验证:")
    # One cross_validate call scores all four metrics from the same five
    # fits, instead of refitting the forest once per metric.
    # NOTE(review): cv=5 lets scikit-learn choose its own splitter rather
    # than reusing `kf`, so these numbers need not match the manual loop
    # above — confirm that is intended.
    cv_results = cross_validate(
        rf_classifier,
        X,
        y,
        cv=5,
        scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
    )
    cv_accuracy = cv_results['test_accuracy']
    cv_precision = cv_results['test_precision_macro']
    cv_recall = cv_results['test_recall_macro']
    cv_f1 = cv_results['test_f1_macro']

    print(f" 交叉验证准确度: {cv_accuracy.mean():.4f} ± {cv_accuracy.std():.4f}")
    print(f" 交叉验证精度: {cv_precision.mean():.4f} ± {cv_precision.std():.4f}")
    print(f" 交叉验证召回率: {cv_recall.mean():.4f} ± {cv_recall.std():.4f}")
    print(f" 交叉验证F1值: {cv_f1.mean():.4f} ± {cv_f1.std():.4f}")
# 主函数
def main():
    """Run the iris experiment: both loading tasks, then cross-validation.

    Task 1 (pandas loading) is best-effort: any exception it raises is
    printed and the run continues. Tasks 2-4 use the dataset loaded from
    scikit-learn.
    """
    print("===== Iris数据集实验 =====")
    print("\n实验目的:熟悉Python基本操作,掌握数据读写与模型评估")

    # Task 1: load the dataset with pandas.
    print("\n" + "="*40)
    print("任务1: 使用pandas从本地读取iris数据集")
    try:
        # The return value is not used below, so it is not bound.
        load_iris_with_pandas()
    except Exception as e:
        print(f"使用pandas加载数据集时出错: {e}")
        print("继续执行下一步...")

    # Task 2: load the dataset from scikit-learn.
    print("\n" + "="*40)
    print("任务2: 从scikit-learn直接加载iris数据集")
    X_skl, y_skl = load_iris_with_sklearn()

    # Tasks 3 and 4: cross-validation and metric computation.
    print("\n" + "="*40)
    print("任务3-4: 五折交叉验证与模型评估指标计算")
    perform_cross_validation(X_skl, y_skl)

    print("\n" + "="*40)
    print("实验完成!")
    print("注:本实验使用随机森林分类器(RandomForestClassifier)进行模型训练")
    print(" 并通过五折交叉验证计算了模型的准确度、精度、召回率和F1值")
# Script entry point: run main() and map the outcome to a process exit code.
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        # Interrupted by the user (Ctrl+C): exit with status 0.
        print("\n程序被用户中断")
        sys.exit(0)
    except Exception as e:
        # Any other failure: report it and exit with status 1.
        print(f"程序执行出错: {e}")
        sys.exit(1)