第204集:监督学习基础
1. 监督学习概述
1.1 什么是监督学习
监督学习(Supervised Learning)是机器学习中最常见和应用最广泛的一种学习范式。它通过使用带有标签的训练数据来学习输入(特征)和输出(标签)之间的映射关系,从而能够对新的、未见过的数据进行预测。
1.2 监督学习的基本要素
- 训练数据:包含输入特征和对应的标签
- 模型:用于学习输入与输出之间映射关系的算法
- 损失函数:衡量模型预测结果与真实标签之间的差异
- 优化算法:用于调整模型参数以最小化损失函数
- 评估指标:用于衡量模型在测试数据上的性能
1.3 监督学习的主要类型
- 分类(Classification):预测离散的类别标签
- 回归(Regression):预测连续的数值输出
- 序列预测(Sequence Prediction):预测时间序列或序列数据
- 结构化预测(Structured Prediction):预测结构化的输出
2. 分类算法
2.1 逻辑回归(Logistic Regression)
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the iris dataset and unpack features / labels.
data = load_iris()
X, y = data.data, data.target

# Hold out 20% of the samples for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a logistic-regression classifier (max_iter raised so lbfgs converges).
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)

# Predict on the held-out split and measure accuracy.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"逻辑回归准确率: {accuracy:.2f}")
2.2 决策树(Decision Tree)
from sklearn.tree import DecisionTreeClassifier
import pandas as pd  # Fix: pd was used below without being imported

# Fit a decision-tree classifier on the training split.
tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(X_train, y_train)

# Predict on the test split.
y_pred_tree = tree_model.predict(X_test)

# Evaluate accuracy.
accuracy_tree = accuracy_score(y_test, y_pred_tree)
print(f"决策树准确率: {accuracy_tree:.2f}")

# Rank features by the impurity-based importances learned by the tree.
feature_importance = pd.DataFrame({
    'feature': data.feature_names,
    'importance': tree_model.feature_importances_
}).sort_values('importance', ascending=False)
print("特征重要性:")
print(feature_importance)
2.3 随机森林(Random Forest)
from sklearn.ensemble import RandomForestClassifier

# Build a 100-tree random forest and fit it on the training split
# (fit returns the estimator itself, so we can chain the call).
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out test split.
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"随机森林准确率: {accuracy_rf:.2f}")
2.4 支持向量机(SVM)
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# SVMs are scale-sensitive: standardise features using statistics
# computed on the training split only (no test-set leakage).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train an RBF-kernel support vector classifier on the scaled data.
svm_model = SVC(kernel='rbf', random_state=42).fit(X_train_scaled, y_train)

# Evaluate on the scaled test split.
y_pred_svm = svm_model.predict(X_test_scaled)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"SVM准确率: {accuracy_svm:.2f}")
2.5 K最近邻(KNN)
from sklearn.neighbors import KNeighborsClassifier

# 3-nearest-neighbour classifier; reuses the standardised splits
# prepared for the SVM, since KNN is also distance-based.
knn_model = KNeighborsClassifier(n_neighbors=3).fit(X_train_scaled, y_train)

# Predict and score on the scaled test split.
y_pred_knn = knn_model.predict(X_test_scaled)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(f"KNN准确率: {accuracy_knn:.2f}")
3. 回归算法
3.1 线性回归(Linear Regression)
from sklearn.linear_model import LinearRegression
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error, r2_score

# Load a regression dataset.
# Fix: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2;
# fetch_california_housing is the documented replacement.
housing = fetch_california_housing()
X_reg, y_reg = housing.data, housing.target

# 80/20 train/test split with a fixed seed.
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

# Ordinary least-squares linear regression.
lr_model = LinearRegression()
lr_model.fit(X_train_reg, y_train_reg)

# Predict and evaluate on the held-out split.
y_pred_lr = lr_model.predict(X_test_reg)
mse_lr = mean_squared_error(y_test_reg, y_pred_lr)
r2_lr = r2_score(y_test_reg, y_pred_lr)
print(f"线性回归MSE: {mse_lr:.2f}")
print(f"线性回归R²: {r2_lr:.2f}")
3.2 岭回归(Ridge Regression)
from sklearn.linear_model import Ridge

# Ridge regression: least squares with an L2 penalty of strength alpha.
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train_reg, y_train_reg)

# Predict on the held-out split and compute MSE / R².
y_pred_ridge = ridge_model.predict(X_test_reg)
mse_ridge = mean_squared_error(y_test_reg, y_pred_ridge)
r2_ridge = r2_score(y_test_reg, y_pred_ridge)
print(f"岭回归MSE: {mse_ridge:.2f}")
print(f"岭回归R²: {r2_ridge:.2f}")
3.3 Lasso回归
from sklearn.linear_model import Lasso

# Lasso regression: least squares with an L1 penalty, which drives
# some coefficients exactly to zero (implicit feature selection).
lasso_model = Lasso(alpha=0.1)
lasso_model.fit(X_train_reg, y_train_reg)

# Predict on the held-out split and compute MSE / R².
y_pred_lasso = lasso_model.predict(X_test_reg)
mse_lasso = mean_squared_error(y_test_reg, y_pred_lasso)
r2_lasso = r2_score(y_test_reg, y_pred_lasso)
print(f"Lasso回归MSE: {mse_lasso:.2f}")
print(f"Lasso回归R²: {r2_lasso:.2f}")
3.4 决策树回归
from sklearn.tree import DecisionTreeRegressor

# Unpruned decision-tree regressor (fixed seed for reproducible splits).
tree_reg_model = DecisionTreeRegressor(random_state=42).fit(X_train_reg, y_train_reg)

# Predict on the held-out split and compute MSE / R².
y_pred_tree_reg = tree_reg_model.predict(X_test_reg)
mse_tree_reg = mean_squared_error(y_test_reg, y_pred_tree_reg)
r2_tree_reg = r2_score(y_test_reg, y_pred_tree_reg)
print(f"决策树回归MSE: {mse_tree_reg:.2f}")
print(f"决策树回归R²: {r2_tree_reg:.2f}")
3.5 随机森林回归
from sklearn.ensemble import RandomForestRegressor

# 100-tree random-forest regressor: averages many decision trees
# to reduce the variance of a single tree.
rf_reg_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_reg, y_train_reg)

# Predict on the held-out split and compute MSE / R².
y_pred_rf_reg = rf_reg_model.predict(X_test_reg)
mse_rf_reg = mean_squared_error(y_test_reg, y_pred_rf_reg)
r2_rf_reg = r2_score(y_test_reg, y_pred_rf_reg)
print(f"随机森林回归MSE: {mse_rf_reg:.2f}")
print(f"随机森林回归R²: {r2_rf_reg:.2f}")
4. 模型评估指标
4.1 分类评估指标
混淆矩阵(Confusion Matrix)
from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix for the logistic-regression predictions:
# rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)
print("混淆矩阵:")
print(cm)

# Per-class precision / recall / F1 summary table.
print("\n分类报告:")
print(classification_report(y_test, y_pred))
准确率(Accuracy)
# Fraction of test samples whose predicted class matches the true class.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"准确率: {accuracy:.2f}")
精确率(Precision)、召回率(Recall)、F1分数(F1-Score)
from sklearn.metrics import precision_score, recall_score, f1_score

# Weighted averaging: per-class scores weighted by class support,
# appropriate for the multi-class iris labels.
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"精确率: {precision:.2f}")
print(f"召回率: {recall:.2f}")
print(f"F1分数: {f1:.2f}")
ROC曲线和AUC值
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

# ROC/AUC are defined for binary problems, so binarise the 3-class
# labels one-vs-rest before computing a curve per class.
y_test_bin = label_binarize(y_test, classes=[0, 1, 2])
# Compute one ROC curve per class below.
for i in range(3):
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], model.predict_proba(X_test)[:, i])
    roc_auc = auc(fpr, tpr)
    print(f"类别{i}的AUC值: {roc_auc:.2f}")
4.2 回归评估指标
均方误差(MSE)
# Mean squared error: average of squared residuals (penalises large errors).
mse = mean_squared_error(y_true=y_test_reg, y_pred=y_pred_lr)
print(f"均方误差: {mse:.2f}")
均方根误差(RMSE)
# Root mean squared error, in the same units as the target.
# Fix: the squared= parameter was deprecated in scikit-learn 1.4 and
# removed in 1.6; taking the square root directly is version-proof.
rmse = mean_squared_error(y_test_reg, y_pred_lr) ** 0.5
print(f"均方根误差: {rmse:.2f}")
平均绝对误差(MAE)
from sklearn.metrics import mean_absolute_error

# Mean absolute error: average |y_true - y_pred|, robust to outliers.
mae = mean_absolute_error(y_true=y_test_reg, y_pred=y_pred_lr)
print(f"平均绝对误差: {mae:.2f}")
R²分数
# Coefficient of determination: fraction of target variance explained.
r2 = r2_score(y_true=y_test_reg, y_pred=y_pred_lr)
print(f"R²分数: {r2:.2f}")
5. 过拟合与欠拟合
5.1 过拟合(Overfitting)
- 定义:模型在训练数据上表现很好,但在测试数据上表现很差
- 原因:模型过于复杂,学习了训练数据中的噪声和随机波动
- 解决方法:
- 增加训练数据量
- 使用正则化技术(如L1、L2正则化)
- 减小模型复杂度
- 使用交叉验证
- 特征选择
5.2 欠拟合(Underfitting)
- 定义:模型在训练数据和测试数据上都表现很差
- 原因:模型过于简单,无法捕捉数据中的模式
- 解决方法:
- 增加模型复杂度
- 添加更多特征
- 减少正则化强度
5.3 交叉验证(Cross-Validation)
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the logistic-regression model on the
# full iris dataset; yields one accuracy score per fold.
scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print(f"5折交叉验证准确率: {scores}")
print(f"平均准确率: {scores.mean():.2f}")
6. 特征工程在监督学习中的应用
6.1 特征选择
from sklearn.feature_selection import SelectKBest, chi2

# Keep the 3 features with the highest chi-squared statistic w.r.t. y.
# (chi2 requires non-negative features, which holds for iris.)
selector = SelectKBest(chi2, k=3)
X_new = selector.fit_transform(X, y)
print(f"原始特征数: {X.shape[1]}")
print(f"选择后特征数: {X_new.shape[1]}")
print(f"选择的特征索引: {selector.get_support(indices=True)}")
6.2 特征缩放
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Standardisation: zero mean, unit variance per feature.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Min-max normalisation: rescale each feature into [0, 1].
minmax_scaler = MinMaxScaler()
X_minmax = minmax_scaler.fit_transform(X)
6.3 特征编码
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Label encoding: map class labels to integers 0..n_classes-1.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# One-hot encode column 0 (assumed categorical) and pass the
# remaining columns through unchanged.
ct = ColumnTransformer([
    ('onehot', OneHotEncoder(), [0])
], remainder='passthrough')
X_encoded = ct.fit_transform(X)
7. 实践案例:信用卡欺诈检测
import pandas as pd
import numpy as np  # Fix: np was used below without being imported
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

# Load the real data (assumed already downloaded):
# df = pd.read_csv('creditcard.csv')

# For the demo, build a synthetic, highly imbalanced dataset:
# 9,950 normal transactions vs 50 frauds.
np.random.seed(42)
n_samples = 10000
n_fraud = 50

# Normal transactions: features centred at 0, small amounts.
normal_transactions = pd.DataFrame({
    'V1': np.random.normal(0, 1, n_samples - n_fraud),
    'V2': np.random.normal(0, 1, n_samples - n_fraud),
    'V3': np.random.normal(0, 1, n_samples - n_fraud),
    'Amount': np.random.uniform(0, 500, n_samples - n_fraud),
    'Class': 0
})

# Fraudulent transactions: shifted feature means and larger amounts.
fraud_transactions = pd.DataFrame({
    'V1': np.random.normal(-2, 1, n_fraud),
    'V2': np.random.normal(-2, 1, n_fraud),
    'V3': np.random.normal(-2, 1, n_fraud),
    'Amount': np.random.uniform(100, 1000, n_fraud),
    'Class': 1
})

# Combine the two groups into one frame.
df = pd.concat([normal_transactions, fraud_transactions], ignore_index=True)

# Split features from the label column.
X = df.drop('Class', axis=1)
y = df['Class']

# Rebalance classes with SMOTE over-sampling.
# NOTE(review): imblearn (imbalanced-learn) is a third-party dependency.
# Over-sampling BEFORE the train/test split leaks synthetic neighbours of
# test points into training — fine for a demo, but in practice apply
# SMOTE to the training split only.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# 80/20 train/test split of the balanced data.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42
)

# Fit a random forest and predict on the test split.
fraud_model = RandomForestClassifier(n_estimators=100, random_state=42)
fraud_model.fit(X_train, y_train)
y_pred = fraud_model.predict(X_test)

# Report confusion matrix and per-class metrics.
print("混淆矩阵:")
print(confusion_matrix(y_test, y_pred))
print("\n分类报告:")
print(classification_report(y_test, y_pred))
8. 总结
监督学习是机器学习中应用最广泛的范式,通过学习带有标签的训练数据来预测新数据的标签或数值。本集我们学习了:
- 分类算法:逻辑回归、决策树、随机森林、SVM、KNN等
- 回归算法:线性回归、岭回归、Lasso回归、决策树回归、随机森林回归等
- 模型评估指标:混淆矩阵、准确率、精确率、召回率、F1分数、MSE、RMSE、MAE、R²等
- 过拟合与欠拟合:原因及解决方法
- 特征工程:特征选择、缩放、编码等
- 实践案例:信用卡欺诈检测
监督学习在各个领域都有广泛的应用,如金融风险评估、医疗诊断、图像识别、自然语言处理等。选择合适的算法和评估指标对于解决实际问题至关重要。
下一集我们将学习无监督学习基础,探索不需要标签的数据挖掘方法。