学习曲线
学习曲线表示:随着训练样本数量的逐渐增多,算法训练出的模型在训练数据集和测试数据集上的表现(误差)如何变化。 测试数据:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
# Seed NumPy's global generator so the synthetic sample is reproducible.
np.random.seed(666)

# 100 inputs drawn uniformly from [-3, 3); X is the same data as a single
# feature column of shape (100, 1).
x = np.random.uniform(low=-3, high=3, size=100)
X = x[:, np.newaxis]

# Quadratic target 0.5*x^2 + x + 2 plus Gaussian noise (mean 0, std 1).
y = 0.5 * x**2 + x + 2 + np.random.normal(loc=0, scale=1, size=100)
线性回归学习曲线绘制:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
def plot_learning_curves(X, y):
    """Plot train/test RMSE of linear regression against training-set size.

    The data is split once with ``train_test_split(X, y, random_state=10)``.
    For every prefix length from 1 up to the number of training samples, a
    fresh ``LinearRegression`` is fitted on that prefix of the training split
    and scored (mean squared error) on both the prefix itself and the whole
    test split. The square roots of those errors are then plotted.

    Draws on the current pyplot figure and returns nothing.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)
    sizes = list(range(1, len(X_train) + 1))

    train_mse = []
    test_mse = []
    for size in sizes:
        X_head, y_head = X_train[:size], y_train[:size]
        reg = LinearRegression()
        reg.fit(X_head, y_head)
        train_mse.append(mean_squared_error(y_head, reg.predict(X_head)))
        test_mse.append(mean_squared_error(y_test, reg.predict(X_test)))

    # Errors are collected as MSE; the curves show RMSE.
    train_rmse = np.sqrt(train_mse)
    test_rmse = np.sqrt(test_mse)
    plt.plot(sizes, train_rmse, 'r-+', linewidth=2, label="train")
    plt.plot(sizes, test_rmse, 'b-', linewidth=3, label="test")
    plt.legend()
# Learning curve of plain linear regression on the noisy quadratic sample.
plot_learning_curves(X, y)
多项式回归学习曲线绘制:
def plot_learning_curves_with_algo(X, y, algo=None):
    """Plot train/test RMSE of an estimator against training-set size.

    The data is split once with ``train_test_split(X, y, random_state=10)``.
    For every prefix length from 1 up to the number of training samples,
    ``algo`` is refitted on that prefix of the training split and scored
    (mean squared error) on both the prefix itself and the whole test split.
    The square roots of those errors are then plotted.

    Args:
        X: Feature matrix passed to ``train_test_split``.
        y: Target values passed to ``train_test_split``.
        algo: Object exposing ``fit(X, y)`` and ``predict(X)``. It is refitted
            in place on every iteration. When omitted, a new
            ``LinearRegression`` is created for this call.

    Draws on the current pyplot figure and returns nothing.
    """
    # Build the default estimator per call: an instance used as the default
    # argument value would be created once at definition time and then shared
    # (and refitted) by every call that omits ``algo``.
    if algo is None:
        algo = LinearRegression()

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)
    train_sizes = list(range(1, len(X_train) + 1))

    train_errors, test_errors = [], []
    for m in train_sizes:
        algo.fit(X_train[:m], y_train[:m])
        y_train_predict = algo.predict(X_train[:m])
        y_test_predict = algo.predict(X_test)
        train_errors.append(mean_squared_error(y_train[:m], y_train_predict))
        test_errors.append(mean_squared_error(y_test, y_test_predict))

    # Errors are collected as MSE; the curves show RMSE.
    plt.plot(train_sizes, np.sqrt(train_errors), 'r-+', linewidth=2, label="train")
    plt.plot(train_sizes, np.sqrt(test_errors), 'b-', linewidth=3, label="test")
    plt.legend()
    # Fixed view: RMSE values above 4 fall outside the visible area, and
    # every model is drawn on the same vertical scale.
    plt.axis([0, len(X_train) + 1, 0, 4])
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
def PolynomialRegression(degree):
    """Return a pipeline that performs polynomial regression.

    The pipeline chains three steps, in order: polynomial feature expansion
    of the given ``degree`` ("poly"), feature standardisation
    ("std_scaler"), and ordinary linear regression ("lin_reg").
    """
    steps = [
        ("poly", PolynomialFeatures(degree=degree)),
        ("std_scaler", StandardScaler()),
        ("lin_reg", LinearRegression()),
    ]
    return Pipeline(steps)
# Learning curve of a degree-2 polynomial model, the same degree as the
# formula used to generate y.
plot_learning_curves_with_algo(X, y, PolynomialRegression(degree=2))
# Learning curve of a degree-20 polynomial model, a much higher degree than
# the formula used to generate y.
# NOTE(review): both calls draw on the current pyplot figure; if they run in
# the same script or notebook cell the curves overlay -- confirm whether a
# new figure is intended between them.
plot_learning_curves_with_algo(X, y, PolynomialRegression(degree=20))
欠拟合的情况下,训练数据集和测试数据集上的误差都比较大,说明所选的模型本身不合适(例如用线性回归去拟合二次关系的数据)。
过拟合的情况下,训练数据集上的误差和最佳拟合时的误差差不多,但是测试数据集上的误差很大;测试集误差与训练集误差相差较远,说明模型的泛化能力不够好!