学习曲线

学习曲线表示:随着训练样本的逐渐增多,算法训练出的模型在训练数据集和测试数据集上的表现能力的变化情况。 测试数据:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
# Fix the global NumPy seed so the synthetic data set below is reproducible.
np.random.seed(666)
# 100 one-dimensional feature values drawn uniformly between -3 and 3.
x = np.random.uniform(-3, 3, size=100)
# Reshape into a single-column 2-D array (100 rows, 1 column) for the estimators.
X = x.reshape(-1, 1)
# Quadratic ground truth 0.5*x^2 + x + 2 plus Gaussian noise (mean 0, std 1).
y = 0.5*x**2 + x + 2 + np.random.normal(0, 1, size=100)

线性回归学习曲线绘制:

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

def plot_learning_curves(X, y):
    """Plot the learning curve of a plain linear regression.

    The data is split once with ``train_test_split`` (``random_state=10``,
    default split proportions). For every prefix length ``m`` from 1 up to the
    size of the training split, a fresh ``LinearRegression`` is fitted on the
    first ``m`` training samples. Its RMSE on those ``m`` samples and on the
    whole test split is then drawn on the current matplotlib axes.

    Args:
        X: Feature matrix, one row per sample.
        y: Target values aligned with the rows of ``X``.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)

    sample_counts = list(range(1, len(X_train) + 1))
    train_mse = []
    test_mse = []
    for count in sample_counts:
        X_part = X_train[:count]
        y_part = y_train[:count]

        # A new estimator per step, trained only on the first `count` samples.
        regressor = LinearRegression()
        regressor.fit(X_part, y_part)

        train_mse.append(mean_squared_error(y_part, regressor.predict(X_part)))
        test_mse.append(mean_squared_error(y_test, regressor.predict(X_test)))

    # Errors are collected as MSE; the square root turns them into RMSE.
    curves = (
        (train_mse, 'r-+', 2, "train"),
        (test_mse, 'b-', 3, "test"),
    )
    for mse_values, line_format, width, name in curves:
        plt.plot(sample_counts, np.sqrt(mse_values), line_format,
                 linewidth=width, label=name)
    plt.legend()
# Draw the learning curve of plain linear regression on the synthetic data.
plot_learning_curves(X, y)

多项式回归学习曲线绘制:

def plot_learning_curves_with_algo(X, y, algo=None):
    """Plot the learning curve of an arbitrary regressor.

    The data is split once with ``train_test_split`` (``random_state=10``,
    default split proportions). For every prefix length ``m`` from 1 up to the
    size of the training split, ``algo`` is refitted on the first ``m``
    training samples. Its RMSE on those ``m`` samples and on the whole test
    split is then drawn on the current matplotlib axes.

    Args:
        X: Feature matrix, one row per sample.
        y: Target values aligned with the rows of ``X``.
        algo: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            refitted in place on every step, so after the call it holds the
            fit on the full training split. When omitted, a new
            ``LinearRegression`` is created for this call.
    """
    # Build the default estimator per call. A `LinearRegression()` default in
    # the signature would be created once at definition time and then shared
    # (and refitted) by every call that omits `algo`.
    if algo is None:
        algo = LinearRegression()

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)
    n_train = len(X_train)
    sizes = np.arange(1, n_train + 1)

    train_errors, test_errors = [], []
    for m in sizes:
        X_sub, y_sub = X_train[:m], y_train[:m]
        algo.fit(X_sub, y_sub)
        train_errors.append(mean_squared_error(y_sub, algo.predict(X_sub)))
        test_errors.append(mean_squared_error(y_test, algo.predict(X_test)))

    # Errors are collected as MSE; the square root turns them into RMSE.
    plt.plot(sizes, np.sqrt(train_errors), 'r-+', linewidth=2, label="train")
    plt.plot(sizes, np.sqrt(test_errors), 'b-', linewidth=3, label="test")
    plt.legend()
    # Fixed y-range of [0, 4] so curves of different models share one scale.
    plt.axis([0, n_train + 1, 0, 4])

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

def PolynomialRegression(degree):
    """Return a pipeline that performs polynomial regression.

    The pipeline chains three steps: polynomial feature expansion up to
    ``degree``, standardisation of the expanded features, and an ordinary
    linear regression on the result.

    Args:
        degree: Polynomial degree passed to ``PolynomialFeatures``.
    """
    steps = [
        ("poly", PolynomialFeatures(degree=degree)),
        ("std_scaler", StandardScaler()),
        ("lin_reg", LinearRegression()),
    ]
    return Pipeline(steps)

# Learning curve for a degree-2 polynomial model (same degree as the
# quadratic used to generate y).
plot_learning_curves_with_algo(X, y, PolynomialRegression(degree=2))
# Learning curve for a degree-20 polynomial model.
# NOTE(review): no new figure is created between these two calls, so when run
# as one script both curves land on the same axes; presumably each call was a
# separate notebook cell - confirm.
plot_learning_curves_with_algo(X, y, PolynomialRegression(degree=20))

欠拟合的情况下,训练数据集和测试数据集上的误差都较大,说明模型本身选得不对(过于简单,无法刻画数据的规律)。

过拟合的情况下,训练数据集的误差和最佳拟合时的误差差不多,但是测试数据集的误差很大,测试集的误差和训练集的误差相差较远,说明模型的泛化能力不够好!