Mapping High-Dimensional Data to Low-Dimensional Data

We obtain k principal components, each of which is an n-dimensional vector. The goal of principal component analysis is to transform the samples from one coordinate system to another. We keep only the first k principal components (each axis is described by n elements) because these first k directions carry the most important information. Stacking them row by row forms the matrix $W_k$, which is k × n.

Multiplying sample 1 by each of the k rows of $W_k$ gives the k-dimensional vector that represents sample 1 in the $W_k$ coordinate system. Since k is smaller than n, this completes the mapping of a sample from n dimensions to k dimensions (from high to low).
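In matrix form (this is exactly what the transform method below computes): if $X$ is the $m \times n$ data matrix and $W_k$ is the $k \times n$ matrix whose rows are the unit direction vectors of the first k principal components, the forward mapping is

$$X_k = X \cdot W_k^T$$

an $m \times k$ matrix in which row $i$ holds the coordinates of sample $i$ along the k principal axes.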

Mapping from High to Low Dimensions and Back

It is also possible to map back from the low-dimensional space to the high-dimensional one, but the information lost during dimensionality reduction cannot be recovered.

Once the k principal components have been found and assembled into the matrix $W_k$ — the unit direction vectors of the k axes — we can map data from high dimensions to low dimensions, and also map low-dimensional data back into the high-dimensional space.
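The reverse mapping multiplies by $W_k$ itself, which is what inverse_transform below computes:

$$X_{restore} = X_k \cdot W_k$$

The result is $m \times n$ again, but the restored samples are confined to the k-dimensional subspace spanned by the principal components, which is why the discarded information does not come back.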

The encapsulated PCA class provides fit, transform, and inverse_transform methods:

import numpy as np

class PCA:
    def __init__(self, n_components):
        """初始化PCA模型"""
        assert n_components >= 1, "n_components must be bigger than 0."
        self.n_components = n_components
        self.components_ = None

    def fit(self, X, eta=0.01, n_iters=1e4):
        """Fit the first n_components principal components of X by gradient ascent
        (learning rate eta, at most n_iters iterations per component)."""
        assert self.n_components <= X.shape[1], "n_components must not be greater than the number of features of X."
        def f(X, w):
            # Objective: variance of the samples projected onto direction w
            return np.sum(X.dot(w) ** 2) / len(X)
        def df(X, w):
            # Gradient of the objective with respect to w
            return X.T.dot(X.dot(w)) * 2 / len(X)
        def direction(w):
            # Normalize w to a unit vector
            return w / np.linalg.norm(w)
        def demean(X):
            # Center the data: subtract each feature's mean so every feature has zero mean
            return X - np.mean(X, axis=0)
        def first_component(X, initial_w, eta=0.001, n_iters=1e4, epsilon=1e-8):
            # Find a single principal component by gradient ascent on f
            w = direction(initial_w)
            i_iter = 0
            while i_iter < n_iters:
                last_w = w
                gradient = df(X, w)
                w = w + eta * gradient   # ascend: we are maximizing the projected variance
                w = direction(w)         # re-normalize so w stays a unit vector
                if np.abs(f(X, w) - f(X, last_w)) < epsilon:
                    break
                i_iter += 1
            return w

        X_pca = demean(X)
        self.components_ = np.empty(shape=(self.n_components, X.shape[1]))
        for i in range(self.n_components):
            initial_w = np.random.random(X_pca.shape[1])
            w = first_component(X_pca, initial_w, eta, n_iters)
            self.components_[i, :] = w
            # Deflate: remove the projection onto w before searching for the next component
            X_pca = X_pca - (X_pca.dot(w)).reshape(-1, 1) * w
        return self

    def transform(self, X):
        """将给定的X映射到各主成分分量中"""
        assert self.components_.shape[1] == X.shape[1], "n_components must be equal to the feature number of X."
        return X.dot(self.components_.T)

    def inverse_transform(self, X_k):
        """将给定的X_k反向映射到原来的特征空间"""
        assert self.components_.shape[0] == X_k.shape[1], "n_components must be equal to the feature number of X_k."
        return X_k.dot(self.components_)

    def first_n_components(self, n, X, eta=0.001, n_iters=1e4, epsilon=1e-8):
        """Standalone variant of fit: return the first n components of X as a list."""
        def first_component(X, initial_w, eta=0.001, n_iters=1e4, epsilon=1e-8):
            def f(X, w):
                return np.sum(X.dot(w) ** 2) / len(X)

            def df(X, w):
                return X.T.dot(X.dot(w)) * 2 / len(X)

            def direction(w):  # normalize w to a unit vector
                return w / np.linalg.norm(w)

            w = direction(initial_w)
            i_iter = 0

            while (i_iter < n_iters):
                last_w = w
                gradient = df(X, w)
                w = w + eta * gradient
                w = direction(w)
                if np.abs(f(X, w) - f(X, last_w)) < epsilon:
                    break
                i_iter += 1
            return w

        def demean(X):  # center the data by subtracting each feature's mean
            return X - np.mean(X, axis=0)

        X_pca = X.copy()
        res = []
        X_pca = demean(X_pca)
        for i in range(n):
            initial_w = np.random.random(X_pca.shape[1])
            w = first_component(X_pca, initial_w)
            res.append(w)
            X_pca = X_pca - (X_pca.dot(w)).reshape(-1, 1) * w

        return res

    def __repr__(self):
        return "PCA(n_components = %d)" % self.n_components

Test code:

import numpy as np
import matplotlib.pyplot as plt
import sys
sys.path.append(r'C:\N-20KEPC0Y7KFA-Data\junhuawa\Documents\00-Play-with-ML-in-Python\Jupyter')
import playML
from playML.pca import PCA
# Synthetic data: feature 1 is a noisy linear function of feature 0
X = np.empty((100, 2))
X[:, 0] = np.random.uniform(0, 100, size=100)
X[:, 1] = 0.75 * X[:, 0] + 3.0 + np.random.normal(0, 10, size=100)

pca = PCA(n_components=1)
pca.fit(X)
X_reduction = pca.transform(X)                  # shape (100, 1): samples in the 1-D component space
X_restore = pca.inverse_transform(X_reduction)  # shape (100, 2): mapped back to the original space
X_restore.shape
plt.scatter(X[:, 0], X[:, 1])
plt.scatter(X_restore[:, 0], X_restore[:, 1], color='r')

Although the samples are restored to two dimensions, the information lost during the reduction is clearly gone: the restored points all lie along the first principal component instead of reproducing the original scatter.
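
For reference (not in the original notes), scikit-learn's decomposition.PCA exposes the same fit / transform / inverse_transform workflow, so the experiment above can be reproduced with it. scikit-learn handles the demeaning internally and the signs of the components may differ from the hand-rolled version, but the restored points again collapse onto a single line:

from sklearn.decomposition import PCA as SKPCA

sk_pca = SKPCA(n_components=1)
sk_pca.fit(X)
X_reduction_sk = sk_pca.transform(X)
X_restore_sk = sk_pca.inverse_transform(X_reduction_sk)
plt.scatter(X[:, 0], X[:, 1])
plt.scatter(X_restore_sk[:, 0], X_restore_sk[:, 1], color='r')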