from sklearn import datasets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Load the Iris dataset and keep the feature matrix and the class labels
# in separate arrays.
iris = datasets.load_iris()
iris_x = iris.data
iris_y = iris.target

# Print the label shape and the feature shape, separated by a divider line.
print(iris_y.shape)
print('*' * 50)
print(iris_x.shape)
(150,)
**************************************************
(150, 4)
from sklearn import decomposition

# Create a PCA estimator with every parameter left at its default, then
# display it so the notebook shows the effective settings.
pca = decomposition.PCA()
pca
PCA(copy=True, iterated_power='auto', n_components=None, random_state=None, svd_solver='auto', tol=0.0, whiten=False)
# Fit PCA with its default parameter settings and project the data onto
# the principal components; display the shape of the projected array.
iris_pca = pca.fit_transform(iris_x)
iris_pca.shape
(150, 4)
# Fraction of the total variance explained by each of the n (4) retained
# components.
pca.explained_variance_ratio_
array([0.92461621, 0.05301557, 0.01718514, 0.00518309])
# Total of the per-component variance ratios for the fitted PCA.
np.sum(pca.explained_variance_ratio_)
1.0
# Set the number of principal components to 2, refit, and display the
# shape of the reduced data.
pca = decomposition.PCA(n_components=2)
iris_x_2 = pca.fit_transform(iris_x)
iris_x_2.shape
(150, 2)
# Fraction of the total variance explained by each retained component.
pca.explained_variance_ratio_
array([0.92461621, 0.05301557])
# Combined share of the variance captured by the retained components.
np.sum(pca.explained_variance_ratio_)
0.9776317750248034
# A fractional n_components asks PCA to keep as many components as are
# needed to explain at least that share of the variance (here 99%).
pca = decomposition.PCA(n_components=0.99)
iris_x_3 = pca.fit(iris_x).transform(iris_x)
pca.explained_variance_ratio_
array([0.92461621, 0.05301557, 0.01718514])
# Combined share of the variance captured by the retained components.
np.sum(pca.explained_variance_ratio_)
0.9948169145498101
%matplotlib inlinefig = plt.figure(figsize=(20,7)) ax = fig.add_subplot(121) ax.scatter(iris_x[:,0],iris_x[:,1],c=iris_y,s=40) ax.set_title('Before PCA')ax2 = fig.add_subplot(122) ax2.scatter(iris_x_3[:,0],iris_x_3[:,1],c=iris_y,s=40) ax2.set_title('After PCA')
<matplotlib.text.Text at 0x188d58e4ac8>