# PCA dimensionality-reduction algorithm (PCA降维算法)
def pca(x, k=0, percent=0.9):
    """
    Principal component analysis via eigen-decomposition of the scatter matrix.

    :param x: data matrix of shape (m, n); m samples, n features
    :param k: number of components to keep; when 0 (the default) the number
              is chosen from ``percent`` instead
    :param percent: fraction of the total eigenvalue mass that the kept
                    components must strictly exceed (only used when k is 0)
    :return: array of shape (n, d) whose columns are the leading
             eigenvectors, ordered by decreasing eigenvalue
    """
    x = np.asarray(x)
    n_features = x.shape[1]

    # Centre every feature, then build the (n, n) scatter matrix. It is left
    # unnormalised: a constant factor does not change the eigenvectors or the
    # eigenvalue ratios.
    centered = x - np.mean(x, axis=0)
    scatter = np.dot(centered.T, centered)

    # The scatter matrix is symmetric, so eigh applies; it guarantees real
    # eigenvalues and orthonormal eigenvectors.
    eigval, eigvec = np.linalg.eigh(scatter)

    # Eigenvectors are the COLUMNS of eigvec, so the ordering by decreasing
    # eigenvalue has to be applied along axis 1.
    order = np.argsort(-eigval)
    eigval_sort = eigval[order]
    eigvec_sort = eigvec[:, order]

    # An explicit component count takes precedence over the ratio threshold.
    if k > 0:
        return eigvec_sort[:, :min(k, n_features)]

    total = np.sum(eigval_sort)
    if total <= 0:
        # Zero-variance data: the ratios are undefined, keep every axis.
        return eigvec_sort

    cumulative = np.cumsum(eigval_sort / total)
    above = cumulative > percent
    # Fall back to all components when the threshold is never strictly
    # exceeded (e.g. percent >= 1) instead of implicitly returning None.
    n_keep = int(np.argmax(above)) + 1 if above.any() else n_features
    return eigvec_sort[:, :n_keep]
# Template-matching (nearest-template) algorithm (模板匹配算法)
def neartemplet(x_train, y_train, sample):
    """
    Nearest-template classifier.

    :param x_train: training set, one row per sample
    :param y_train: training labels, indexable by training-row position
    :param sample: the sample to classify
    :return: label of the training row with the smallest squared Euclidean
             distance to ``sample`` (the first such row on ties)
    """
    # One squared distance per training row, in row order.
    squared_dists = [np.sum((sample - row) ** 2) for row in x_train]
    return y_train[np.argmin(squared_dists)]
# Splitting the data set (划分数据集)
def train_test_split(x, y, ratio=3):
    """
    Randomly partition a data set into a training part and a test part.

    :param x: data matrix of shape (m, n); m samples, n features
    :param y: labels, one per row of x
    :param ratio: train:test size ratio; the default 3 means 3:1
    :return: x_train, y_train, x_test, y_test
    """
    total = x.shape[0]
    n_train = int(total * ratio / (1 + ratio))
    # Training rows are drawn without replacement, in the order sampled.
    chosen = random.sample(range(total), n_train)
    train_part = (x[chosen, :], y[chosen])
    # Whatever was not drawn forms the test set, in original row order.
    test_part = tuple(np.delete(arr, chosen, axis=0) for arr in (x, y))
    return train_part + test_part
# Test script (测试代码)
from sklearn import datasets
from Include.chapter3 import function
import numpy as np
# Load the data.
digits = datasets.load_digits()
x, y = digits.data, digits.target
# Split into a training set and a test set.
x_train, y_train, x_test, y_test = function.train_test_split(x, y)
# Pick one random test row to classify.
testId = np.random.randint(0, x_test.shape[0])
sample = x_test[testId, :]
eigVec = function.pca(x_train)
# Centre with the TRAINING-set mean: that is the mean pca() subtracts when it
# fits the axes, and it keeps test data out of the preprocessing. keepdims
# yields a (1, n_features) row without hard-coding the feature count.
mean = np.mean(x_train, axis=0, keepdims=True)
# Remove the mean.
x_train = x_train - mean
sample = sample - mean
# Project onto the principal axes.
x_train = np.dot(x_train, eigVec)
sample = np.dot(sample, eigVec)
# Template matching.
ans = function.neartemplet(x_train, y_train, sample)
print(ans == y_test[testId])
# Sample output: True