TVP

# 用KNN识别MNIST手写字符实战

Hi， 好久不见，粉丝涨了不少，我要再不更新，估计要掉粉了，今天有时间把最近做的一些工作做个总结，我用KNN来识别MNIST手写字符，主要是代码部分，全部纯手写，没有借助机器学习的框架，希望对大家理解KNN有帮助。

https://github.com/Alvin2580du/KNN_mnist

-------------------------------------------------

import math
import os
from collections import Counter
from datetime import datetime
from functools import reduce

import numpy as np
import pandas as pd

def applyfuns(inputs):
    """Tag one raw line of the digit file.

    Lines longer than 10 characters are pixel rows and get the tag
    ``"data"``; short lines are the per-sample label, returned stripped.
    """
    if len(inputs) > 10:
        return "data"
    else:
        return inputs.strip()

def split_datasets(filename="./datasets/knn/digit-training.txt"):
    """Split the raw digit file into one 32-line text file per sample.

    Each sample in the raw file is 32 rows of pixel characters followed by
    a short label line; the chunks are written to
    ``./datasets/knn/<split>/<index>_<label>.txt``.
    """
    # "digit-training.txt" -> "training" (also "testing" / "predict")
    dir_name = filename.split("/")[-1].split(".")[0].split("-")[1]
    save_path = './datasets/knn/{}'.format(dir_name)
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    # NOTE(review): the published snippet referenced an undefined `data`;
    # presumably the raw file was loaded one line per row — confirm against
    # the original repository.
    data = pd.read_csv(filename, header=None, skip_blank_lines=True)
    datacopy = data.copy()
    datacopy['labels'] = data[0].apply(applyfuns)
    # Rows NOT tagged "data" are the per-sample labels, in sample order.
    label = datacopy[~datacopy['labels'].isin(['data'])]
    label.columns = ['0', '1']
    # Rows tagged "data" are the pixel lines; 32 of them make one sample.
    train = datacopy[datacopy['labels'].isin(['data'])][0]
    k = 0          # pixel lines collected for the current sample
    index = 0      # running sample index
    limit = 32     # rows per sample image
    save = []
    for y in train:
        save.append(y)
        k += 1
        if k >= limit:
            df = pd.DataFrame(save)
            # File name encodes both the sample index and its label.
            df.to_csv("./datasets/knn/{}/{}_{}.txt".
                      format(dir_name, index, label['1'].values[index]),
                      index=None,
                      header=None)
            save = []
            k = 0
            index += 1

def img2vectorV1(filename):
    """Read a 32x32 text image of '0'/'1' characters into a flat list.

    Returns a list of 1024 ints, one per pixel, row-major order.
    """
    rows = 32
    cols = 32
    imgVector = []
    # `with` ensures the file handle is closed (the original leaked it, and
    # the scraped snippet also lost the readline() that fills lineStr).
    with open(filename) as fileIn:
        for row in range(rows):
            lineStr = fileIn.readline()
            for col in range(cols):
                imgVector.append(int(lineStr[col]))
    return imgVector

def vector_subtract(v, w):
    """Element-wise difference v - w of two equal-length vectors."""
    return [v_i - w_i for v_i, w_i in zip(v, w)]

def distance(v, w):
    """Euclidean distance between vectors v and w.

    The published snippet called an undefined `sum_of_squares` helper; the
    equivalent arithmetic is inlined here so the function is self-contained.
    """
    return math.sqrt(sum((v_i - w_i) ** 2 for v_i, w_i in zip(v, w)))

def get_dict_min(lis, k):
    """Majority vote over the k nearest neighbours.

    `lis` is a list of (distance, label) pairs already sorted by distance;
    returns the label occurring most often among the first k pairs.
    (The scraped snippet lost the loop over `lis[:k]`.)
    """
    save = []
    for g in lis[:k]:
        save.append(g[1])
    return Counter(save).most_common(1)[0][0]

def knnclassifiy(k=3):
    """Train a KNN classifier on the split sample files and evaluate it.

    Loads every 32x32 sample under `dataSetDir + "training"`, classifies
    every sample under `dataSetDir + "testing"` by majority vote over the
    k nearest training vectors, and prints per-class and overall accuracy.

    NOTE(review): `dataSetDir` is referenced but never defined in this
    excerpt — presumably a module-level constant like "./datasets/knn/";
    confirm against the original repository.
    """
    # Per-class sample counts in the training set (replaces the original
    # ten k0..k9 counter variables).
    train_counts = Counter()
    hwLabels = []
    trainingFileList = os.listdir(dataSetDir + "training")  # load training data
    m = len(trainingFileList)
    trainingMat = np.zeros((m, 1024))
    for i in range(m):
        fileNameStr = trainingFileList[i]
        # File names look like "<index>_<label>.txt".
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split("_")[1])
        train_counts[classNumStr] += 1
        hwLabels.append(classNumStr)
        trainingMat[i, :] = img2vectorV1(dataSetDir + 'training/%s' % fileNameStr)

    testFileList = os.listdir(dataSetDir + 'testing')
    total_counts = Counter()    # test samples per class
    correct_counts = Counter()  # correctly classified per class
    C = 0.0                     # total correct classifications
    mTest = len(testFileList)
    for i in range(mTest):
        fileNameStr = testFileList[i]
        fileStr = fileNameStr.split('.')[0]
        TestclassNumStr = int(fileStr.split("_")[1])
        total_counts[TestclassNumStr] += 1
        data_file_name = dataSetDir + 'testing/%s' % fileNameStr
        vectorUnderTest = img2vectorV1(data_file_name)
        distaces_list = {}
        for j in range(m):
            distaces = distance(vectorUnderTest, trainingMat[j])  # distance to each training sample
            distaces_list[distaces] = hwLabels[j]
        # Sort by distance, then take a majority vote over the k nearest.
        sorted_distance_list = sorted(distaces_list.items(),
                                      key=lambda e: e[0],
                                      reverse=False)
        # NOTE(review): the scraped snippet lost the comparison of the
        # prediction against the true label — reconstructed here.
        predicted = get_dict_min(sorted_distance_list, k)
        if predicted == TestclassNumStr:
            C += 1
            correct_counts[TestclassNumStr] += 1

    print("- " * 20)
    print(' Training info ')
    for digit in range(10):
        print(" {} = {} ".format(digit, train_counts[digit]))
    print("- " * 20)
    print(" Total Sample = {} ".format(m))
    print()
    print("- " * 20)
    print(' Testing info ')
    print("- " * 20)
    for digit in range(10):
        total = total_counts[digit]
        wrong = abs(total - correct_counts[digit])
        # Guard against a class with no test samples (original would divide
        # by zero). The "%" suffix on a fraction reproduces the original
        # (slightly misleading) output format.
        acc = 1 - wrong / total if total else 0.0
        print(" {} = {}, {}, {:0.2f}% ".format(digit, total, wrong, acc))
    print("- " * 20)
    print(" Accuracy = {:0.2f}%".format(C / float(mTest)))
    print("Correct/Total = {}/{}".format(int(C), mTest))
    print(" End of Training @ {} ".
          format(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))

def build_knnclassifier():
    """Run the evaluation for several k values to pick the best one."""
    ks = [3, 5, 7, 9]
    for k in ks:
        print(" Beginning of Training @ {} ".
              format(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
        knnclassifiy(k)
        print()

def buildPredict(k=7):
    """Classify the unlabeled samples under `dataSetDir + 'predict'`.

    NOTE(review): `dataSetDir` is referenced but never defined in this
    excerpt — presumably "./datasets/knn/"; confirm.
    """
    hwLabels = []
    trainingFileList = os.listdir(dataSetDir + "training")  # load training data
    m = len(trainingFileList)
    trainingMat = np.zeros((m, 1024))
    for i in range(m):
        fileNameStr = trainingFileList[i]
        # File names look like "<index>_<label>.txt".
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split("_")[1])
        hwLabels.append(classNumStr)
        trainingMat[i, :] = img2vectorV1(dataSetDir + 'training/%s' % fileNameStr)
    predictFileList = os.listdir(dataSetDir + 'predict')  # load the prediction set
    mTest = len(predictFileList)
    for i in range(mTest):
        fileNameStr = predictFileList[i]
        data_file_name = dataSetDir + 'predict/%s' % fileNameStr
        vectorUnderTest = img2vectorV1(data_file_name)
        distaces_list = {}
        for j in range(m):
            distaces = distance(vectorUnderTest, trainingMat[j])
            distaces_list[distaces] = hwLabels[j]
        sorted_distance_list = sorted(distaces_list.items(),
                                      key=lambda e: e[0],
                                      reverse=False)
        # NOTE(review): the scraped snippet truncated here without using the
        # sorted distances — presumably the prediction was reported, as done
        # below; confirm against the original repository.
        print("{} -> {}".format(fileNameStr, get_dict_min(sorted_distance_list, k)))

if __name__ == '__main__':
    # Select which stage to run: split the raw files, evaluate k values,
    # or predict on unlabeled samples.
    method = 'build_knnclassifier'
    if method == 'split_datasets':
        dataname = ['./datasets/knn/digit-training.txt',
                    './datasets/knn/digit-testing.txt',
                    './datasets/knn/digit-predict.txt']
        for n in dataname:
            split_datasets(n)
    if method == 'build_knnclassifier':
        build_knnclassifier()
    if method == 'buildPredict':
        buildPredict(k=7)

TRAINING

Beginning of Training @ 2018-05-06 23:08:16

- - - - - - - - - - - - - - - - - - - -

Training info

0 = 100

1 = 94

2 = 93

3 = 105

4 = 87

5 = 81

6 = 95

7 = 90

8 = 109

9 = 89

- - - - - - - - - - - - - - - - - - - -

Total Sample = 943

- - - - - - - - - - - - - - - - - - - -

TESTING

- - - - - - - - - - - - - - - - - - - -

Testing info

- - - - - - - - - - - - - - - - - - - -

0 = 20, 1, 0.95%

1 = 20, 2, 0.90%

2 = 25, 0, 1.00%

3 = 18, 1, 0.94%

4 = 25, 2, 0.92%

5 = 16, 0, 1.00%

6 = 16, 1, 0.94%

7 = 19, 0, 1.00%

8 = 17, 1, 0.94%

9 = 20, 2, 0.90%

- - - - - - - - - - - - - - - - - - - -

Accuracy = 0.95%

Correct/Total = 187.0/196

End of Training @ 2018-05-06 23:09:48

TRAINING

Beginning of Training @ 2018-05-06 23:09:48

- - - - - - - - - - - - - - - - - - - -

Training info

0 = 100

1 = 94

2 = 93

3 = 105

4 = 87

5 = 81

6 = 95

7 = 90

8 = 109

9 = 89

- - - - - - - - - - - - - - - - - - - -

Total Sample = 943

- - - - - - - - - - - - - - - - - - - -

TESTING

- - - - - - - - - - - - - - - - - - - -

Testing info

- - - - - - - - - - - - - - - - - - - -

0 = 20, 1, 0.95%

1 = 20, 4, 0.80%

2 = 25, 0, 1.00%

3 = 18, 1, 0.94%

4 = 25, 5, 0.80%

5 = 16, 0, 1.00%

6 = 16, 1, 0.94%

7 = 19, 0, 1.00%

8 = 17, 3, 0.82%

9 = 20, 5, 0.75%

- - - - - - - - - - - - - - - - - - - -

Accuracy = 0.94%

Correct/Total = 185.0/196

End of Training @ 2018-05-06 23:11:20

TRAINING

Beginning of Training @ 2018-05-06 23:11:20

- - - - - - - - - - - - - - - - - - - -

Training info

0 = 100

1 = 94

2 = 93

3 = 105

4 = 87

5 = 81

6 = 95

7 = 90

8 = 109

9 = 89

- - - - - - - - - - - - - - - - - - - -

Total Sample = 943

- - - - - - - - - - - - - - - - - - - -

TESTING

- - - - - - - - - - - - - - - - - - - -

Testing info

- - - - - - - - - - - - - - - - - - - -

0 = 20, 1, 0.95%

1 = 20, 4, 0.80%

2 = 25, 0, 1.00%

3 = 18, 0, 1.00%

4 = 25, 4, 0.84%

5 = 16, 0, 1.00%

6 = 16, 1, 0.94%

7 = 19, 0, 1.00%

8 = 17, 3, 0.82%

9 = 20, 3, 0.85%

- - - - - - - - - - - - - - - - - - - -

Accuracy = 0.95%

Correct/Total = 187.0/196

End of Training @ 2018-05-06 23:12:45

TRAINING

Beginning of Training @ 2018-05-06 23:12:45

- - - - - - - - - - - - - - - - - - - -

Training info

0 = 100

1 = 94

2 = 93

3 = 105

4 = 87

5 = 81

6 = 95

7 = 90

8 = 109

9 = 89

- - - - - - - - - - - - - - - - - - - -

Total Sample = 943

TESTING

- - - - - - - - - - - - - - - - - - - -

Testing info

- - - - - - - - - - - - - - - - - - - -

0 = 20, 1, 0.95%

1 = 20, 4, 0.80%

2 = 25, 0, 1.00%

3 = 18, 0, 1.00%

4 = 25, 4, 0.84%

5 = 16, 0, 1.00%

6 = 16, 1, 0.94%

7 = 19, 0, 1.00%

8 = 17, 3, 0.82%

9 = 20, 3, 0.85%

- - - - - - - - - - - - - - - - - - - -

Accuracy = 0.94%

Correct/Total = 185.0/196

End of Training @ 2018-05-06 23:14:10

PREDICTION

• 发表于:
• 原文链接http://kuaibao.qq.com/s/20180513G0RVRT00?refer=cp_1026
• 腾讯「腾讯云开发者社区」是腾讯内容开放平台帐号（企鹅号）传播渠道之一，根据《腾讯内容开放平台服务协议》转载发布内容。
• 如有侵权，请联系 cloudcommunity@tencent.com 删除。

2023-06-08

2023-06-08

2023-06-08

2023-06-08

2023-06-08

2023-06-08

2023-06-08

2023-06-08

2023-06-08

10元无门槛代金券