机器学习实战 p21
源代码:
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and a label list.

    Each line is expected to hold at least three numeric fields followed by
    a class label as the last field.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: a NumPy array with one row
        per line and three columns holding the first three fields, and a list
        with the integer label of each line.

    Raises:
        ValueError: If the last field of a line is not an integer literal,
            for example a text label such as ``largeDoses``.
    """
    # Read the file once inside a context manager so the handle is closed;
    # the line count and the parsing loop share the same list.
    with open(filename) as fr:
        lines = fr.readlines()

    numberOfLines = len(lines)
    returnMat = zeros((numberOfLines, 3))  # prepare matrix to return
    classLabelVector = []                  # prepare labels to return

    for index, line in enumerate(lines):
        listFromLine = line.strip().split('\t')
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        # This is the statement reported as failing: int() cannot parse a
        # text label, so a data file with labels like 'largeDoses' raises
        # ValueError here.
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
报错如下:
>>> mat,label = kNN.file2matrix('datingTestSet.txt')
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "kNN.py", line 50, in file2matrix
    classLabelVector.append(int(listFromLine[-1]))
ValueError: could not convert string to int: largeDoses
解决方法:
如果不先对每一行调用 strip(),listFromLine[-1] 的值形如下面的格式,末尾带有回车换行符:
largeDoses\r\n
smallDoses\r\n
didntLike\r\n
didntLike\r\n
didntLike\r\n
这些标签是字母字符串,无法用 int() 直接转换为整数,因此会抛出上面的 ValueError。
作者定义largeDoses 为3,smallDoses 为2,didntLike为1
于是笔者增加了一个字典类型
d = {'didntLike': 1, 'smallDoses': 2, 'largeDoses': 3}
通过d[listFromLine[-1]]得到对应的label
更改后的代码如下:
rf.py
from numpy import *
import operator
from os import listdir


def rf(filename):
    """Parse a tab-separated data file whose last field is a text label.

    Each non-blank line is expected to hold at least three numeric fields
    followed by one of the labels ``didntLike``, ``smallDoses`` or
    ``largeDoses`` as the last field.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: a NumPy array with one row
        per non-blank line and three columns holding the first three fields,
        and a list with the integer code (1, 2 or 3) of each line's label.

    Raises:
        KeyError: If a line's last field is not one of the known labels.
        ValueError: If one of the first three fields is not a number.
    """
    d = {'didntLike': 1, 'smallDoses': 2, 'largeDoses': 3}

    # Read once inside a context manager so the handle is closed. strip()
    # removes the line terminator whether it is "\r\n", "\n" or missing,
    # instead of blindly cutting the last two characters.
    with open(filename) as fr:
        rows = [line.strip() for line in fr]
    # Blank lines carry no record; skip them so they neither crash the
    # parser nor leave all-zero rows in the matrix.
    rows = [row for row in rows if row]

    returnMat = zeros((len(rows), 3))  # prepare matrix to return
    classLabelVector = []
    for index, row in enumerate(rows):
        listFromLine = row.split('\t')
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        # Map the text label to its integer code.
        classLabelVector.append(d[listFromLine[-1]])
    return returnMat, classLabelVector
画图:
# Plot the parsed dating data as a 2x2 grid of scatter plots.
import matplotlib
import matplotlib.pyplot as plt
from numpy import array  # array must be imported explicitly, otherwise a NameError is raised

import rf

mat, label = rf.rf('datingTestSet.txt')

# Passed to scatter() as both the marker size and the marker colour, so
# each class gets its own size and colour.
markers = 15.0 * array(label)

fig = plt.figure()

# Top row: plain scatter plots.
ax1 = fig.add_subplot(2, 2, 1)
ax1.scatter(mat[:, 0], mat[:, 1])

ax2 = fig.add_subplot(2, 2, 2)
ax2.scatter(mat[:, 1], mat[:, 2])

# Bottom row: the same column pairs, sized and coloured by class label.
ax3 = fig.add_subplot(2, 2, 3)
ax3.scatter(mat[:, 0], mat[:, 1], markers, markers)

ax4 = fig.add_subplot(2, 2, 4)
ax4.scatter(mat[:, 1], mat[:, 2], markers, markers)

plt.show()