# 数据挖掘实践指南读书笔记3

《数据挖掘实践指南》（http://guidetodatamining.com/ ）这本书理论比较简单，书中错误较少，动手练习较多；如果每段代码都自己动手写出来，会有不少收获。总结：适合入门。 欢迎转载，转载请注明出处；如有问题，欢迎指正。 合集地址：https://www.zybuluo.com/hainingwyx/note/559139

### 2. 基于物品属性过滤

```python
# 计算中位数和绝对标准差
def getMedian(self, alist):
    """Return the median of ``alist``.

    Args:
        alist: Iterable of numbers. It is not modified; a sorted copy is
            used internally.

    Returns:
        The middle element of the sorted values when their count is odd,
        the mean of the two middle elements (as a float) when it is even,
        or an empty list when ``alist`` contains no values.
    """
    ordered = sorted(alist)
    if not ordered:
        # Kept for backward compatibility: empty input yields [] instead
        # of raising. Checking the sorted copy also covers empty tuples
        # and other empty iterables, not just the literal [].
        return []
    # Integer arithmetic avoids the float round-trip of int(length / 2).
    mid, is_odd = divmod(len(ordered), 2)
    if is_odd:
        return ordered[mid]
    # Even count: average the two values that straddle the midpoint.
    return (ordered[mid - 1] + ordered[mid]) / 2.0

def getAbsoluteStandardDeviation(self, alist, median):
    """Return the absolute standard deviation of ``alist`` around ``median``.

    The result is the mean of the absolute differences between each value
    and ``median``.

    Args:
        alist: Sized collection of numbers.
        median: The centre value the deviations are measured from.

    Returns:
        The sum of ``abs(value - median)`` divided by ``len(alist)``.

    Raises:
        ZeroDivisionError: If ``alist`` is empty.
    """
    # Use the built-in sum() rather than shadowing it with a local name.
    return sum(abs(item - median) for item in alist) / len(alist)

def unitTest():
    """Check getMedian and getAbsoluteStandardDeviation on known inputs.

    Builds a Classifier from 'data/athletesTrainingSet.txt', computes the
    median and absolute standard deviation of four small lists, compares
    them (rounded to 3 places) with the expected values, and prints a
    confirmation message when every check passes.

    Raises:
        AssertionError: If any computed value differs from the expected one.
    """
    classifier = Classifier('data/athletesTrainingSet.txt')
    # Each case: (values, expected median, expected absolute std deviation).
    cases = [
        ([54, 72, 78, 49, 65, 63, 75, 67, 54], 65, 8),
        ([54, 72, 78, 49, 65, 63, 75, 67, 54, 68], 66, 7.5),
        ([69], 69, 0),
        ([69, 72], 70.5, 1.5),
    ]
    for values, expected_median, expected_asd in cases:
        median = classifier.getMedian(values)
        asd = classifier.getAbsoluteStandardDeviation(values, median)
        # The messages identify the failing case instead of a bare assert.
        assert round(median, 3) == expected_median, (
            "median of %r: expected %r, got %r"
            % (values, expected_median, median))
        assert round(asd, 3) == expected_asd, (
            "asd of %r: expected %r, got %r"
            % (values, expected_asd, asd))

    print("getMedian and getAbsoluteStandardDeviation work correctly")

```

使用 assert 语句对软件组件进行测试是一种常用的技术。把产品的每一部分都分成实现代码和针对该实现代码的测试代码，这一点十分重要。

```python
# 归一化
def normalizeColumn(self, columnNumber):
    """Normalize one column of ``self.data`` in place.

    Every entry of ``self.data`` is indexed as ``entry[1][columnNumber]``.
    Each such value is replaced by ``(value - median) / asd``, where
    ``median`` and ``asd`` come from ``self.getMedian`` and
    ``self.getAbsoluteStandardDeviation`` applied to the whole column.
    The ``(median, asd)`` pair is appended to ``self.medianAndDeviation``.

    Args:
        columnNumber: Index of the column inside each entry's value list.
    """
    # Pull the column out of every entry so its statistics can be computed.
    column = [entry[1][columnNumber] for entry in self.data]
    median = self.getMedian(column)
    asd = self.getAbsoluteStandardDeviation(column, median)
    # Record the true statistics, even when asd is 0.
    self.medianAndDeviation.append((median, asd))
    # asd == 0 means every value equals the median; dividing by 1 maps the
    # whole column to 0.0 instead of raising ZeroDivisionError.
    # NOTE(review): any other code that divides by the stored asd must
    # handle 0 as well -- confirm against the rest of the class.
    scale = asd if asd else 1.0
    for entry in self.data:
        entry[1][columnNumber] = (entry[1][columnNumber] - median) / scale

