# 二、 数据集

```1．怀孕次数。
2．2小时口服葡萄糖耐量测试中得到的血糖浓度。
3．舒张期血压（mm Hg）。
4．三头肌皮脂厚度（mm）。
5．2小时血清胰岛素（mu U/ml）。
6．身体质量指数（体重kg/（身高in m）^2）。
7．糖尿病家族遗传作用值。
8．年龄。```

# 三、 算法实现

## （一） 处理数据

```import csv
def loadCsv(filename):
    """Load a CSV file of numeric values into a list of float rows.

    Args:
        filename: Path to a CSV file in which every field parses as a float.

    Returns:
        A list of rows; each row is a list of floats.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be converted to float.
    """
    # newline='' is the open mode the csv module documents for reader input.
    with open(filename, newline='') as csvFile:
        # Why convert every value to float: csv.reader yields strings, and the
        # mean / standard-deviation arithmetic done later needs numbers.
        # Blank lines are returned by csv.reader as [] and are skipped.
        return [[float(value) for value in row] for row in csv.reader(csvFile) if row]

#测试
filename = 'pima-indians-diabetes.data.csv'
print('Loaded data file {0} with {1} rows'.format(filename, len(dataset)))```

```import random
def splitDataset(dataset, splitRatio):
    """Randomly split dataset into a training part and a test part.

    Args:
        dataset: Sequence of rows; it is copied, not modified.
        splitRatio: Fraction of rows to put in the training set.

    Returns:
        A two-element list ``[trainSet, testSet]``.
    """
    remaining = list(dataset)
    # Clamp so a ratio above 1 cannot ask for more rows than exist, which
    # would end in random.randrange(0) raising ValueError.
    trainSize = min(int(len(remaining) * splitRatio), len(remaining))
    trainSet = []
    while len(trainSet) < trainSize:
        # Move one randomly chosen row from the remaining pool to the train set.
        trainSet.append(remaining.pop(random.randrange(len(remaining))))
    return [trainSet, remaining]

#测试
dataset = [[1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13], [14], [15]]
splitRatio = 0.67
train, test = splitDataset(dataset, splitRatio)
print('Split {0} rows into train with {1} and test with {2}'.format(len(dataset), train, test))```

## （二） 提取数据特征

### 1 按类别划分数据

```def separateByClass(dataset):
separated = {}
for i in range(len(dataset)):
vector = dataset[i] #假设最后一个值为类别值
if (vector[-1] not in separated):
separated[vector[-1]] = []
separated[vector[-1]].append(vector)
return separated

#测试
dataset = [[1,20,1], [2,21,0], [3,22,1]]
separated = separateByClass(dataset)
print('Separated instances: {0}'.format(separated))```

`Separated instances: {1: [[1, 20, 1], [3, 22, 1]], 0: [[2, 21, 0]]}`

### 2 计算均值和标准差

```import math
def mean(numbers):
    """Return the arithmetic mean of numbers as a float.

    Args:
        numbers: Non-empty sized collection of numeric values.

    Raises:
        ZeroDivisionError: If numbers is empty.
    """
    return sum(numbers) / float(len(numbers))

def stdev(numbers):
    """Return the sample standard deviation of numbers (N - 1 denominator).

    Args:
        numbers: Sized collection of at least two numeric values.

    Raises:
        ZeroDivisionError: If numbers holds fewer than two values.
    """
    avg = mean(numbers)
    # Sum of squared deviations divided by N - 1 (sample variance).
    variance = sum((x - avg) ** 2 for x in numbers) / float(len(numbers) - 1)
    return math.sqrt(variance)

#测试
numbers = [1,2,3,4,5]
print('Summary of {0}: mean={1}, stdev={2}'.format(numbers, mean(numbers), stdev(numbers)))```

`Summary of [1, 2, 3, 4, 5]: mean=3.0, stdev=1.5811388300841898`

### 3 提取数据集的特征

```def summarize(dataset):
summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
del summaries[-1]
return summaries

dataset = [[1,20,0], [2,21,1], [3,22,0]]
summary = summarize(dataset)
print('Attribute summaries: {0}'.format(summary))```

`Attribute summaries: [(2.0, 1.0), (21.0, 1.0)]`

### 4 按类别提取属性特征

```def summarizeByClass(dataset):
separated = separateByClass(dataset)
summaries = {}
for classValue, instances in separated.items():
summaries[classValue] = summarize(instances)
return summaries

dataset = [[1,20,1], [2,21,0], [3,22,1], [4,22,0]]
summary = summarizeByClass(dataset)
print('Summary by class value: {0}'.format(summary))```

`Summary by class value: {1: [(2.0, 1.4142135623730951), (21.0, 1.4142135623730951)], 0: [(3.0, 1.4142135623730951), (21.5, 0.7071067811865476)]}`

## （三） 预测

### 1 计算高斯分布（正态分布）的概率密度函数

```import math
def calculateProbability(x, mean, stdev):
    """Return the Gaussian probability density of x for the given mean and stdev.

    Args:
        x: Attribute value to evaluate.
        mean: Mean of the attribute.
        stdev: Standard deviation of the attribute.

    Returns:
        The density as a float. When stdev is 0 the attribute is treated as
        a point mass: 1.0 if x equals mean, otherwise 0.0.
    """
    if stdev == 0:
        # A constant attribute has zero spread; the density formula would
        # divide by zero, so fall back to an exact-match test.
        return 1.0 if x == mean else 0.0
    exponent = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent

#测试
x = 71.5
mean = 73
stdev = 6.2
probability = calculateProbability(x, mean, stdev)
print('Probability of belonging to this class: {0}'.format(probability))```

`Probability of belonging to this class: 0.0624896575937`

### 2 计算所属类的概率

```def calculateClassProbabilities(summaries, inputVector):
probabilities = {}
for classValue, classSummaries in summaries.items():
probabilities[classValue] = 1
for i in range(len(classSummaries)):
mean, stdev = classSummaries[i]
x = inputVector[i]
probabilities[classValue] *= calculateProbability(x, mean, stdev)
return probabilities

#测试
summaries = {0:[(1, 0.5)], 1:[(20, 5.0)]}
inputVector = [1.1, '?']
probabilities = calculateClassProbabilities(summaries, inputVector)
print('Probabilities for each class: {0}'.format(probabilities))```

`Probabilities for each class: {0: 0.7820853879509118, 1: 6.298736258150442e-05}`

### 3 单一预测

```def predict(summaries, inputVector):
probabilities = calculateClassProbabilities(summaries, inputVector)
bestLabel, bestProb = None, -1
for classValue, probability in probabilities.items():
if bestLabel is None or probability > bestProb:
bestProb = probability
bestLabel = classValue
return bestLabel

#测试
summaries = {'A':[(1, 0.5)], 'B':[(20, 5.0)]}
inputVector = [1.1, '?']
result = predict(summaries, inputVector)
print('Prediction: {0}'.format(result))```

`Prediction: A`

### 4 多重预测

```def getPredictions(summaries, testSet):
predictions = []
for i in range(len(testSet)):
result = predict(summaries, testSet[i])
predictions.append(result)
return predictions

#测试
summaries = {'A':[(1, 0.5)], 'B':[(20, 5.0)]}
testSet = [[1.1, '?'], [19.1, '?']]
predictions = getPredictions(summaries, testSet)
print('Predictions: {0}'.format(predictions))```

`Predictions: ['A', 'B']`

## （四） 评估精度

```def getAccuracy(testSet, predictions):
correct = 0
for x in range(len(testSet)):
if testSet[x][-1] == predictions[x]:
correct += 1
return (correct/float(len(testSet))) * 100.0

#测试
testSet = [[1,1,1,'a'], [2,2,2,'a'], [3,3,3,'b']]
predictions = ['a', 'a', 'a']
accuracy = getAccuracy(testSet, predictions)
print('Accuracy: {0}'.format(accuracy))```

`Accuracy: 66.66666666666666`

## （五）合并代码

```import csv
import random
import math

def loadCsv(filename):
    """Load a CSV file of numeric values into a list of float rows.

    Args:
        filename: Path to a CSV file in which every field parses as a float.

    Returns:
        A list of rows; each row is a list of floats.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be converted to float.
    """
    # newline='' is the open mode the csv module documents for reader input.
    with open(filename, newline='') as csvFile:
        # csv.reader yields strings; convert to float for the later arithmetic.
        # Blank lines are returned by csv.reader as [] and are skipped.
        return [[float(value) for value in row] for row in csv.reader(csvFile) if row]

def splitDataset(dataset, splitRatio):
    """Randomly split dataset into a training part and a test part.

    Args:
        dataset: Sequence of rows; it is copied, not modified.
        splitRatio: Fraction of rows to put in the training set.

    Returns:
        A two-element list ``[trainSet, testSet]``.
    """
    remaining = list(dataset)
    # Clamp so a ratio above 1 cannot ask for more rows than exist, which
    # would end in random.randrange(0) raising ValueError.
    trainSize = min(int(len(remaining) * splitRatio), len(remaining))
    trainSet = []
    while len(trainSet) < trainSize:
        # Move one randomly chosen row from the remaining pool to the train set.
        trainSet.append(remaining.pop(random.randrange(len(remaining))))
    return [trainSet, remaining]

def separateByClass(dataset):
    """Group rows by their class value.

    The last element of each row is used as the class value.

    Args:
        dataset: Iterable of non-empty rows.

    Returns:
        A dict mapping each class value to the list of rows carrying it,
        in the order the rows were encountered.
    """
    separated = {}
    for vector in dataset:
        # setdefault creates the bucket the first time a class value is seen.
        separated.setdefault(vector[-1], []).append(vector)
    return separated

def mean(numbers):
    """Return the arithmetic mean of numbers as a float.

    Args:
        numbers: Non-empty sized collection of numeric values.

    Raises:
        ZeroDivisionError: If numbers is empty.
    """
    return sum(numbers) / float(len(numbers))

def stdev(numbers):
    """Return the sample standard deviation of numbers (N - 1 denominator).

    Args:
        numbers: Sized collection of at least two numeric values.

    Raises:
        ZeroDivisionError: If numbers holds fewer than two values.
    """
    avg = mean(numbers)
    # Sum of squared deviations divided by N - 1 (sample variance).
    variance = sum((x - avg) ** 2 for x in numbers) / float(len(numbers) - 1)
    return math.sqrt(variance)

def summarize(dataset):
    """Return (mean, stdev) for every attribute column of dataset.

    The last column of each row is the class value and is left out.

    Args:
        dataset: Sequence of equally long rows.

    Returns:
        A list with one ``(mean, stdev)`` tuple per attribute column; empty
        when dataset is empty.
    """
    # zip(*dataset) transposes rows into columns. The class column is dropped
    # before any statistics are computed, so non-numeric class labels never
    # reach mean() / stdev().
    attributeColumns = list(zip(*dataset))[:-1]
    return [(mean(column), stdev(column)) for column in attributeColumns]

def summarizeByClass(dataset):
    """Compute per-attribute (mean, stdev) summaries separately for each class.

    Args:
        dataset: Sequence of rows whose last element is the class value.

    Returns:
        A dict mapping each class value to the list returned by summarize()
        for the rows of that class.
    """
    separated = separateByClass(dataset)
    return {classValue: summarize(instances) for classValue, instances in separated.items()}

def calculateProbability(x, mean, stdev):
    """Return the Gaussian probability density of x for the given mean and stdev.

    Args:
        x: Attribute value to evaluate.
        mean: Mean of the attribute.
        stdev: Standard deviation of the attribute.

    Returns:
        The density as a float. When stdev is 0 the attribute is treated as
        a point mass: 1.0 if x equals mean, otherwise 0.0.
    """
    if stdev == 0:
        # A constant attribute has zero spread; the density formula would
        # divide by zero, so fall back to an exact-match test.
        return 1.0 if x == mean else 0.0
    exponent = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent

def calculateClassProbabilities(summaries, inputVector):
    """Score inputVector against every class.

    For each class the score is the product of calculateProbability() over
    the class's attribute summaries. The scores are not normalised.

    Args:
        summaries: Dict mapping class value to a list of (mean, stdev) tuples.
        inputVector: Sequence with at least as many leading attribute values
            as there are summaries per class; later elements are ignored.

    Returns:
        A dict mapping each class value to its score.
    """
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        likelihood = 1
        # Locals are named attrMean / attrStdev so they do not shadow the
        # module-level mean() and stdev() functions.
        for i, (attrMean, attrStdev) in enumerate(classSummaries):
            likelihood *= calculateProbability(inputVector[i], attrMean, attrStdev)
        probabilities[classValue] = likelihood
    return probabilities

def predict(summaries, inputVector):
    """Return the class value with the highest score for inputVector.

    Args:
        summaries: Dict mapping class value to a list of (mean, stdev) tuples.
        inputVector: Sequence of attribute values.

    Returns:
        The best-scoring class value; on a tie the class encountered first
        wins. None when summaries is empty.
    """
    probabilities = calculateClassProbabilities(summaries, inputVector)
    # max() keeps the first maximal key, matching a strict '>' comparison loop.
    return max(probabilities, key=probabilities.get, default=None)

def getPredictions(summaries, testSet):
    """Predict a class value for every row of testSet.

    Args:
        summaries: Dict mapping class value to a list of (mean, stdev) tuples.
        testSet: Iterable of rows to classify.

    Returns:
        A list of predicted class values, in the same order as testSet.
    """
    return [predict(summaries, row) for row in testSet]

def getAccuracy(testSet, predictions):
    """Return the percentage of rows whose class value was predicted correctly.

    Args:
        testSet: Sequence of rows whose last element is the true class value.
        predictions: Sequence of predicted class values, aligned with testSet.

    Returns:
        Accuracy as a float between 0.0 and 100.0; 0.0 for an empty testSet.

    Raises:
        IndexError: If predictions is shorter than testSet.
    """
    if not testSet:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    correct = sum(1 for i, row in enumerate(testSet) if row[-1] == predictions[i])
    return (correct / float(len(testSet))) * 100.0

def main():
    """Load the data file, train on a random split and print test accuracy."""
    filename = 'pima-indians-diabetes.data.csv'
    splitRatio = 0.67
    # load data: csv.reader yields strings, so every field is converted to
    # float; blank lines come back as [] and are skipped
    with open(filename, newline='') as csvFile:
        dataset = [[float(value) for value in row] for row in csv.reader(csvFile) if row]
    trainingSet, testSet = splitDataset(dataset, splitRatio)
    print('Split {0} rows into train={1} and test={2} rows'.format(len(dataset), len(trainingSet), len(testSet)))
    # prepare model
    summaries = summarizeByClass(trainingSet)
    # test model
    predictions = getPredictions(summaries, testSet)
    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy: {0}%'.format(accuracy))

main()```

```Split 768 rows into train=514 and test=254 rows
Accuracy: 68.11023622047244%```

# 四、 后续扩展

1 计算所属类的概率：将一个数据样本归属于每个类的概率更新为一个比率。计算上就是将一个样本数据归属于某个类的概率，比上其归属于每一个类的概率的和。举例来说，一个样本属于类A的概率是0.02，属于类B的概率是0.001，那么样本属于类A的可能性是(0.02/(0.02+0.001))*100，大约为95.24%。

2 对数概率：对于一个给定的属性值，每个类的条件概率很小。当将其相乘时结果会更小，那么存在浮点下溢的可能（数值太小，以至于在Python中会被舍入为0）。一个常用的修复方案是，改为将各概率的对数值相加。可以研究实现下这个改进。

3 标称属性：改进算法实现，使其支持标称（类别型）属性。做法十分相似：为每个属性收集的摘要信息变为每个类别下各取值出现的比率。可查阅参考文献来获取更多信息。

4 不同的密度函数（伯努利或者多项式）：我们已经尝试了高斯朴素贝叶斯，你也可以尝试下其他分布。实现一个不同的分布诸如多项分布、伯努利分布或者内核朴素贝叶斯，他们对于属性值的分布 和/或 与类值之间的关系有不同的假设。

0 条评论

• ### 小朋友学Python（7）：输入与输出

一、print与input print "a =", a = input() print "b =" b = input() print "c =", c = ...

• ### 小朋友学C++（19）：函数模板

在了解模板之前，咱们先来求一下两个 int型的和，两个float型的和，两个double型的和

• ### ​信息学竞赛ACM评分机制

算法竞赛常见的评分方式有两种：一种是中小学NOIP/CSP-JS所采用的OI机制；另一种是大学ACM-ICPC所采用的ACM机制。中小学采用OI机制的比赛比较多...

• ### 今日 Paper | 多轨迹预测研究；3DMM 人脸模型；对抗网络判别器；交叉模态信息等

论文名称：The Garden of Forking Paths: Towards Multi-Future Trajectory Predictio

• ### MongoDB 基本操作

切换数据库 use test 无需新建数据库，切换时若不存在则自动新建数据库。 查看数据库 \$ show dbs 插入数据 db.表名.方法 不指明 id 则自...

• ### 介绍一下 information_schema 库

今天给大家介绍一款 Mysql 中附属的数据库，就是 information_schema 数据库，为什么说是附属呢？是因为这个数据库是在安装 Mysql 的同...

1\ 代码覆盖报告 pip install coverage 2 　manage.py