朴素贝叶斯练习实例

文本分类:过滤恶意留言

此处有两个改进的地方: (1)若有的类别没有出现,其概率就是0,会十分影响分类器的性能。所以采取各类别默认1次累加,总类别(两类)次数2,这样不影响相对大小。 (2)若很小是数字相乘,则结果会更小,再四舍五入存在误差,而且会造成下溢出。采取取log,乘法变为加法,并且相对大小趋势不变。

# -*- coding: utf-8 -*-
"""
Created on Sun Dec 10 13:51:56 2017

文本分类:应用过滤恶意留言

@author: jasonhaven
"""
import numpy as np

#1 载入数据集:6条文本及它们各自的类别,这6条文本作为训练集。
def loadDataSet():
    postingList=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                 ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                 ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                 ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                 ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0,1,0,1,0,1]    #1 is abusive, 0 not
    return postingList,classVec


#2 创建词汇表:利用集合结构内元素的唯一性,创建一个包含所有词汇的词表。
def createVocabSet(dataSet):
    vocabSet = set([])  #create empty set
    for document in dataSet:
        vocabSet = vocabSet | set(document) #union of the two sets
    return list(vocabSet)

# 3 把输入文本根据词表转化为计算机可处理的01向量形式:
def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else: print("the word: %s is not in my Vocabulary!" % word)
    return returnVec

#4训练模型:在训练样本中计算先验概率 p(Ci) 和 条件概率 p(x,y | Ci),本实例有0和1两个类别,所以返回p(x,y | 0),p(x,y | 1)和p(Ci)。
def trainNB0(trainMatrix,trainCategory):
    numTrainDocs=trainMatrix.shape[0]
    numWords=len(trainMatrix[0])
    pAbusive=sum(trainCategory)/float(numTrainDocs)
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    p0Denom =2.0
    p1Denom =2.0
    for i in range(numTrainDocs):
        if trainCategory[i]==1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = np.log(p1Num/p1Denom)          #change to log()
    p0Vect = np.log(p0Num/p0Denom)          #change to log()
    return p0Vect,p1Vect,pAbusive

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = sum(vec2Classify * p1Vec) + np.log(pClass1)    #element-wise mult
    p0 = sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else: 
        return 0    


def testingNB():
    '''
    加载数据集+提炼词表;
    训练模型:根据六条训练集计算先验概率和条件概率;
    测试模型:对训练两条测试文本进行分类。
    '''
    listOPosts,listClasses = loadDataSet()
    #print(listOPosts)
    #print(listClasses)
    myVocabList = createVocabSet(listOPosts)
    #print(myVocabList)
    trainMat=[]
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    #print(trainMat)
    p0V,p1V,pAb = trainNB0(np.array(trainMat),np.array(listClasses))
    
    #test
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry,'classified as: ',classifyNB(thisDoc,p0V,p1V,pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry,'classified as: ',classifyNB(thisDoc,p0V,p1V,pAb))
    

if __name__=='__main__':
    testingNB()

过滤垃圾邮件

ham/1.txt

Hi Peter,

With Jose out of town, do you want to
meet once in a while to keep things
going and do some interesting stuff?

Let me know
Eugene

spam/1.txt

--- Codeine 15mg -- 30 for $203.70 -- VISA Only!!! --

-- Codeine (Methylmorphine) is a narcotic (opioid) pain reliever
-- We have 15mg & 30mg pills -- 30/15mg for $203.70 - 60/15mg for $385.80 - 90/15mg for $562.50 -- VISA Only!!! ---
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 10 13:51:56 2017

垃圾邮件过滤

@author: jasonhaven
"""
import numpy as np

#1 载入数据集:6条文本及它们各自的类别,这6条文本作为训练集。
def loadDataSet():
    postingList=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                 ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                 ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                 ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                 ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0,1,0,1,0,1]    #1 is abusive, 0 not
    return postingList,classVec


#2 创建词汇表:利用集合结构内元素的唯一性,创建一个包含所有词汇的词表。
def createVocabSet(dataSet):
    vocabSet = set([])  #create empty set
    for document in dataSet:
        vocabSet = vocabSet | set(document) #union of the two sets
    return list(vocabSet)

# 3 把输入文本根据词表转化为计算机可处理的01向量形式:
def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else: print("the word: %s is not in my Vocabulary!" % word)
    return returnVec

#4训练模型:在训练样本中计算先验概率 p(Ci) 和 条件概率 p(x,y | Ci),本实例有0和1两个类别,所以返回p(x,y | 0),p(x,y | 1)和p(Ci)。
def trainNB0(trainMatrix,trainCategory):
    numTrainDocs=trainMatrix.shape[0]
    numWords=len(trainMatrix[0])
    pAbusive=sum(trainCategory)/float(numTrainDocs)
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    p0Denom =2.0
    p1Denom =2.0
    for i in range(numTrainDocs):
        if trainCategory[i]==1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = np.log(p1Num/p1Denom)          #change to log()
    p0Vect = np.log(p0Num/p0Denom)          #change to log()
    return p0Vect,p1Vect,pAbusive

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = sum(vec2Classify * p1Vec) + np.log(pClass1)    #element-wise mult
    p0 = sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else: 
        return 0    

    
#对邮件的文本划分成词汇,长度小于2的默认为不是词汇,过滤掉即可。返回一串小写的拆分后的邮件信息。
def textParse(bigString):    #input is big string, #output is word list
    import re
    listOfTokens = re.split(r'\W*', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2] 

#文档词袋模型:使用数组代替集合数据结构,可以保存词汇频率信息。
def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec

def spamTest():
    docList=[]; classList = []; fullText =[]
    for i in range(1,26):
        wordList = textParse(open('./spam/1.txt','r').read())
        # print wordList
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('./ham/1.txt','r').read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabSet(docList)#create vocabulary
    trainingSet = list(range(50))
    testSet=[]           #create test set
    for i in range(10):
        randIndex = int(np.random.uniform(0,len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])  
    trainMat=[]; trainClasses = []
    for docIndex in trainingSet:#train the classifier (get probs) trainNB0
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V,p1V,pSpam = trainNB0(np.array(trainMat),np.array(trainClasses))
    errorCount = 0
    for docIndex in testSet:        #classify the remaining items
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:
            errorCount += 1
            print("classification error",docList[docIndex])
    print('the error rate is: ',float(errorCount)/len(testSet))
    #return vocabList,fullText

if __name__=='__main__':
    #testingNB()
    spamTest()

本文参与腾讯云自媒体分享计划,欢迎正在阅读的你也加入,一起分享。

发表于

我来说两句

0 条评论
登录 后参与评论

相关文章

来自专栏自然语言处理

实现 | 朴素贝叶斯模型算法研究与实例分析

构建一个快速过滤器来屏蔽在线社区留言板上的侮辱性言论。如果某条留言使用了负面或者侮辱性的语言,那么就将该留言标识为内容不当。对此问题建立两个类别: 侮辱类和非侮...

1874
来自专栏有趣的Python和你

机器学习实战之朴素贝叶斯

在学习朴素贝叶斯分类模型之前,我们回顾一下之前学习的KNN和决策树,读者本人的总结:不同的机器学习方法有着不同的假设和理论进行支撑,而这些假设和理论在很大程度上...

1565
来自专栏强仔仔

利用JavaScript中的正则表达式实现常用输入框的验证

本章主要讲:通过JavaScript中正则表达式的 应用实现(http、电话号码、邮箱、数字、字母及其数字、时间日期、身份证)等的验证。 下面看例子demo的实...

2216
来自专栏SnailTyan

Improving Deep Neural Networks学习笔记(三)

5. Hyperparameter tuning 5.1 Tuning process Hyperparameters: α\alpha, β\beta, β1...

2131
来自专栏AILearning

【机器学习实战】第4章 基于概率论的分类方法:朴素贝叶斯

第4章 基于概率论的分类方法:朴素贝叶斯 <script type="text/javascript" src="http://cdn.mathjax.org...

3209
来自专栏Petrichor的专栏

深度学习: Jacobian矩阵 & Hessian矩阵

[1] Functions - Gradient, Jacobian and Hessian [2] Deep Learning Book

3453
来自专栏有趣的Python和你

机器学习实战之朴素贝叶斯

1192
来自专栏deepcc

jQuery身份证验证插件

3596
来自专栏SnailTyan

Improving Deep Neural Networks学习笔记(二)

4. Optimization algorithms 4.1 Mini-batch gradient descent xtx^{\\{t\\}},yty^{\\...

2146
来自专栏量化投资与机器学习

用Numpy实现优化算法比较

1192

扫码关注云+社区