前往小程序,Get更优阅读体验!
立即前往
首页
学习
活动
专区
工具
TVP
发布
社区首页 >专栏 >朴素贝叶斯练习实例

朴素贝叶斯练习实例

作者头像
JasonhavenDai
发布2018-04-11 15:01:01
8260
发布2018-04-11 15:01:01
举报
文章被收录于专栏:JasonhavenDaiJasonhavenDai
文本分类:过滤恶意留言

此处有两个改进的地方: (1)若某个词在某个类别中没有出现,其条件概率就是0,连乘之后整个结果也为0,会十分影响分类器的性能。所以采取各词的出现次数默认从1开始累加,分母(总次数)的初始值取2(对应两个类别),这样不影响相对大小。 (2)若很多很小的数字相乘,则结果会更小,再四舍五入存在误差,而且会造成下溢出。采取取对数(log)的方法,乘法变为加法,并且相对大小趋势不变。

代码语言:javascript
复制
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 10 13:51:56 2017

文本分类:应用过滤恶意留言

@author: jasonhaven
"""
import numpy as np

# 1. Load the data set: six short posts and their class labels, used as the training set.
def loadDataSet():
    """Return the toy training corpus together with its labels.

    Returns:
        A ``(postingList, classVec)`` tuple. ``postingList`` is a list of six
        tokenized posts (each a list of words) and ``classVec`` is the
        parallel list of labels, where 1 marks an abusive post and 0 a
        normal one.
    """
    # Each entry pairs a label with the words of one post.
    labelledPosts = [
        (0, ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']),
        (1, ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid']),
        (0, ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him']),
        (1, ['stop', 'posting', 'stupid', 'worthless', 'garbage']),
        (0, ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him']),
        (1, ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']),
    ]
    postingList = [words for _, words in labelledPosts]
    classVec = [label for label, _ in labelledPosts]
    return postingList, classVec


# 2. Build the vocabulary: a list that holds every distinct word of the data set once.
def createVocabSet(dataSet):
    """Return the distinct words found in ``dataSet`` as a list.

    Words are listed in order of first appearance. Collecting them through a
    ``set`` would give an order that can change from run to run (string
    hashing is randomized per process), which makes the vocabulary and every
    vector built from it irreproducible.

    Args:
        dataSet: iterable of documents, each an iterable of hashable words.

    Returns:
        A list containing each word exactly once; empty if ``dataSet`` is
        empty.
    """
    # A dict keeps one key per distinct word and remembers insertion order.
    seen = dict.fromkeys(word for document in dataSet for word in document)
    return list(seen)

# 3. Convert an input text into a 0/1 vector over the vocabulary (set-of-words model).
def setOfWords2Vec(vocabList, inputSet):
    """Encode ``inputSet`` as a presence vector over ``vocabList``.

    Args:
        vocabList: list of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``.
        inputSet: iterable of words to encode.

    Returns:
        A list of ``len(vocabList)`` integers, 1 where the vocabulary word
        occurs in ``inputSet`` and 0 elsewhere. A message is printed for every
        input word that is not in the vocabulary.
    """
    # Build the word -> position map once instead of scanning the list twice
    # per input word; setdefault keeps the first position of a repeated word,
    # which is what list.index() would return.
    positionOf = {}
    for position, vocabWord in enumerate(vocabList):
        positionOf.setdefault(vocabWord, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = positionOf.get(word)
        if position is None:
            print("the word: %s is not in my Vocabulary!" % word)
        else:
            returnVec[position] = 1
    return returnVec

# 4. Train the model: from the training samples compute the prior p(Ci) and the
# per-word conditional probabilities for each class. There are two classes (0 and 1),
# so the log conditionals for class 0, for class 1, and the class-1 prior are returned.
def trainNB0(trainMatrix,trainCategory):
    """Estimate the naive Bayes parameters for a two-class problem.

    Args:
        trainMatrix: 2-D numpy array with one row per document and one column
            per vocabulary word.
        trainCategory: sequence of labels, one per row; rows labelled 1 count
            towards class 1, every other row towards class 0.

    Returns:
        ``(p0Vect, p1Vect, pAbusive)``: the element-wise natural log of the
        smoothed word ratios for class 0 and class 1, and the sum of the
        labels divided by the number of documents.
    """
    docCount = trainMatrix.shape[0]
    vocabSize = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(docCount)

    # Index 0 / 1 selects the class. Word counts start at 1 and totals at 2.0
    # so that a word unseen in a class never yields a zero probability.
    wordCounts = [np.ones(vocabSize), np.ones(vocabSize)]
    totals = [2.0, 2.0]
    for docIndex in range(docCount):
        row = trainMatrix[docIndex]
        cls = 1 if trainCategory[docIndex] == 1 else 0
        wordCounts[cls] += row
        totals[cls] += sum(row)

    # Logs turn the later product of probabilities into a sum (no underflow).
    p0Vect = np.log(wordCounts[0] / totals[0])
    p1Vect = np.log(wordCounts[1] / totals[1])
    return p0Vect, p1Vect, pAbusive

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the more likely of the two classes for one document vector.

    Args:
        vec2Classify: numpy word vector of the document.
        p0Vec: log conditional word probabilities for class 0.
        p1Vec: log conditional word probabilities for class 1.
        pClass1: prior probability of class 1.

    Returns:
        1 if the class-1 log score is strictly greater than the class-0 log
        score, otherwise 0.
    """
    # Log prior plus the sum of the log likelihoods of the words present.
    score1 = np.log(pClass1) + sum(vec2Classify * p1Vec)
    score0 = np.log(1.0 - pClass1) + sum(vec2Classify * p0Vec)
    return 1 if score1 > score0 else 0


def testingNB():
    """Train on the toy posts and classify two sample entries.

    Loads the data set and builds the vocabulary, trains the model on the six
    training posts (prior and conditional probabilities), then prints the
    predicted class of each of the two test entries.
    """
    posts, labels = loadDataSet()
    vocabulary = createVocabSet(posts)
    trainMat = [setOfWords2Vec(vocabulary, post) for post in posts]
    p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(labels))

    # Classify each test entry with the trained parameters.
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = np.array(setOfWords2Vec(vocabulary, testEntry))
        print(testEntry,'classified as: ',classifyNB(thisDoc,p0V,p1V,pAb))
    

# Run the abusive-post demo only when this file is executed as a script.
if __name__=='__main__':
    testingNB()
过滤垃圾邮件

ham/1.txt

代码语言:javascript
复制
Hi Peter,

With Jose out of town, do you want to
meet once in a while to keep things
going and do some interesting stuff?

Let me know
Eugene

spam/1.txt

代码语言:javascript
复制
--- Codeine 15mg -- 30 for $203.70 -- VISA Only!!! --

-- Codeine (Methylmorphine) is a narcotic (opioid) pain reliever
-- We have 15mg & 30mg pills -- 30/15mg for $203.70 - 60/15mg for $385.80 - 90/15mg for $562.50 -- VISA Only!!! ---
代码语言:javascript
复制
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 10 13:51:56 2017

垃圾邮件过滤

@author: jasonhaven
"""
import numpy as np

# 1. Load the data set: six short posts and their class labels, used as the training set.
def loadDataSet():
    """Return the toy training corpus together with its labels.

    Returns:
        A ``(postingList, classVec)`` tuple. ``postingList`` is a list of six
        tokenized posts (each a list of words) and ``classVec`` is the
        parallel list of labels, where 1 marks an abusive post and 0 a
        normal one.
    """
    # Each entry pairs a label with the words of one post.
    labelledPosts = [
        (0, ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']),
        (1, ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid']),
        (0, ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him']),
        (1, ['stop', 'posting', 'stupid', 'worthless', 'garbage']),
        (0, ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him']),
        (1, ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']),
    ]
    postingList = [words for _, words in labelledPosts]
    classVec = [label for label, _ in labelledPosts]
    return postingList, classVec


# 2. Build the vocabulary: a list that holds every distinct word of the data set once.
def createVocabSet(dataSet):
    """Return the distinct words found in ``dataSet`` as a list.

    Words are listed in order of first appearance. Collecting them through a
    ``set`` would give an order that can change from run to run (string
    hashing is randomized per process), which makes the vocabulary and every
    vector built from it irreproducible.

    Args:
        dataSet: iterable of documents, each an iterable of hashable words.

    Returns:
        A list containing each word exactly once; empty if ``dataSet`` is
        empty.
    """
    # A dict keeps one key per distinct word and remembers insertion order.
    seen = dict.fromkeys(word for document in dataSet for word in document)
    return list(seen)

# 3. Convert an input text into a 0/1 vector over the vocabulary (set-of-words model).
def setOfWords2Vec(vocabList, inputSet):
    """Encode ``inputSet`` as a presence vector over ``vocabList``.

    Args:
        vocabList: list of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``.
        inputSet: iterable of words to encode.

    Returns:
        A list of ``len(vocabList)`` integers, 1 where the vocabulary word
        occurs in ``inputSet`` and 0 elsewhere. A message is printed for every
        input word that is not in the vocabulary.
    """
    # Build the word -> position map once instead of scanning the list twice
    # per input word; setdefault keeps the first position of a repeated word,
    # which is what list.index() would return.
    positionOf = {}
    for position, vocabWord in enumerate(vocabList):
        positionOf.setdefault(vocabWord, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = positionOf.get(word)
        if position is None:
            print("the word: %s is not in my Vocabulary!" % word)
        else:
            returnVec[position] = 1
    return returnVec

# 4. Train the model: from the training samples compute the prior p(Ci) and the
# per-word conditional probabilities for each class. There are two classes (0 and 1),
# so the log conditionals for class 0, for class 1, and the class-1 prior are returned.
def trainNB0(trainMatrix,trainCategory):
    """Estimate the naive Bayes parameters for a two-class problem.

    Args:
        trainMatrix: 2-D numpy array with one row per document and one column
            per vocabulary word.
        trainCategory: sequence of labels, one per row; rows labelled 1 count
            towards class 1, every other row towards class 0.

    Returns:
        ``(p0Vect, p1Vect, pAbusive)``: the element-wise natural log of the
        smoothed word ratios for class 0 and class 1, and the sum of the
        labels divided by the number of documents.
    """
    docCount = trainMatrix.shape[0]
    vocabSize = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(docCount)

    # Index 0 / 1 selects the class. Word counts start at 1 and totals at 2.0
    # so that a word unseen in a class never yields a zero probability.
    wordCounts = [np.ones(vocabSize), np.ones(vocabSize)]
    totals = [2.0, 2.0]
    for docIndex in range(docCount):
        row = trainMatrix[docIndex]
        cls = 1 if trainCategory[docIndex] == 1 else 0
        wordCounts[cls] += row
        totals[cls] += sum(row)

    # Logs turn the later product of probabilities into a sum (no underflow).
    p0Vect = np.log(wordCounts[0] / totals[0])
    p1Vect = np.log(wordCounts[1] / totals[1])
    return p0Vect, p1Vect, pAbusive

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the more likely of the two classes for one document vector.

    Args:
        vec2Classify: numpy word vector of the document.
        p0Vec: log conditional word probabilities for class 0.
        p1Vec: log conditional word probabilities for class 1.
        pClass1: prior probability of class 1.

    Returns:
        1 if the class-1 log score is strictly greater than the class-0 log
        score, otherwise 0.
    """
    # Log prior plus the sum of the log likelihoods of the words present.
    score1 = np.log(pClass1) + sum(vec2Classify * p1Vec)
    score0 = np.log(1.0 - pClass1) + sum(vec2Classify * p0Vec)
    return 1 if score1 > score0 else 0

    
# Split the text of an e-mail into words. Tokens of two characters or fewer are
# not treated as words and are dropped; the remaining tokens are lower-cased.
def textParse(bigString):    #input is big string, #output is word list
    """Tokenize ``bigString`` into lower-case words longer than two characters.

    The text is split on runs of non-word characters. The pattern must be
    ``\\W+`` rather than ``\\W*``: a pattern that can match the empty string
    makes ``re.split`` (Python 3.7+) cut between every character, after which
    the length filter discards everything and the result is always empty.

    Args:
        bigString: the raw text to tokenize.

    Returns:
        List of lower-cased tokens whose length is greater than 2, in order
        of appearance.
    """
    import re
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

# Bag-of-words document model: counts how often each vocabulary word occurs
# instead of only recording whether it occurs.
def bagOfWords2VecMN(vocabList, inputSet):
    """Encode ``inputSet`` as a word-count vector over ``vocabList``.

    Args:
        vocabList: list of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``.
        inputSet: iterable of words to encode (repetitions are counted).

    Returns:
        A list of ``len(vocabList)`` integers holding the number of
        occurrences of each vocabulary word. Words outside the vocabulary are
        ignored silently.
    """
    # Build the word -> position map once instead of scanning the list twice
    # per input word; setdefault keeps the first position of a repeated word,
    # which is what list.index() would return.
    positionOf = {}
    for position, vocabWord in enumerate(vocabList):
        positionOf.setdefault(vocabWord, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = positionOf.get(word)
        if position is not None:
            returnVec[position] += 1
    return returnVec

def spamTest():
    """Run a hold-out evaluation of the naive Bayes spam filter.

    Reads ``./spam/<i>.txt`` (label 1) and ``./ham/<i>.txt`` (label 0) for
    i = 1..25, moves 10 randomly chosen documents into a test set, trains on
    the remaining ones with ``trainNB0`` and classifies the test documents.
    Every misclassified document and the final error rate are printed.

    Raises:
        OSError: if one of the e-mail files cannot be opened.
    """
    docList = []
    classList = []
    for i in range(1, 26):
        # The file name follows the loop index, so each e-mail is read once
        # (reading '1.txt' on every pass would only duplicate one message).
        # Documents alternate spam, ham in docList.
        for pathPattern, label in (('./spam/%d.txt', 1), ('./ham/%d.txt', 0)):
            # NOTE(review): files are decoded with the platform default
            # encoding -- confirm it matches the e-mail files.
            with open(pathPattern % i, 'r') as emailFile:
                docList.append(textParse(emailFile.read()))
            classList.append(label)

    vocabList = createVocabSet(docList)  # create vocabulary

    # Hold out 10 random documents; the rest stay in trainingSet.
    trainingSet = list(range(len(docList)))
    testSet = []
    for _ in range(10):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    # Train the classifier on the word-count vectors of the remaining docs.
    trainMat = [bagOfWords2VecMN(vocabList, docList[docIndex])
                for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))

    # Classify the held-out documents and count the mistakes.
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print("classification error",docList[docIndex])
    print('the error rate is: ',float(errorCount)/len(testSet))

# Run the spam-filter evaluation only when this file is executed as a script.
if __name__=='__main__':
    #testingNB()
    spamTest()
本文参与 腾讯云自媒体分享计划,分享自作者个人站点/博客。
原始发表:2017.12.10 ,如有侵权请联系 cloudcommunity@tencent.com 删除

本文分享自 作者个人站点/博客 前往查看

如有侵权,请联系 cloudcommunity@tencent.com 删除。

本文参与 腾讯云自媒体分享计划  ,欢迎热爱写作的你一起参与!

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
目录
  • 文本分类:过滤恶意留言
  • 过滤垃圾邮件
领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档