这一步我们以后再介绍如何使用
这里需要注意:把程序打包成 exe 可执行文件后,jieba 自带的词典文件不会被一起打包,运行时会因为找不到词典而出错。因此需要把 jieba 库里的词典文件 dict.txt 取出来,和程序放在一起,并在导入 jieba 之后用 set_dictionary 显式指定这个文件。
import sys
# When the program is frozen into an .exe, jieba's bundled dictionary file goes
# missing. To avoid that, jieba is pointed at an explicit dictionary file
# (the relative path "dict.txt") and initialised right away, before the
# remaining imports.
import jieba
jieba.set_dictionary("dict.txt")
jieba.initialize()
import requests
from PyQt5 import QtCore, QtGui, QtWidgets
from PyQt5.QtWidgets import QApplication, QMainWindow
from bs4 import BeautifulSoup
# Part-of-speech tagging interface; used below through the alias ``peg``.
import jieba.posseg as peg
from collections import Counter
界面已经用 Qt Designer 设计好了,这里我们只需要编写功能模块。
def geHTMLText(self, url, timeout=300):
    """Download a page and return its body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout.
            Defaults to 300, the value that used to be hard-coded.

    Returns:
        The response text on success, or the literal string ``"ERROR"``
        if anything goes wrong (the exception is printed, not raised).
    """
    try:
        r = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx responses into exceptions so they hit the handler below.
        r.raise_for_status()
        # Force UTF-8 instead of relying on the encoding requests guesses.
        r.encoding = "UTF-8"
        return r.text
    except Exception as e:
        # Deliberately broad: every failure is reported to the caller through
        # the "ERROR" sentinel rather than by raising.
        print(e)
        return "ERROR"
def content(self, html):
    """Extract the article body containers from a news page.

    Args:
        html: HTML source of the page.

    Returns:
        A list with every ``<div>`` element matched by the class filter
        ``left_zw``, in document order. The elements themselves are
        returned, not their text; the list is empty when nothing matches.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # The "left_zw" class is specific to China News (http://www.chinanews.com/).
    return list(soup.find_all("div", attrs={"class": "left_zw"}))
由于 jieba 库自带词性标注功能,我们只需要把它返回的英文词性标记转换成对应的中文名称即可。
jieba库之词性分析
# Uses the alias from ``import jieba.posseg as peg``.
# NOTE(review): ``text`` is not defined in this snippet; it is presumably the
# string to analyse, supplied by the surrounding code.
words = peg.cut(text)
for word, flag in words:
    print(str(flag))  # flag is the part-of-speech tag, an English abbreviation
def JudgeWord(self, flag):
    """Translate a part-of-speech tag into its Chinese name.

    Args:
        flag: Part-of-speech tag string, e.g. ``"n"`` or ``"vn"``.

    Returns:
        The Chinese name of the part of speech, or ``None`` when the tag is
        not in the table.
    """
    # Built inside the function so the block stays self-contained whether it
    # lives at module level or inside a class body.
    names = {
        "a": "形容词",
        "ad": "副形词",
        "ag": "形语素",
        "an": "名形词",
        "b": "区别词",
        "c": "连词",
        "dg": "副语素",
        "d": "副词",
        "e": "叹词",
        "f": "方位词",
        "g": "语素",
        "h": "前接成分",
        "i": "成语",
        "j": "简称略语",
        "k": "后接成分",
        "l": "习用语",
        "m": "数词",
        # The original table only had the capitalised "Ng", unlike its
        # lowercase siblings "dg"/"tg"/"vg". Both spellings are accepted so
        # the lowercase tag is no longer reported as unknown.
        "Ng": "名语素",
        "ng": "名语素",
        "n": "名词",
        "nr": "人名",
        "ns": "地名",
        "nt": "机构团体",
        "nz": "其他专名",
        "o": "拟声词",
        "p": "介词",
        "q": "量词",
        "r": "代词",
        "s": "处所词",
        "tg": "时语素",
        "t": "时间词",
        "u": "助词",
        "vg": "动语素",
        "v": "动词",
        "vd": "副动词",
        "vn": "名动词",
        "w": "标点符号",
        "x": "非语素字",
        "y": "语气词",
        "z": "状态词",
        "un": "未知词",
    }
    return names.get(flag)
# Listener for the part-of-speech analysis action.
def getSpeech(self):
    """Tag the text in ``self.textEdit`` and write the counts to ``self.textEdit_2``.

    The input text is segmented with ``peg.cut``. Two tallies are kept: how
    often each word occurs, and how often each part of speech occurs (using
    the Chinese names from ``self.JudgeWord``). ``self.textEdit_2`` is cleared
    first, then receives the part-of-speech tally, two separator lines and
    the word tally.
    """
    text = str(self.textEdit.toPlainText())
    self.textEdit_2.setText("")

    word_counts = Counter()
    pos_counts = Counter()
    for word, flag in peg.cut(text):
        # Strip line breaks so a token never spills over several output lines.
        cleaned = str(word).replace('\n', '').replace('\r', '')
        word_counts[cleaned] += 1
        # str() keeps unmapped tags countable: JudgeWord returns None for
        # them, so they are tallied under the key "None".
        pos_counts[str(self.JudgeWord(str(flag)))] += 1

    self.textEdit_2.append("各种单词计数如下:")
    for k, v in pos_counts.items():
        self.textEdit_2.append("词性:{0} ———— 个数:{1}".format(k, v))
    self.textEdit_2.append("=======================================================")
    self.textEdit_2.append("=======================================================")
    self.textEdit_2.append("计数如下:")
    for k, v in word_counts.items():
        self.textEdit_2.append("词语:{0} ———— 个数{1}".format(k, v))
绑定操作:
# Attach a click listener to the crawl button.
# NOTE(review): the handler ``getClinkde`` is not defined in this excerpt;
# confirm the name against the full source.
self.pushButton.clicked.connect(self.getClinkde)
声明:我的博客即将同步至腾讯云+社区,邀请大家一同入驻:https://cloud.tencent.com/developer/support-plan?invite_code=23z6dnotw0skk
如需源码:请联系作者!