Today we'll build a simple search engine with Python.
A search engine really boils down to three things: preprocessing the data, tokenizing it to build an index, and querying that index.
(Throughout, we assume all data is UTF-8 encoded.)
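Before diving in, here is the core idea of the inverted index we'll end up with, in a few lines of plain Python (toy data, just to illustrate the structure we'll later store in MySQL):

docs = {1: 'python search engine', 2: 'python web crawler'}   # toy corpus
index = {}
for doc_id, text in docs.items():
    for term in text.split():
        index.setdefault(term, []).append(doc_id)
print(index)
# {'python': [1, 2], 'search': [1], 'engine': [1], 'web': [2], 'crawler': [2]}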
First, we collect every URL reachable from a site:
import urllib.request
import urllib.parse
import bs4

def crawl(pages, depth=2):
    collected = set()
    for i in range(depth):
        newpages = set()
        for page in pages:
            try:
                c = urllib.request.urlopen(page)
            except:
                print('Invalid page:', page)
                continue
            soup = bs4.BeautifulSoup(c.read(), 'html.parser')
            links = soup('a')
            for link in links:
                if 'href' in dict(link.attrs):
                    url = urllib.parse.urljoin(page, link['href'])
                    if url.find("'") != -1:
                        continue
                    url = url.split('#')[0]      # drop the fragment part
                    if url[:4] == 'http':        # keep only http(s) links
                        newpages.add(url)
        pages = newpages
        collected |= newpages
    return collected
The loop grabs every link on the current page, and we follow as many of them as we can. We use a set rather than a list so duplicate URLs are discarded automatically. The crawled URLs can then be stored in a file, in MySQL, or in MongoDB.
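For example, a hypothetical run seeded with one start page, dumping the collected URLs to a text file (the seed URL and the file name are placeholders):

urls = crawl({'http://example.com'}, depth=2)   # hypothetical seed page
with open('crawled_urls.txt', 'w') as f:        # hypothetical output file
    for u in sorted(urls):
        f.write(u + '\n')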
Assuming each fetched page has also been saved to disk, we next write the path of every saved file into lujing.txt by redirecting stdout (GetFileList is a helper the original doesn't show; a sketch follows below):

import sys

output = sys.stdout                  # remember the real stdout
outputfile = open('lujing.txt', 'w')
sys.stdout = outputfile              # everything printed now lands in lujing.txt
filelist = GetFileList(lujing, [])   # lujing: root directory of the saved pages
sys.stdout = output                  # restore stdout
outputfile.close()
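A minimal sketch of such a GetFileList, under the assumption that it recursively prints and collects every file path below a directory:

import os

def GetFileList(rootdir, fileList):
    # Walk rootdir recursively; print each file path (captured in
    # lujing.txt through the stdout redirect) and collect it too.
    for root, dirs, files in os.walk(rootdir):
        for name in files:
            path = os.path.join(root, name)
            print(path)
            fileList.append(path)
    return fileList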
Now read lujing.txt back and strip the HTML tags from every file it lists:
import re
import chardet

for line in open("lujing.txt"):
    print(line)
    # These slice offsets depend on the fixed layout of the paths in lujing.txt
    line1 = line[:12]
    line2 = line[13:16]
    line3 = line[17:-1]
    line4 = line[17:-6]              # bare file name, used for the output file
    line = line1 + '\\' + line2 + '\\' + line3
    print(line4)
    path = line
    fb = open(path, "rb")
    data = fb.read()
    bianma = chardet.detect(data)['encoding']   # detect this file's encoding
    page = open(line, 'r', encoding=bianma, errors='ignore').read()
    dr = re.compile(r'<[^>]+>', re.S)   # regex that matches HTML tags
    dd = dr.sub('', page)               # drop the tags, keep the text
    print(dd)
    fname = 'TXT' + "\\" + line4 + ".txt"
    # Write the tag-stripped text into the TXT folder under the original name
    f = open(fname, "w+", encoding=bianma)
    f.write(dd)
    f.close()
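A quick standalone check of what the tag-stripping regex does:

import re
dr = re.compile(r'<[^>]+>', re.S)
print(dr.sub('', '<html><body><p>Hello <b>world</b></p></body></html>'))
# -> Hello world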
With the plain-text files in place, we tokenize them and build the inverted index:
import jieba
import chardet
import pymysql

# If you prefer MongoDB instead of MySQL:
# from pymongo import MongoClient
# client = MongoClient('localhost', 27017)
# apiDB = client['urlDB']
# questionnaires = apiDB['weburl']
# data = list(questionnaires.find())

conn = pymysql.connect(host="localhost", user="root",
                       password="123456", db="suoyin", port=3307)
c = conn.cursor()
c.execute('drop table if exists doc')
c.execute('create table doc (id int primary key, link text)')             # doc id -> source path
c.execute('drop table if exists word')
c.execute('create table word (term varchar(25) primary key, list text)')  # term -> posting list
conn.commit()
def Fenci():
    num = 0
    for line in open("url.txt"):
        lujing = line
        print(lujing)
        num += 1
        line = line[17:-5]                    # slice offsets depend on the path format in url.txt
        line = 'TXT' + '\\' + line + '.txt'   # path of the tag-stripped text file
        print(line)
        path = line
        fb = open(path, "rb")
        data = fb.read()
        bianma = chardet.detect(data)['encoding']   # detect the file's encoding
        if bianma == 'UTF-16':
            data = data.decode('UTF-16')
            data = data.encode('utf-8')       # jieba expects str or UTF-8 bytes
        word = jieba.cut_for_search(data)
        seglist = list(word)
        print(seglist)
        c = conn.cursor()
        c.execute('insert into doc values(%s,%s)', (num, lujing))
        # Update the posting list of every token in this document
        for word in seglist:
            # Check whether this term is already in the index
            c.execute('select list from word where term=%s', (word,))
            result = c.fetchall()
            if len(result) == 0:
                # New term: its posting list starts with this doc id
                docliststr = str(num)
                c.execute('insert into word values(%s,%s)', (word, docliststr))
            else:
                # Known term: append this doc id to the posting list
                docliststr = result[0][0]
                docliststr += ' ' + str(num)
                c.execute('update word set list=%s where term=%s', (docliststr, word))
    conn.commit()
    conn.close()

Fenci()
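For reference, jieba.cut_for_search tokenizes at search granularity, emitting both the sub-words and the full word. A quick check (the exact output may vary with jieba's dictionary version):

import jieba
print(list(jieba.cut_for_search('中华人民共和国')))
# e.g. ['中华', '华人', '人民', '共和', '共和国', '中华人民共和国']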
The last step is the query:
import pymysql
import jieba
import math

conn = pymysql.connect(host="localhost", user="root",
                       password="123456", db="suoyin", port=3307)
c = conn.cursor()
c.execute('select count(*) from doc')
N = 1 + c.fetchall()[0][0]           # total document count (+1 so idf stays positive)
target = input('Enter a search query: ')
seggen = jieba.cut_for_search(target)
score = {}                           # doc id -> relevance score
for word in seggen:
    print('query term:', word)
    # Compute this term's score contribution
    tf = {}                          # doc id -> term frequency in that doc
    c.execute('select list from word where term=%s', (word,))
    result = c.fetchall()
    if len(result) > 0:
        doclist = result[0][0]
        doclist = doclist.split(' ')
        doclist = [int(x) for x in doclist]   # posting list as a list of ints
        df = len(set(doclist))                # number of distinct docs containing the term
        idf = math.log(N / df)
        print('idf:', idf)
        for num in doclist:
            if num in tf:
                tf[num] = tf[num] + 1
            else:
                tf[num] = 1
        # tf counting done; accumulate tf * idf into each document's score
        for num in tf:
            if num in score:
                score[num] = score[num] + tf[num] * idf
            else:
                score[num] = tf[num] * idf
sortedlist = sorted(score.items(), key=lambda d: d[1], reverse=True)
cnt = 0
for num, docscore in sortedlist:
    cnt = cnt + 1
    c.execute('select link from doc where id=%s', (num,))
    url = c.fetchall()[0][0]
    print("Result Ranking:", cnt)
    print('url:', url, 'match degree:', docscore)
    if cnt > 20:
        break
if cnt == 0:
    print('No result')
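To make the scoring concrete: with N = 11 (ten documents plus the smoothing 1) and a query term that appears in df = 2 documents, idf = ln(11/2) ≈ 1.70; a document containing the term tf = 3 times then scores about 3 × 1.70 ≈ 5.11 for that term, and per-term scores are summed per document. (Illustrative numbers only.)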
Done.