无意看到一个很好玩的爬虫闯关游戏,后续会持续将每关心得贴出来游戏开始地址[1]
访问链接得到可以看到如下题目
捕获.PNG
捕获.PNG
捕获.PNG
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Level 1: follow the number shown on each page to the next page
# until the success message appears.
import requests
import re
from bs4 import BeautifulSoup


def get_html(page):
    """Fetch the challenge page for *page* (str or '') and return its HTML text."""
    url = 'http://www.heibanke.com/lesson/crawler_ex00/' + str(page)
    return requests.get(url).text


def main():
    """Chase the chain of page numbers until the success page is reached."""
    page = ''
    print('开始第一关')
    while True:
        html = get_html(page)
        if u'恭喜' in html:
            print('成功啦!可以试着将最后显示的数字填进网址里去看看哦!')
            break
        # The next page number is the last number inside the <h3> heading.
        heading = BeautifulSoup(html, "html.parser").h3.string
        page = str(re.findall(r'\d+', heading).pop())
        print(page)


if __name__ == '__main__':
    main()
成功后可以获取下一关的地址链接哦!试试吧,如果你也对python爬虫有兴趣,欢迎交流指正哦!继续第二关的爬虫闯关游戏下一关地址
访问链接得到可以看到如下题目
捕获.PNG
捕获.PNG
代码思路
:通过requests模块的post提交data参数,通过re模块抓取成功后的信息!就是这么简单!
由此开始写代码:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Level 2: brute-force a numeric password in [0, 30] via POST.
import requests
import re


def main():
    """Try every password from 0 to 30 until the server stops answering '错误'.

    BUG FIX: the original guarded the loop with ``while a > 0`` but
    *incremented* ``a`` on every failure (``a = a + 1``), so the intended
    30-attempt bound never took effect and the loop could only exit via
    ``break``. A ``for`` over ``range(31)`` tries each possible password
    exactly once and always terminates.
    """
    url = 'http://www.heibanke.com/lesson/crawler_ex01/'
    for num in range(31):
        data = {'username': 'admin', 'password': num}
        res = requests.post(url, data=data).text
        if u'错误' in res:
            print(num, '错误')
            continue
        print(num, '正确')
        # Scrape the success page for the headline and the next-level link.
        title = re.findall("<title>(.*?)</title>", res)
        word = re.findall("<h1>(.*?)</h1>", res)
        word2 = re.findall("<h3>(.*?)</h3>", res)
        html = re.findall('<a href="(.*?)" class="btn btn-primary">下一关</a>', res)
        print('\n'.join([title[0], word[0], word2[0], '下一关地址是',
                         'http://www.heibanke.com' + html[0]]))
        break


if __name__ == '__main__':
    main()
过关标识
轻松过关有木有!如果你也对python爬虫有兴趣,欢迎交流指正哦!继续第三关的爬虫闯关游戏下一关地址
访问链接得到可以看到如下题目:
捕获.PNG
捕获.PNG
捕获.PNG
登录时的post表单参数
csr参数
捕获.PNG
代码思路
:由此总结可知,通过request模块访问login页面,获取csr随机参数,再带入账号密码一起post提交表单登入,登入之后再获取csr固定参数,再带入账号,和随机的30以内的数字密码,只要密码对了就过关了!
由此开始写代码:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Level 3: CSRF-protected login, then brute-force the password.
import requests
import re


def main():
    """Log in through the CSRF-protected form, then try passwords 0..30.

    BUG FIX: the original assigned ``pass_data['password'] = pas`` with the
    variable name broken across two lines ('pas' / 'swd') — a NameError at
    runtime; it must be the loop variable ``passwd``. The unused
    ``result = re.findall(...)`` line was also removed.
    """
    login_data = {'username': 'user', 'password': 'password'}
    url = 'http://www.heibanke.com/lesson/crawler_ex02/'
    login_url = 'http://www.heibanke.com/accounts/login/?next=/lesson/crawler_ex02/'
    # GET the login page first to obtain the per-session csrftoken cookie.
    r2 = requests.get(login_url)
    c2 = r2.cookies
    login_data['csrfmiddlewaretoken'] = c2['csrftoken']
    # POST credentials; redirects disabled so the response keeps its cookies.
    r3 = requests.post(login_url, data=login_data, allow_redirects=False, cookies=c2)
    c3 = r3.cookies
    pass_data = {'username': 'user', 'csrfmiddlewaretoken': c3['csrftoken']}
    for passwd in range(31):
        pass_data['password'] = passwd
        text = requests.post(url, pass_data, cookies=c3).text
        if u'密码错误' in text:
            print("%s密码错误" % passwd)
        else:
            print("%s密码正确" % passwd)
            title = re.findall("<title>(.*?)</title>", text)
            word = re.findall("<h1>(.*?)</h1>", text)
            word2 = re.findall("<h3>(.*?)</h3>", text)
            html = re.findall('<a href="(.*?)" class="btn btn-primary">下一关</a>', text)
            print('\n'.join([title[0], word[0], word2[0], '下一关地址是',
                             'http://www.heibanke.com' + html[0]]))
            break


if __name__ == '__main__':
    main()
过关标识
轻松过关有木有!如果你也对python爬虫有兴趣,欢迎交流指正哦!继续第四关的爬虫闯关游戏[下一关地址](http://www.heibanke.com/lesson/crawler_ex03/)
访问链接得到可以看到如下题目
图片.png
捕获.PNG
捕获.PNG
代码思路
:由此总结可知,通过request模块访问login页面,获取csr随机参数,再带入账号密码一起post提交表单登入,登入之后访问密码列表,爬取密码位置,和对应数值,再组成100位的密码,再带入账号密码登录,只要密码对了就过关了!
由此开始写代码:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Level 4: scrape 100 (position, value) password fragments with worker
# threads, assemble the 100-digit password, and log in with it.
import requests
import re  # BUG FIX: the original called re.findall() below but never imported re
from bs4 import BeautifulSoup
import threading
from queue import Queue

dict1 = {}      # maps password position (as str) -> digit string
vlauess = []    # digits ordered by position; joined into the final password
web1 = "http://www.heibanke.com/accounts/login"
web2 = "http://www.heibanke.com/lesson/crawler_ex03/pw_list/"
web3 = "http://www.heibanke.com/lesson/crawler_ex03/"
queuewz = Queue()   # positions scraped from the password-list page
queuemm = Queue()   # values scraped from the password-list page


class mythreads(threading.Thread):
    """Worker: fetch one password-list page, then drain the shared queues."""

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        work()
        while not queuemm.empty():
            try:
                dict1[str(queuewz.get())] = queuemm.get()
                print(dict1)
                print("字典长度为%s" % len(dict1))
                if int(len(dict1)) == 100:
                    print("凑到100啦!")
                    # All positions known: assemble digits 1..100 in order.
                    for i in range(1, 101):
                        vlauess.append(dict1[str(i)])
                    c = vlauess[:100]
                    zzmm = ''.join(c)
                    print("密码为%s" % zzmm)
                    print("正在登录.......")
                    dataWebsite1 = {'username': 'user', 'password': zzmm}
                    s = login_get()
                    res = s.post(web3, data=dataWebsite1).text
                    if u'恭喜' in res:
                        title = re.findall("<title>(.*?)</title>", res)
                        word = re.findall("<h1>(.*?)</h1>", res)
                        word2 = re.findall("<h3>(.*?)</h3>", res)
                        html = re.findall('<a href="(.*?)" class="btn btn-primary">下一关</a>', res)
                        print('\n'.join([title[0], word[0], word2[0], '下一关地址是',
                                         'http://www.heibanke.com' + html[0]]))
                        break
                    else:
                        print("网页有问题哦!可以尝试手动将获得的正确密码登入进去哦!")
                        break
                else:
                    # Not all 100 positions collected yet: spawn another
                    # round of workers to fetch more pages.
                    main()
            except IndexError:
                print("例表空了,下一页!")


def login_get():
    """Create a session, log in with the CSRF token, and return the session."""
    try:
        s = requests.Session()
        r = s.get(web1)  # GET the login page to obtain the csrftoken cookie
        token1 = r.cookies['csrftoken']
        # Django expects the token echoed back as csrfmiddlewaretoken.
        dataWebsite1 = {'username': 'user',
                        'password': 'password',
                        'csrfmiddlewaretoken': token1
                        }
        res = s.post(web1, data=dataWebsite1)
    except KeyError as e:
        # Best-effort: if the csrftoken cookie is missing, return the bare session.
        pass
    return s


def get_html(s):
    """Return the HTML of the password-list page using session *s*."""
    r = s.get(web2)
    return r.text


def get_dict(res):
    """Parse a password-list page, pushing positions and values onto the queues."""
    soup = BeautifulSoup(res, "html.parser")
    for a in soup.find_all('td', attrs={'title': 'password_pos'}):
        queuewz.put(a.string)
    for b in soup.find_all('td', attrs={'title': 'password_val'}):
        queuemm.put(b.string)


def work():
    res = get_html(s)
    get_dict(res)


def main():
    global s
    s = login_get()
    threads = []
    threads_count = 10
    for i in range(threads_count):
        threads.append(mythreads())
    for t in threads:
        t.start()
    for t in threads:
        t.join()


if __name__ == '__main__':
    main()
账号为user,密码可得为:9263152516931477925579303326531490163668774566288173618538253675264972338183086744513535774895069524
登录过关标识
这里线程我给的是10,访问速度不算快,但比单线程快很多哦,写的有点杂乱,继续第五关的爬虫闯关游戏[下一关地址](http://www.heibanke.com/lesson/crawler_ex04/)
访问链接可以看到如下视图:
捕获.PNG
捕获.PNG
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Level 5 prep: download 100 captcha images as OCR training samples.
import requests
import re
import urllib.request

web_login = "http://www.heibanke.com/accounts/login/?next=/lesson/crawler_ex04/"
web_ex04 = "http://www.heibanke.com/lesson/crawler_ex04/"


def get_s():
    """Log in through the CSRF-protected form and return the session."""
    s = requests.Session()
    s.get(web_login)
    token1 = s.cookies['csrftoken']  # keep the csrftoken for the login form
    dataWebsite1 = {'username': 'user',
                    'password': 'password',
                    'csrfmiddlewaretoken': token1
                    }
    s.post(web_login, data=dataWebsite1)
    return s


def get_img(s):
    """Save 100 captcha images as 1.png .. 100.png for tesseract training."""
    for i in range(1, 101):
        res = s.get(web_ex04).text
        # BUG FIX: the original pattern '![]((.*?))' was mangled markdown
        # image syntax, not a regex. Match the captcha <img> tag's src
        # attribute (served as /captcha/image/<key>/) instead.
        img = re.findall(r'img src="(/captcha/image/.*?)" alt="captcha"', res)
        img_html = 'http://www.heibanke.com' + str(img[0])
        print("这是第%s张验证码" % i)
        urllib.request.urlretrieve(img_html, "%s.png" % i)
        # print(img_html)


def main():
    s = get_s()
    get_img(s)


if __name__ == '__main__':
    main()
以下是需要用到的cmd命令我把它整理成bat文件
rem Before running this batch, create a font_properties file in this directory with the content: font 0 0 0 0 0
echo Run Tesseract for Training..
rem Generate the .tr training file from the tif/box sample pair
tesseract.exe num.font.exp0.tif num.font.exp0 nobatch box.train
echo Compute the Character Set..
rem Extract the set of characters seen in the box file
unicharset_extractor.exe num.font.exp0.box
rem Shape/feature training over the extracted character set
mftraining.exe -F font_properties -U unicharset -O num.unicharset num.font.exp0.tr
echo Clustering..
cntraining.exe num.font.exp0.tr
echo Rename Files..
rem Prefix the generated data files with the language name "num"
rename normproto num.normproto
rename inttemp num.inttemp
rename pffmtable num.pffmtable
rename shapetable num.shapetable
echo Create Tessdata..
rem Combine the num.* files into a single num.traineddata
combine_tessdata.exe num.
echo. & pause
代码思路
:由此总结可知,通过request模块访问login页面,获取csr随机参数,再带入账号密码一起post提交表单登入,登入之后下载验证码用pytesseract模块调用tesseract识别验证码,并提交给captha_1参数,再加上csr和captha_0参数以post方式访问,password从0开始遍历数字,只要密码对了就过关了!
由此开始写代码:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Level 5: OCR the captcha with pytesseract and brute-force the password.
import requests
import re
import urllib.request
import pytesseract
from PIL import Image
import sys

web_login = "http://www.heibanke.com/accounts/login/?next=/lesson/crawler_ex04/"
web_ex04 = "http://www.heibanke.com/lesson/crawler_ex04/"


def get_s():
    """Log in through the CSRF-protected form and return the session."""
    s = requests.Session()
    s.get(web_login)
    token1 = s.cookies['csrftoken']  # keep the csrftoken for the login form
    dataWebsite1 = {'username': 'user',
                    'password': 'password',
                    'csrfmiddlewaretoken': token1
                    }
    s.post(web_login, data=dataWebsite1)
    return s


def cleanyzm(yzm, s, num):
    """Check the OCR result; retry the whole attempt unless it is 4 chars."""
    if len(yzm.replace(' ', '')) != 4:
        print("打回")
        get_img(s, num)
    else:
        yzm = yzm.upper()
        print(yzm)
        return yzm


def get_img(s, num):
    """Attempt password *num*: fetch the captcha, OCR it, POST the guess.

    Recurses with num+1 on a wrong password and the same num on a wrong
    captcha; exits the process on success.
    """
    res = s.get(web_ex04).text
    # The captcha key; the image itself is served at /captcha/image/<key>/.
    cshu = re.findall(r'img src="/captcha/image/(.*)/" alt="captcha"', res)[0]
    # BUG FIX: the original extracted the image URL with the mangled pattern
    # '![]((.*?))' (markdown image syntax, not a regex); rebuild the URL from
    # the captcha key instead.
    img_html = 'http://www.heibanke.com/captcha/image/' + cshu + '/'
    print("验证码下载完毕")
    urllib.request.urlretrieve(img_html, "yzm.png")
    yzm = pytesseract.image_to_string(Image.open('yzm.png'), lang='num')
    token1 = s.cookies['csrftoken']
    yzm = cleanyzm(yzm, s, num)
    dataWebsite1 = {'username': 'user',
                    'password': num,
                    'csrfmiddlewaretoken': token1,
                    'captcha_0': cshu,
                    'captcha_1': yzm
                    }
    res = s.post(web_ex04, data=dataWebsite1).text
    if u'密码错误' in res:
        print(num, "密码错误")
        num = num + 1
        get_img(s, num)
    else:
        if u'验证码输入错误' in res:
            print(num, "验证码输入错误!")
            get_img(s, num)
        else:
            print(num, "成功!")
            title = re.findall("<title>(.*?)</title>", res)
            word = re.findall("<h1>(.*?)</h1>", res)
            word2 = re.findall("<h3>(.*?)</h3>", res)
            print('\n'.join([title[0], word[0], word2[0]]))
            sys.exit(0)


def main():
    s = get_s()
    # global num
    num = 0  # starting password guess; the program exits on success
    get_img(s, num)


if __name__ == '__main__':
    main()
账号为user密码可得为一个二位数字
登录过关标识
这里我用的是pytesseract模块,识别率也还不是很理想,不断循环直至验证码读对为止,还好下载验证码不会刷新,不然就只能用selenium模块进行操控浏览器进行模拟登陆了,这也不乏一种思路哦!感兴趣的朋友可以试试,如果你也对python爬虫有兴趣,欢迎交流指正,相互学习哦!
[1]
游戏开始地址: http://www.heibanke.com/lesson/crawler_ex00/