今天的内容:
1:正则特殊字符
2:Python re 模块
3:实现百度爬虫
视频内源码如下:
#objURL
"""Download the first few images from a Baidu image-search result page.

The script fetches the search page, extracts every ``"objURL"`` value from
the returned HTML with a regular expression, and saves the images into the
current working directory as ``1.jpg``, ``2.gif``, ... (numbered in download
order, with the extension guessed from the image URL).
"""
import re
from urllib.request import urlopen, urlretrieve

# Alternative search pages used earlier in the lesson (kept for reference):
#url = 'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fr=&hs=0&xthttps=111111&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=TFBOYS&oq=TFBOYS&rsp=-1'
#url = 'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1504857959439_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&ctd=1504857959439%5E00_1254X613&word=%E7%BE%8E%E5%A5%B3'
url = 'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1504858948695_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E6%9A%B4%E8%B5%B0%E6%BC%AB%E7%94%BB%E5%8A%A8%E5%9B%BE'

# Captures the value of every "objURL" field embedded in the page source.
OBJ_URL_PATTERN = re.compile(r'"objURL":"(.*?)"')

# Extensions recognised in an image URL, checked in this order.
IMAGE_SUFFIXES = ('.jpg', '.gif', '.png')

# Extension used when none of IMAGE_SUFFIXES appears in the URL.
DEFAULT_SUFFIX = '.jpg'

# NOTE(review): the loop condition was truncated in the published listing
# ("if index"); a cap of 10 is inferred from the "ten images finished"
# message printed below. Keep the two in sync if this is changed.
MAX_IMAGES = 10


def fetch_page(page_url):
    """Return the HTML of *page_url* decoded as UTF-8.

    The response is closed as soon as it has been read. Undecodable bytes
    are replaced rather than raising, because only the ASCII ``objURL``
    fields are needed from the page.
    """
    with urlopen(page_url) as response:
        return response.read().decode('utf-8', errors='replace')


def extract_image_urls(page):
    """Return every ``"objURL"`` value found in *page*, in document order."""
    return OBJ_URL_PATTERN.findall(page)


def guess_suffix(link):
    """Return the file extension to use when saving *link*.

    The first entry of IMAGE_SUFFIXES that occurs literally in the URL
    (case-insensitively) wins; DEFAULT_SUFFIX is returned otherwise.
    A plain substring test is used on purpose: treating '.jpg' as a regex
    would let '.' match any character (e.g. 'xjpg').
    """
    lowered = link.lower()
    for suffix in IMAGE_SUFFIXES:
        if suffix in lowered:
            return suffix
    return DEFAULT_SUFFIX


def download_images(links):
    """Download at most MAX_IMAGES of *links* into the working directory.

    Files are named ``<n><suffix>`` where ``n`` counts successful downloads
    starting at 1. A failed download is reported and skipped without
    consuming a number. Returns the number of images saved.
    """
    index = 1
    for link in links:
        if index > MAX_IMAGES:
            print('十张图片下载结束!')
            break
        print('正在下载第%d张' % (index))
        try:
            # Relative path: urlretrieve(download link, destination file).
            urlretrieve(link, str(index) + guess_suffix(link))
        except Exception:
            # Deliberately broad: one bad link must not stop the whole run.
            print("下载失败%d张" % index)
            continue
        index += 1
    return index - 1


def main():
    """Fetch the search page, list the image links, and download them."""
    urls = extract_image_urls(fetch_page(url))
    print(urls)
    download_images(urls)


if __name__ == "__main__":
    main()
课堂笔记如下:
文章内视频及代码下载地址:
领取专属 10元无门槛券
私享最新 技术干货