Python爬虫，python台湾金点设计奖数据采集源码

二爷

发布于 2020-11-25 11:01:11

4070

发布于 2020-11-25 11:01:11

文章被收录于专栏：二爷记

渣渣写的爬虫，目标也是以前就写过的一个网站：采集官方数据，应用 python 进行数据采集抓取，同时进行了 try...except 报错处理，算是可以完整运行完毕的爬虫；报错的相关记录会写入到 txt 文件中，可以在后续对因报错中断的数据进行补采集操作！

目标网址：https://www.goldenpin.org.tw/%E9%87%91%E9%BB%9E%E8%A8%AD%E8%A8%88%E7%8D%8E/?y=2020

想要获取一个网站的数据，在排除反爬限制的前提下，关键就是对请求访问中断的处理和完善：偶尔出现的服务器访问请求中断或者报错，以及节点获取的错误，都应该考虑进去，防止爬虫中断。当然，采集数据建议还是连接数据库保存相关数据信息内容，不管是后面补采集还是其他操作，处理起来都更加方便！

几个关键点

requests访问超时封装处理

# HTTPAdapter carries the retry setting used by the session below.
from requests.adapters import HTTPAdapter

# One shared session; an adapter with max_retries=5 is mounted for both
# URL schemes so every request made through it gets the same retry setting.
self.s = requests.Session()
self.s.mount('http://', HTTPAdapter(max_retries=5))
self.s.mount('https://', HTTPAdapter(max_retries=5))

更改 max_retries 的数值，即可设置请求失败时的重试次数！

格式化数据

# Walk both lists in lockstep, pairing each link with the category at the
# same position.
for href,category in zip (hrefs,categorys):
    print(href,category)

for ... in zip(...) 的用法可自行百度参考：zip 会把两个列表按位置一一配对，配对数量以较短的列表为准！

标题格式化数据处理

# Take the first text node of <h1 class="entry-title"> as the page title.
title=req.xpath('//h1[@class="entry-title"]/text()')[0]
# Character class matching / \ : * ? " < > |, which must not appear in the
# file name derived from the title.
pattern = r"[\/\\\:\*\?\"\<\>\|]"
h1 = re.sub(pattern, "_", title)  # replace each match with an underscore

标题作为数据存储的文件名，需要进行格式化处理，将非法字符替换为下划线！

完整源码参考：

#金点奖设计采集
#20201118 @author：WX：huguo00289

# -*- coding: UTF-8 -*-
import requests,re,time
from fake_useragent import UserAgent
import json,os
from lxml import etree
from requests.adapters import HTTPAdapter


class Gd(object):
    """Crawler for the Golden Pin Design Award site (goldenpin.org.tw).

    Listing pages are requested from the site's ``admin-ajax.php`` endpoint.
    For every work linked from a listing page the detail page is fetched, its
    text is appended to ``<category>.txt`` and its images are saved under
    ``<category>/<title>/``.  Failures at each stage are appended to
    ``list_fail.txt``, ``href_fail.txt``, ``imgs_fail.txt`` or
    ``img_fail.txt`` so they can be collected again later.
    """

    def __init__(self):
        """Prepare request headers, the AJAX endpoint and a retrying session."""
        self.ua = UserAgent()
        self.headers = {
            # A single random User-Agent is chosen here and reused afterwards.
            'User-Agent': self.ua.random,
        }
        self.url = "https://www.goldenpin.org.tw/ft-admin/admin-ajax.php"
        # Mount an adapter with max_retries=5 for both URL schemes.
        self.s = requests.Session()
        self.s.mount('http://', HTTPAdapter(max_retries=5))
        self.s.mount('https://', HTTPAdapter(max_retries=5))

    def get_content(self, i):
        """Fetch listing page ``i`` and process every work linked from it.

        Args:
            i: Page number sent as ``targetPage``.

        Returns:
            The number of detail-page links found on the page; ``0`` when the
            response is not flagged as successful or contains no links.

        Raises:
            requests.HTTPError: If the endpoint answers with an error status.
        """
        data = {
            'action': 'presscore_template_ajax',
            'postID': '15317',
            'paged': 1,
            'targetPage': i,
            'term': '',
            'orderby': '',
            'order': '',
            # NOTE(review): hard-coded nonce; presumably time-limited on the
            # server side -- confirm it is still accepted before a new run.
            'nonce': '004811f2a4',
            'contentType': 'portfolio',
            'pageData[type]': 'page',
            'pageData[template]': 'portfolio',
            'pageData[layout]': 'masonry',
            'sender': 'more',
        }
        response = self.s.post(self.url, timeout=8, headers=self.headers, data=data)
        # Surface 4xx/5xx as an exception so the caller records the page for
        # a retry instead of JSON-decoding an error document.
        response.raise_for_status()
        req = json.loads(response.content.decode('utf-8'))
        print(req['success'])
        if not req['success']:
            return 0

        html = req['html']
        hrefs = re.findall(r'<a target="_blank"  href="(.+?)" class="alignnone rollover layzr-bg"', html, re.S)
        print(len(hrefs))
        categorys = re.findall(r'<div class="entry-meta portfolio-categories"><span class="category-link"><a href=".+?" >(.+?)</span></div></div>', html, re.S)
        print(len(categorys))
        if len(hrefs) != len(categorys):
            # zip() stops at the shorter list, so a mismatch silently drops
            # items and may pair links with the wrong category.
            print(f'>> 警告：第 {i} 页链接数({len(hrefs)})与分类数({len(categorys)})不一致，链接与分类可能错位')
        for href, category in zip(hrefs, categorys):
            print(href, category)
            try:
                self.parse(href, category)
            except Exception as e:
                # Best effort: log the failed work and keep going.
                print(f'采集网页：{href} 出错，错误代码：{e}')
                with open('href_fail.txt', 'a+', encoding='utf-8') as f:
                    f.write(f'{href},{category}\n')
        return len(hrefs)

    def parse(self, href, category):
        """Fetch one detail page, store its text and download its images.

        Args:
            href: URL of the detail page.
            category: Category label; used as the text-file name and as the
                top-level image directory.

        Raises:
            requests.HTTPError: If the page answers with an error status.
            ValueError: If the page has no ``h1.entry-title`` text.
        """
        print(f'>> 正在采集网页 {href} 数据..')
        response = self.s.get(href, headers=self.headers, timeout=8)
        response.raise_for_status()
        html = response.content.decode('utf-8')
        req = etree.HTML(html)
        titles = req.xpath('//h1[@class="entry-title"]/text()')
        if not titles:
            # A bare [0] would raise an opaque "list index out of range".
            raise ValueError(f'未找到标题节点：{href}')
        title = titles[0]
        # The title becomes a directory name, so characters that are not
        # allowed in file names are replaced with underscores.
        pattern = r"[\/\\\:\*\?\"\<\>\|]"
        h1 = re.sub(pattern, "_", title)
        path = f'{category}/{h1}/'
        infos = req.xpath('//div[@class="wpb_wrapper"]//text()')
        info = ''.join(infos)
        with open(f'{category}.txt', 'a+', encoding='utf-8') as f:
            f.write(f'{title}\n{info}\n\n')
        imgs = req.xpath('//div[@class="fancy-media-wrap layzr-bg"]/img[@class="lazy-load preload-me"]/@data-src')
        print(h1, info, imgs)
        try:
            self.downs(imgs, path)
        except Exception as e:
            print(f'获取图片：{imgs} 出错，错误代码：{e}')
            with open('imgs_fail.txt', 'a+', encoding='utf-8') as f:
                f.write(f'{imgs},{path}\n')

    def downs(self, imgs, path):
        """Download every image URL in ``imgs`` into directory ``path``.

        The directory is created if needed.  A failed download is logged to
        ``img_fail.txt`` as ``img_url,img_name,path`` and does not stop the
        remaining downloads.
        """
        os.makedirs(path, exist_ok=True)
        for img_url in imgs:
            # The last URL segment is used as the local file name.
            img_name = img_url.split('/')[-1]
            print(img_url, img_name)
            try:
                self.dwon_img(img_url, img_name, path)
            except Exception as e:
                print(f'下载图片：{img_name} 出错，错误代码：{e}')
                with open('img_fail.txt', 'a+', encoding='utf-8') as f:
                    f.write(f'{img_url},{img_name},{path}\n')

    def dwon_img(self, img_url, img_name, path):
        """Download ``img_url`` and write it to ``path``/``img_name``.

        Raises:
            requests.HTTPError: If the server answers with an error status, so
                that an error page is never saved as an image file.
        """
        print(f'>> 正在下载图片：{img_name} ..')
        r = self.s.get(img_url, timeout=8, headers=self.headers)
        r.raise_for_status()
        if path:
            # This method is also called directly when replaying failure
            # logs, so the target directory may not exist yet.
            os.makedirs(path, exist_ok=True)
        # os.path.join also works when ``path`` lacks a trailing slash.
        with open(os.path.join(path, img_name), 'wb') as f:
            f.write(r.content)
        print(f'>> 图片：{img_name} 下载完成！')

    def run(self, start=1, stop=1000):
        """Crawl listing pages ``start`` .. ``stop - 1``.

        A page that raises is appended to ``list_fail.txt`` and skipped.  The
        crawl ends early at the first page that yields no links, instead of
        requesting every remaining page number.

        Args:
            start: First page number to request.
            stop: Page number at which to stop (exclusive).
        """
        for i in range(start, stop):
            print(f'>> 正在爬取第 {i} 页数据..')
            try:
                found = self.get_content(i)
            except Exception as e:
                print(f'爬取第 {i} 页数据出错，错误代码：{e}')
                with open('list_fail.txt', 'a+', encoding='utf-8') as f:
                    f.write(f'{i}\n')
                continue
            if not found:
                print(f'>> 第 {i} 页没有数据，停止爬取')
                break



if __name__=='__main__':
    # Script entry point: build the crawler and start the crawl.
    Gd().run()

数据补采集操作源码参考：

#金点奖设计采集补采集

# -*- coding: UTF-8 -*-
from gdspider import Gd


#补图片
def get_bimg_fail():
    """Re-download the images listed in ``bimg_fail.txt``.

    Each non-blank line is read as ``img_url,img_name,path``.  Blank or
    malformed lines are skipped.  A download that fails again is appended to
    ``img_fail.txt`` in the same format and the remaining records are still
    processed.
    """
    # NOTE(review): presumably a renamed copy of the crawler's img_fail.txt
    # -- confirm.
    path=r'bimg_fail.txt'
    with open(path,'r',encoding='utf-8') as f:
        img_fails=[line.strip() for line in f if line.strip()]

    print(len(img_fails))
    spider=Gd()
    for img_fail in img_fails:
        # Split on the first two commas only: everything after them is the
        # save directory, which may itself contain commas.
        fields=img_fail.split(',',2)
        print(fields)
        if len(fields)<3:
            print(f'>> 跳过格式不正确的记录：{img_fail}')
            continue
        img_url,img_name,save_dir=fields
        try:
            spider.dwon_img(img_url,img_name,save_dir)
        except Exception as e:
            # Keep going; record the failure so it can be replayed again.
            print(f'下载图片：{img_name} 出错，错误代码：{e}')
            with open('img_fail.txt','a+',encoding='utf-8') as f:
                f.write(f'{img_url},{img_name},{save_dir}\n')



#补连接
def get_href_fail():
    """Re-parse the detail pages listed in ``bhref_fail.txt``.

    Each non-blank line is read as ``href,category``.  When the line contains
    ``<a href=`` the category is rebuilt from the last comma-separated field.
    Blank lines and lines without a category are skipped.  A page that fails
    again is appended to ``href_fail.txt`` and the remaining records are
    still processed.
    """
    path=r'bhref_fail.txt'
    with open(path,'r',encoding='utf-8') as f:
        href_fails=[line.strip() for line in f if line.strip()]

    print(len(href_fails))
    spider=Gd()
    for record in href_fails:
        fields=record.split(",")
        href=fields[0]

        if "<a href=" in record:
            # The logged category still contains link markup: keep what
            # follows the last '/' of the last field and drop the '" >' left
            # over from the tag.
            category=fields[-1].split('/')[-1]
            category=category.strip()
            category = category.replace('" >', '')
        elif len(fields)>1:
            category=fields[1]
        else:
            print(f'>> 跳过格式不正确的记录：{record}')
            continue

        href=href.replace(' https','https')
        print(href,category)
        try:
            spider.parse(href,category)
        except Exception as e:
            # Keep going; record the failure so it can be replayed again.
            print(f'采集网页：{href} 出错，错误代码：{e}')
            with open('href_fail.txt','a+',encoding='utf-8') as f:
                f.write(f'{href},{category}\n')



def get_blist():
    """Re-crawl the listing pages whose numbers are listed in ``blist_fail.txt``.

    Each non-blank line is passed to ``Gd.get_content`` as the page number.
    Blank lines are skipped.  A page that fails again is appended to
    ``list_fail.txt`` and the remaining pages are still processed.
    """
    path = r'blist_fail.txt'
    with open(path, 'r', encoding='utf-8') as f:
        blist_fails = [line.strip() for line in f if line.strip()]

    print(len(blist_fails))
    spider = Gd()
    for blist_fail in blist_fails:
        print(blist_fail)
        try:
            spider.get_content(blist_fail)
        except Exception as e:
            # Keep going; record the failure so it can be replayed again.
            print(f'爬取第 {blist_fail} 页数据出错，错误代码：{e}')
            with open('list_fail.txt', 'a+', encoding='utf-8') as f:
                f.write(f'{blist_fail}\n')




if __name__=='__main__':
    # Run one re-collection step per invocation; uncomment the step that
    # matches the failure log being replayed.
    get_bimg_fail()
    #get_href_fail()
    #get_blist()