Off to Ganji~~~
Since I didn't add any time.sleep, the site's anti-scraping kicks in; when a category hangs I usually hit Ctrl+C to stop crawling it and move on......
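A small random delay between requests would probably have avoided most of the blocking. A minimal sketch (`polite_get` is a hypothetical helper, and the 1-3 second range is just a guess):

```python
import random
import time

import requests


def polite_get(url):
    # Sleep 1-3 seconds before each request so the site's anti-scraping
    # is less likely to trigger; the exact range is an arbitrary guess.
    time.sleep(random.uniform(1, 3))
    return requests.get(url, timeout=2)
```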
If a second-level page is crawled successfully, the page name is printed; otherwise the page where crawling stopped is printed:
```text
end at /jiaju/o39/
/rirongbaihuo/ end at 168
Fail to Get Info fromhttp://cs.ganji.com/shouji/2245928138x.htm
Fail to Get Info fromhttp://cs.ganji.com/shouji/2245928138x.htm
Fail to Get Info fromhttp://cs.ganji.com/shouji/2201642961x.htm
Fail to Get Info fromhttp://cs.ganji.com/shouji/2201642961x.htm
Fail to Get Info fromhttp://cs.ganji.com/shouji/2178792516x.htm
Fail to Get Info fromhttp://cs.ganji.com/shouji/2178792516x.htm
```

Other than that, no error was reported???? Opening the pages above, it turns out those items had already been sold.
```python
import requests
from bs4 import BeautifulSoup


def GetSoup(url, buquan=1):
    if buquan:
        # Some pages are stored as relative paths, so prepend the site prefix.
        wb_data = requests.get((BaseUrl + url), timeout=2)
    else:
        wb_data = requests.get(url)
    wb_data.encoding = 'utf8'  # without this line the text comes back garbled!!!
    if wb_data.status_code == 200:
        soup = BeautifulSoup(wb_data.text, 'lxml')
        return soup
    else:
        print('Fail to Get Info from' + url)
        return None
```
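GetSoup and the functions below also rely on BaseUrl and the MongoDB collections FirstUrl, SecondUrl and Info, which the post never shows. A minimal setup sketch; the database and collection names here are assumptions:

```python
import pymongo

BaseUrl = 'http://cs.ganji.com'  # site prefix for relative links (taken from the log URLs)

client = pymongo.MongoClient('localhost', 27017)
db = client['ganji']          # database name is an assumption
FirstUrl = db['first_url']    # channel (category) links
SecondUrl = db['second_url']  # item links collected from the list pages
Info = db['info']             # parsed item details
```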
```python
TypeSel = '#wrapper > div.content > div > div > dl > dt > a'


def GetChannal(url):
    soup = GetSoup(url)
    if soup is None:
        return None
    else:
        types = soup.select(TypeSel)
        for type in types:
            href = type.get('href')
            title = type.get_text()
            FirstUrl.insert_one(dict(title=title, href=href))
```
```python
GoodSel = '#wrapper > div.leftBox > div.layoutlist > dl > dd.feature > div > ul > li > a'
ThirdSet = set()


def GetGoods(url):
    soup = GetSoup(url)
    if soup != None:
        goods = soup.select(GoodSel)
        for good, p in zip(goods, range(1, 10)):
            title = good.get_text().strip()
            href = good.get('href')
            data = dict(title=title, href=href)
            if data['href'] in ThirdSet:
                return False
            else:
                ThirdSet.add(data['href'])
                SecondUrl.insert_one(data)
        return True
    else:
        return False


def GetGoodsUrl():
    for up in FirstUrl.find():
        base = up['href']
        for p in range(1, 10000):
            st = base + 'o' + str(p) + '/'
            try:
                if GetGoods(st) == False:
                    print(base, 'end at', str(p))  # this category is finished
                    break
            except:
                print('error in page', st)  # still a bug here
                pass
```
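For reference, this is roughly how the two stages chain together; the '/wu/' channel-index path is only an assumed example (any city's second-hand index page on Ganji would play this role):

```python
if __name__ == '__main__':
    # Stage 1: collect the category links (index path is an assumed example;
    # GetSoup prepends BaseUrl because buquan defaults to 1).
    GetChannal('/wu/')
    # Stage 2: walk every category page by page and store the item links.
    GetGoodsUrl()
```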
I really should have used multiprocessing....
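A process pool over the category links would be the obvious way to do that; a rough sketch of what it might look like, not what was actually run:

```python
from multiprocessing import Pool


def crawl_channel(up):
    # Same per-category loop as GetGoodsUrl, but for a single channel document.
    # Note: ThirdSet and the MongoDB client are not shared between processes,
    # so each worker only dedups within itself.
    base = up['href']
    for p in range(1, 10000):
        st = base + 'o' + str(p) + '/'
        try:
            if GetGoods(st) == False:
                print(base, 'end at', str(p))
                break
        except:
            print('error in page', st)


if __name__ == '__main__':
    with Pool(4) as pool:
        pool.map(crawl_channel, list(FirstUrl.find()))
```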
Still, since the work is split into several functions, a crash doesn't hurt too badly..
The place field and the new/used category are quite a pain QAQ
Missing fields make a whole batch of pages fail (for example, many jiaju pages have no new/used info...)
So I removed the try and debugged on a small sample first..
```python
ttSel = '#wrapper > div.content.clearfix > div.leftBox > div.col-cont.title-box > h1'
tmSel = '#wrapper > div.content.clearfix > div.leftBox > div.col-cont.title-box > div > ul.title-info-l.clearfix > li > i'
tpSel = '#wrapper > div.content.clearfix > div.leftBox > div > div > ul > li > span > a'
pcSel = '#wrapper > div.content.clearfix > div.leftBox > div > div > ul > li > i.f22.fc-orange.f-type'
plSel = '#wrapper > div.content.clearfix > div.leftBox > div > div > ul > li > a'
newSel = '#wrapper > div.content.clearfix > div.leftBox > div > div.det-summary > div > div.second-dt-bewrite > ul > li'


def GetGoodfInfo(url):
    soup = GetSoup(url, buquan=0)
    if soup != None:
        titles = soup.select(ttSel)[0].get_text()
        timers = soup.select(tmSel)[0].get_text().split('\\')[0].strip().split('\xa0')[0]
        types = soup.select(tpSel)[5].get_text()
        prices = soup.select(pcSel)[0].get_text()
        places = soup.select(plSel)
        place = ''.join(places[i].get_text() for i in range(1, 4))
        news = soup.select(newSel)[0].get_text().split(':')[1].replace('\n', '').strip()
        # print('place', place)
        # print('type', types)
        data = dict(title=titles, time=timers, type=types, price=prices, place=place, new=news)
        # print(data)
        Info.insert_one(data)
```
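Since pages such as the jiaju ones often lack the new/used row entirely, every hard-coded [0] or [5] index above can raise an IndexError and kill the whole record. A hypothetical guard (not part of the original script) would keep one missing field from failing the page:

```python
def safe_first_text(soup, selector, default=''):
    # Return the text of the first match, or a default when the page
    # simply does not have that field (e.g. no new/used row on jiaju pages).
    found = soup.select(selector)
    return found[0].get_text() if found else default
```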
```python
try:
    GetGoodfInfo(url['href'])
except Exception as e:
    print(str(e), 'fail to get ', url['href'])
    pass
```
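That try presumably sits inside a loop over the item links stored earlier; the loop itself isn't shown in the post, but it would look something like:

```python
for url in SecondUrl.find():
    try:
        GetGoodfInfo(url['href'])
    except Exception as e:
        print(str(e), 'fail to get ', url['href'])
```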
I forgot to check for 404... so confusing........ luckily I added a try before running it.
After the data volume suddenly got much bigger this time, I really felt the appeal of crawlers.
I didn't use time.sleep while crawling, and thanks to the natural latency it still didn't fail too badly.
With the try in place, when it hangs I can hit Ctrl+C and it carries on crawling the next page.
For a large crawl, crawl a few hundred items first to check that the string parsing generalizes~
But a few questions remain.
Duplicates may come from a problem in the crawl rules or from the site itself, so they need to be cleaned up. As for the 8k-URL failure, it depends on what error it reports; if there is no error message it's hard to track down. Large-scale crawling is very different from small-scale: a big crawl needs attention to many more details, which is exactly why it breaks at around 8k URLs. chmod can be applied to a whole folder by adding -R, so there's no need to change files one by one.
Also, I found the duplicates seem to be because.. I was running two py programs writing into the database at the same time..........
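A unique index on href would make that kind of duplicate impossible no matter how many processes are writing; a small sketch, assuming the collections from the setup above:

```python
import pymongo
from pymongo.errors import DuplicateKeyError

# Reject any second insert with the same href, even from a parallel writer.
SecondUrl.create_index([('href', pymongo.ASCENDING)], unique=True)


def save_good(data):
    # data is the dict(title=..., href=...) built in GetGoods.
    try:
        SecondUrl.insert_one(data)
    except DuplicateKeyError:
        pass  # already stored, possibly by the other process
```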