此进阶篇相较于前一篇每个item多了工作职责(jobDescription),工作要求(jobRequirement)两个字段。 另外从技术的角度上来说,前一篇在tencent.py文件中只有一个parse函数,此进阶篇要完成链接的跳转,在跳转后新的页面中爬取内容,有3个parse函数。
进入powershell.png
点击在此处打开Powershell窗口,可以实现基于当前目录打开powershell
正确新建项目.png
这个命令起到的效果是新建了一个工程名为TencentJob2的工程目录。
项目文件结构缩略图
上图是整个工程的缩略图。
from scrapy import Field
import scrapy
class Tencentjob2Item(scrapy.Item):
    """One Tencent job posting, including the two detail-page fields."""
    # Fields scraped from the listing page.
    jobName = scrapy.Field()
    jobType = scrapy.Field()
    recruitmentNumber = scrapy.Field()
    workplace = scrapy.Field()
    publishTime = scrapy.Field()
    # Fields scraped from each job's detail page (new in this version).
    jobDescription = scrapy.Field()
    jobRequirement = scrapy.Field()
import scrapy
from TencentJob2.items import Tencentjob2Item
from scrapy.http import Request
class TencentSpider(scrapy.Spider):
    """Crawl Tencent HR job listings and follow each posting to its detail page.

    Flow: ``parse`` discovers how many listing pages exist and schedules them;
    ``parse1`` scrapes one listing page and follows every job link;
    ``parse2`` scrapes the detail page and yields the completed item.
    """
    name = 'tencent'
    allowed_domains = ['hr.tencent.com']
    start_urls = ['https://hr.tencent.com/position.php?&start=0']

    def parse(self, response):
        """Read the pagination bar for the last page number, then schedule
        every listing page (each page holds 10 jobs, hence the '{}0' suffix)."""
        pages = response.xpath("//div[@class='pagenav']/a[last()-1]/text()").extract()
        if not pages:
            # Pagination bar missing (layout change / empty result) --
            # the original raised IndexError here; bail out instead.
            return
        maxPage = pages[0]
        baseUrl = 'https://hr.tencent.com/position.php?&start={}0'
        for i in range(int(maxPage)):
            # dont_filter: page 0 was already fetched via start_urls, so the
            # dupe filter would otherwise drop it.
            yield Request(baseUrl.format(i), callback=self.parse1, dont_filter=True)

    def parse1(self, response):
        """Scrape one listing page; emit a detail-page request per job,
        carrying the half-filled item along in request meta."""
        def find(pNode, xpath):
            # First match as text, or '' when the node is absent.
            if len(pNode.xpath(xpath)):
                return pNode.xpath(xpath).extract()[0]
            else:
                return ''
        job_list = response.xpath("//tr[@class='odd' or @class= 'even']")
        for job in job_list:
            item = Tencentjob2Item()
            item['jobName'] = find(job, "td[1]/a/text()")
            item['jobType'] = find(job, "td[2]/text()")
            item['recruitmentNumber'] = find(job, "td[3]/text()")
            item['workplace'] = find(job, "td[4]/text()")
            item['publishTime'] = find(job, "td[5]/text()")
            details_url = "https://hr.tencent.com/" + find(job, "td[1]/a/@href")
            yield Request(details_url, meta={'item': item}, callback=self.parse2)

    def parse2(self, response):
        """Scrape the description/requirement bullet lists on the detail page
        and return the item started in parse1."""
        def contentList2str(contentList):
            # Empty extraction (missing section) -> '' instead of the
            # IndexError the original hit on contentList[0][0].
            if not contentList:
                return ''
            result = ''
            if contentList[0][0] != '1':
                # Bullets are not numbered on the page: number them ourselves.
                # NOTE: this branch keeps the original's trailing newline.
                for a, b in zip(range(len(contentList)), contentList):
                    result += str(a + 1) + '.' + b + '\n'
            else:
                result = '\n'.join(contentList)
            return result
        item = response.meta['item']
        item['jobDescription'] = contentList2str(response.xpath("//table/tr[3]/td/ul/li/text()").extract())
        item['jobRequirement'] = contentList2str(response.xpath("//table/tr[4]/td/ul/li/text()").extract())
        return item
import pandas as pd
class Tencentjob2Pipeline(object):
    """Buffer every scraped item and dump them to an Excel file on close."""

    def __init__(self):
        # Per-instance buffer. The original used a class-level list, which is
        # shared by every instance of the pipeline (mutable class attribute).
        self.job_list = []

    def process_item(self, item, spider):
        """Store a plain-dict copy of the item; Scrapy expects it returned."""
        self.job_list.append(dict(item))
        return item

    def close_spider(self, spider):
        """Write the buffered items to an Excel workbook.

        Skips writing when nothing was scraped -- the original crashed with
        IndexError on ``self.job_list[0]`` for an empty run.
        """
        if not self.job_list:
            return
        df = pd.DataFrame(self.job_list)
        df.to_excel('腾讯社会招聘(详细版).xlsx',
                    columns=list(self.job_list[0].keys()))
#ITEM_PIPELINES = {
# 'TencentJob2.pipelines.Tencentjob2Pipeline': 300,
#}
改为:
ITEM_PIPELINES = {
'TencentJob2.pipelines.Tencentjob2Pipeline': 300,
}
修改的作用就是使工程知道调用名为Tencentjob2Pipeline的管道。 第2个修改,下面这一段
#CONCURRENT_REQUESTS_PER_DOMAIN = 96
#CONCURRENT_REQUESTS_PER_IP = 96
改为:
CONCURRENT_REQUESTS_PER_DOMAIN = 96
CONCURRENT_REQUESTS_PER_IP = 96
修改的作用是增大并发请求数量(Scrapy 基于异步 I/O,并非多线程),使程序能够尽快运行完成。
提示:
excel结果截图.png