Crawl job listings from Tencent Recruitment and save them to a JSON file. Grab the 3,571 positions currently listed (position name, position category, headcount, work location, publish date, and detail-page link), then fetch each posting's detail-page content as well.
Project setup:
scrapy startproject tencent
cd tencent/tencent/spiders
scrapy genspider tencent_position "hr.tencent.com"
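After these commands, the generated project should look roughly like this (the exact files vary slightly by Scrapy version):

tencent/
    scrapy.cfg
    tencent/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            tencent_position.py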
Edit items.py:
import scrapy


class TencentItem(scrapy.Item):
    # position name
    position_name = scrapy.Field()
    # position category
    position_type = scrapy.Field()
    # number of openings
    people_number = scrapy.Field()
    # work location
    work_location = scrapy.Field()
    # publish date
    publish_times = scrapy.Field()
    # detail-page link
    position_link = scrapy.Field()
    # detail-page content
    detailContent = scrapy.Field()
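TencentItem behaves like a dict with a fixed set of allowed keys. A quick sanity check in a Python shell, run from the project root (illustrative only; "Backend Engineer" is just a placeholder value):

from tencent.items import TencentItem

item = TencentItem()
item['position_name'] = "Backend Engineer"   # declared fields work like dict keys
print(dict(item))                            # {'position_name': 'Backend Engineer'}
# item['salary'] = "20k"                     # would raise KeyError: undeclared field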
Edit tencent_position.py:
# -*- coding: utf-8 -*-
import scrapy
from tencent.items import TencentItem


class TencentPositionSpider(scrapy.Spider):
    name = 'tencent_position'
    allowed_domains = ['hr.tencent.com']
    base_url = "https://hr.tencent.com/"
    start_urls = ['https://hr.tencent.com/position.php']

    def parse(self, response):
        # Each job row alternates between class="even" and class="odd"
        node_list = response.xpath('//*[@class="even"] | //*[@class="odd"]')
        next_page = response.xpath('//a[@id="next"]/@href').extract_first()

        for node in node_list:
            item = TencentItem()
            item['position_name'] = node.xpath('./td[1]/a/text()').extract_first()
            item['position_link'] = node.xpath('./td[1]/a/@href').extract_first()
            item['position_type'] = node.xpath('./td[2]/text()').extract_first()
            item['people_number'] = node.xpath('./td[3]/text()').extract_first()
            item['work_location'] = node.xpath('./td[4]/text()').extract_first()
            item['publish_times'] = node.xpath('./td[5]/text()').extract_first()
            # Build the absolute detail-page URL
            detail_page = self.base_url + item['position_link']
            # Request the detail page; meta passes the partially filled item to
            # the custom detail callback, which completes and yields it. (Do not
            # also yield the item here, or each position would be saved twice.)
            yield scrapy.Request(url=detail_page, callback=self.detail,
                                 meta={"item": item})

        # <a href="position.php?&start=10#a" id="next">下一页</a>
        if next_page:  # the last page has no next link, so guard against None
            next_url = self.base_url + next_page  # build the next-page URL
            yield scrapy.Request(url=next_url, callback=self.parse)

    def detail(self, response):
        item = response.meta['item']  # recover the item passed via meta
        item['detailContent'] = "".join(
            response.xpath('//ul[@class="squareli"]/li/text()').extract())
        yield item
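Before running the full crawl, the XPath expressions can be verified interactively with scrapy shell. The site may have changed since this was written, so treat the session below as illustrative of the page structure at the time:

scrapy shell "https://hr.tencent.com/position.php"
>>> response.xpath('//*[@class="even"] | //*[@class="odd"]')
>>> response.xpath('//a[@id="next"]/@href').extract_first()
'position.php?&start=10#a'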
Next is the pipeline file, which saves the data:
import json
import time


class TencentPipeline(object):
    def open_spider(self, spider):
        # ensure_ascii=False below writes Chinese text, so open with UTF-8
        self.file = open("tencent.json", "w", encoding="utf-8")
        self.num = 0
        self.start_time = time.time()

    def process_item(self, item, spider):
        self.num += 1
        # One JSON object per line (JSON Lines format)
        content = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(content)
        return item

    def close_spider(self, spider):
        self.end_time = time.time()
        print("---------- Saved " + str(self.num) + " items ----------")
        print("Total time: " + str(self.end_time - self.start_time) + " seconds")
        self.file.close()
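Because each item is written as one JSON object per line (JSON Lines rather than a single JSON array), the output can be read back with a short loop. A minimal sketch:

import json

with open("tencent.json", encoding="utf-8") as f:
    items = [json.loads(line) for line in f]

print(len(items))                  # should report the number of saved positions
print(items[0]["position_name"])   # first position's name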
Finally, enable the pipeline by uncommenting it in settings.py:
ITEM_PIPELINES = {
    'tencent.pipelines.TencentPipeline': 300,
}
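The number 300 is the pipeline's priority: Scrapy runs enabled pipelines in ascending order of this value (conventionally 0-1000), so extra stages can be chained before or after the writer. For example, with a hypothetical ValidationPipeline:

ITEM_PIPELINES = {
    # 'tencent.pipelines.ValidationPipeline': 200,  # hypothetical: runs first
    'tencent.pipelines.TencentPipeline': 300,       # writes tencent.json
}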
Run the project:
scrapy crawl tencent_position
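As an aside, Scrapy's built-in feed exports can dump items without a custom pipeline: the -o flag appends items to the named file, and a .jl extension selects JSON Lines (the file name below is just an example):

scrapy crawl tencent_position -o tencent_feed.jl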
Project source code: https://gitee.com/stefanpy/Scrapy_projects/tree/dev/tencent