pip install scrapy
pip install pyOpenSSL
pip install cryptography
pip install CFFI
pip install lxml
pip install cssselect
pip install Twisted
scrapy startproject zhipinSpider
scrapy genspider job_position "zhipin.com"
image.png
目录结构: items.py:定义爬取的数据项 pipelines.py:处理爬取的内容 settings.py:配置文件
scrapy shell -s USER_AGENT="xx" https://www.zhipin.com/c101280100/h101280100/
/ 匹配根节点 // 任意节点 . 当前节点 .. 父节点 @ 属性 //div[@title="xxx"]/div
image.png
image.png
import scrapy
class ZhipinspiderItem(scrapy.Item): # 工作名称 title = scrapy.Field() # 工资 salary = scrapy.Field() # 招聘公司 company = scrapy.Field() # 工作详细链接 url = scrapy.Field() # 工作地点 work_addr = scrapy.Field() # 行业 industry = scrapy.Field() # 公司规模 company_size = scrapy.Field() # 招聘人 recruiter = scrapy.Field() # 发布时间 publish_date = scrapy.Field()
import scrapy

from ZhipinSpider.items import ZhipinspiderItem


class JobPositionSpider(scrapy.Spider):
    """Spider that crawls job listings from zhipin.com and follows pagination."""

    # Spider name used by `scrapy crawl`
    name = 'job_position'
    # Domains this spider is allowed to crawl
    allowed_domains = ['zhipin.com']
    # First listing page(s) to fetch
    start_urls = ['https://www.zhipin.com/c101280100/h_101280100/']

    def parse(self, response):
        """Yield one ZhipinspiderItem per job card, then follow the next page.

        `response` is the downloader's response for each URL in `start_urls`
        and for every follow-up Request yielded at the bottom.
        """
        # One <div class="job-primary"> per job card on the listing page
        for card in response.xpath('//div[@class="job-primary"]'):
            item = ZhipinspiderItem()

            # <div class="info-primary"> carries the job details
            info = card.xpath('./div[@class="info-primary"]')
            item['title'] = info.xpath('./h3/a/div[@class="job-title"]/text()').extract_first()
            item['salary'] = info.xpath('./h3/a/span[@class="red"]/text()').extract_first()
            item['work_addr'] = info.xpath('./p/text()').extract_first()
            item['url'] = info.xpath('./h3/a/@href').extract_first()

            # <div class="info-company">/<div class="company-text"> carries
            # the company details
            company = card.xpath(
                './div[@class="info-company"]/div[@class="company-text"]')
            item['company'] = company.xpath('./h3/a/text()').extract_first()
            details = company.xpath('./p/text()').extract()
            # The description list may be short: first entry is the industry,
            # third (when present) is the company size. Keys are set only
            # when the data exists, matching the card's variable layout.
            if details:
                item['industry'] = details[0]
            if len(details) > 2:
                item['company_size'] = details[2]

            # <div class="info-publis"> carries the recruiter details
            publisher = card.xpath('./div[@class="info-publis"]')
            item['recruiter'] = publisher.xpath('./h3/text()').extract_first()
            item['publish_date'] = publisher.xpath('./p/text()').extract_first()
            yield item

        # Follow the "next page" link when the pager provides one
        next_links = response.xpath(
            '//div[@class="page"]/a[@class="next"]/@href').extract()
        if next_links:
            # Re-enter this same callback for the next listing page
            yield scrapy.Request("https://www.zhipin.com" + next_links[0],
                                 callback=self.parse)
class ZhipinspiderPipeline(object):
    """Debug pipeline that prints every scraped job item to stdout.

    The spider only sets 'industry' and 'company_size' when the company
    description has enough fields, so those keys are read with .get()
    to avoid a KeyError on sparse listings.
    """

    def process_item(self, item, spider):
        """Print the item's fields and pass the item on unchanged."""
        print("工作:", item['title'])
        print("工资:", item['salary'])
        print("工作地点:", item['work_addr'])
        print("详情链接:", item['url'])
        print("公司:", item['company'])
        # Optional keys — may be missing, print None instead of crashing
        print("行业:", item.get('industry'))
        print("公司规模:", item.get('company_size'))
        print("招聘人:", item['recruiter'])
        print("发布日期:", item['publish_date'])
        # Scrapy requires process_item to return the item (or raise
        # DropItem); returning None would starve any later pipeline.
        return item
# Project identity reported by Scrapy
BOT_NAME = 'ZhipinSpider'

# Where Scrapy looks for spiders, and where `genspider` creates new ones
SPIDER_MODULES = ['ZhipinSpider.spiders']
NEWSPIDER_MODULE = 'ZhipinSpider.spiders'

# Honour the target site's robots.txt
ROBOTSTXT_OBEY = True

# Headers sent with every request; the User-Agent masquerades as Firefox
# so the site serves the normal HTML pages
DEFAULT_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0",
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,/;q=0.8',
}

# Enable the item pipeline (lower number = earlier in the pipeline chain)
ITEM_PIPELINES = {
    'ZhipinSpider.pipelines.ZhipinspiderPipeline': 300,
}
scrapy crawl job_position
import mysql.connector
class ZhipinspiderPipeline(object):
    """Pipeline that persists scraped job items into a MySQL table."""

    def __init__(self):
        # BUG FIX: the original named this method `init`, so Scrapy never
        # called it and self.conn / self.cur were never created — every
        # process_item call would fail with AttributeError. Scrapy invokes
        # the real constructor, `__init__`.
        # Open the connection and cursor once per spider run.
        self.conn = mysql.connector.connect(
            user='root', password='32147',
            host='localhost', port='3306',
            database='python', use_unicode=True)
        self.cur = self.conn.cursor()

    def close_spider(self, spider):
        """Release the database resources when the spider finishes."""
        print('----------关闭数据库资源-----------')
        # Close the cursor, then the connection
        self.cur.close()
        self.conn.close()

    def process_item(self, item, spider):
        """Insert one job row; parameterized SQL guards against injection."""
        self.cur.execute(
            "INSERT INTO job_inf VALUES(null, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
            (item['title'], item['salary'], item['company'], item['url'],
             item['work_addr'],
             # 'industry' / 'company_size' are only set by the spider when
             # the listing provides them — .get() stores NULL otherwise
             item.get('industry'), item.get('company_size'),
             item['recruiter'], item['publish_date']))
        self.conn.commit()
        # Return the item so any later pipelines still receive it.
        return item
image.png
# Disable cookies so the site cannot track the crawl session
COOKIES_ENABLED = False
image.png
image.png
image.png