
Crawling Lagou (拉勾网) with the Scrapy Framework

Author: 菲宇 · Published 2019-06-13

0. Introduction

It has been a while since I last wrote a crawler, so this post revisits a crawling framework along with some database work. After reading this article you will know how to use the following:

  • the Scrapy framework
  • BeautifulSoup
  • lxml
  • Selenium
  • pyecharts
  • pymysql

1. Creating the Project

scrapy startproject CQJob
scrapy genspider cqjobs lagou.com
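
Running these two commands creates the CQJob project and a spider stub. For orientation, the generated CQJob/spiders/cqjobs.py looks roughly like the sketch below (a reconstruction of Scrapy's template, not the author's file); the start_urls and Selenium code from the next section replace its placeholder body.

# CQJob/spiders/cqjobs.py -- roughly what `scrapy genspider` generates
import scrapy


class CqjobsSpider(scrapy.Spider):
    name = 'cqjobs'
    allowed_domains = ['lagou.com']
    start_urls = ['http://lagou.com/']  # replaced below with the real listing URL

    def parse(self, response):
        pass  # the Selenium-driven crawl from section 2 goes here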

2. Spider + Selenium

start_urls configuration

start_urls = ['https://www.lagou.com/jobs/list_?px=default&hy=%E5%8C%BB%E7%96%97%E5%81%A5%E5%BA%B7&city=%E9%87%8D%E5%BA%86']  # listing URL: healthcare (医疗健康) jobs in Chongqing (重庆)

chromedriver setup

# start chromedriver
browser = webdriver.Chrome()
browser.get(self.start_urls[0])
browser.implicitly_wait(10)
# # dump the raw html to a file for inspection
# f = open('./wr.txt', 'w', encoding='utf8')
# raw_html = browser.page_source
# f.write(raw_html)
# f.close()

....... multi-page handling with BeautifulSoup and XPath (next snippet) ............

browser.close()  # close the browser
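
If you would rather not have a Chrome window pop up during the crawl, chromedriver can also be started headless. This is an optional tweak, not part of the original setup:

# Optional: headless Chrome (sketch; the article uses a plain webdriver.Chrome())
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
browser = webdriver.Chrome(options=options)  # very old Selenium releases use chrome_options= instead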

Multi-page handling with BeautifulSoup and XPath

# Use BeautifulSoup to locate the "next page" button; its class tells us where we are:
#   pager_next          -> more pages to go
#   pager_next_disabled -> last page reached
for i in range(11):
    selector = etree.HTML(browser.page_source)  # grab the current page source for XPath parsing
    soup = BeautifulSoup(browser.page_source, features='lxml')
    # a = soup.find_all("div", class_="pager_container")
    span = soup.find("div", attrs={"class": "pager_container"}).find("span", attrs={"action": "next"})
    # f = open('./new.txt', 'w', encoding='utf8')
    # f.write(str(span))
    # f.close()
    classSpan = span['class']
    print('----------------------------------------------')
    print(classSpan)  # e.g. ['pager_next', 'pager_next_disabled'] on the last page
    self.parsedata(selector)
    # Checking for the class name is safer than indexing classSpan[1],
    # which fails when the span carries only one class
    if "pager_next_disabled" in classSpan:
        print("已经爬到最后一页,爬虫结束")
        break
    else:
        print("还有下一页,爬虫继续")
        # Do not blindly copy the XPath from the browser devtools:
        # the element layout can differ from page to page!
        browser.find_element_by_xpath('//*[@id="s_position_list"]/div[2]/div/span[@action="next"]').click()  # click "next page"
        time.sleep(5)
    print('第{}页抓取完毕'.format(i + 1))
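
A compatibility note: find_element_by_xpath is the Selenium 3 API this article was written against. On Selenium 4 the same click is written with a By locator (a drop-in substitution, not from the original code):

# Selenium 4 equivalent of the "next page" click above
from selenium.webdriver.common.by import By

browser.find_element(By.XPATH, '//*[@id="s_position_list"]/div[2]/div/span[@action="next"]').click()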

Data definition and packaging

items.py

class CqjobItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()          # job title
    company = scrapy.Field()       # company name
    location = scrapy.Field()      # company location
    welfare = scrapy.Field()       # benefits
    salaryMin = scrapy.Field()     # salary lower bound
    salaryMax = scrapy.Field()     # salary upper bound
    salaryMid = scrapy.Field()     # average salary
    experience = scrapy.Field()    # required work experience
    education = scrapy.Field()     # required education
    companyType = scrapy.Field()   # company type
    companyLevel = scrapy.Field()  # company level
    companySize = scrapy.Field()   # company headcount

cqjobs.py (the spider file)

name_list = []
location_list = []
company_list = []
welfare_list = []
salaryMin_list = []
salaryMax_list = []
salaryMid_list = []
experience_list = []
education_list = []
companyType_list = []
companyLevel_list = []
companySize_list = []

items = CqjobItem()
items['name'] = self.name_list
items['company'] = self.company_list
items['location'] = self.location_list
items['welfare'] = self.welfare_list
items['salaryMin'] = self.salaryMin_list
items['salaryMax'] = self.salaryMax_list
items['salaryMid'] = self.salaryMid_list
items['experience'] = self.experience_list
items['education'] = self.education_list
items['companyType'] = self.companyType_list
items['companyLevel'] = self.companyLevel_list
items['companySize'] = self.companySize_list
print(items)
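
One step the snippets leave implicit: for the pipeline in section 3 to receive this item, parse() has to hand it back to Scrapy. Assuming the assembly above happens at the end of parse(), that is one extra line:

# At the end of parse(), after all the lists have been filled and copied into the item
yield items  # Scrapy routes the item to the pipelines enabled in settings.py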

Extracting the fields with XPath

def parsedata(self, selector):
    sel_list = selector.xpath('//*[@id="s_position_list"]/ul/li')
    for item in sel_list:
        name = item.xpath('div[1]/div[1]/div[1]/a/h3/text()')[0]
        self.name_list.append(name)
        location = item.xpath('div[1]/div[1]/div[1]/a/span/em/text()')[0]
        self.location_list.append(location)
        company = item.xpath('div[1]/div[2]/div[1]/a/text()')[0]
        self.company_list.append(company)
        welfare = item.xpath('div[2]/div[2]/text()')[0]
        self.welfare_list.append(welfare)
        salaryList = item.xpath('div[1]/div[1]/div[2]/div/span/text()')[0].strip().split("-")
        # print(salaryList)  # e.g. ['10k', '15k']
        salaryMin = salaryList[0][:len(salaryList[0]) - 1]  # '10': drop the trailing 'k', keep the number
        self.salaryMin_list.append(salaryMin)
        salaryMax = salaryList[1][:len(salaryList[1]) - 1]  # '15'
        self.salaryMax_list.append(salaryMax)
        salaryMid = (int(salaryMin) + int(salaryMax)) / 2
        self.salaryMid_list.append(salaryMid)
        educationArray = item.xpath('div[1]/div[1]/div[2]/div//text()')[3].strip().split("/")
        # print(educationArray)
        experience = educationArray[0].strip()
        self.experience_list.append(experience)
        education = educationArray[1].strip()
        self.education_list.append(education)
        companyMsgList = item.xpath('div[1]/div[2]/div[2]/text()')[0].strip().split("/")
        companyType = companyMsgList[0].strip()
        self.companyType_list.append(companyType)
        companyLevel = companyMsgList[1].strip()
        self.companyLevel_list.append(companyLevel)
        companySize = companyMsgList[2].strip()
        self.companySize_list.append(companySize)
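
As a quick sanity check of the salary handling, here is the same slicing applied to a typical scraped string (a standalone illustration, not part of the spider):

# Standalone walk-through of the salary parsing above
raw = '10k-15k'
salaryList = raw.strip().split('-')      # ['10k', '15k']
salaryMin = salaryList[0][:-1]           # '10'  (trailing 'k' removed)
salaryMax = salaryList[1][:-1]           # '15'
salaryMid = (int(salaryMin) + int(salaryMax)) / 2
print(salaryMin, salaryMax, salaryMid)   # 10 15 12.5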

Data analysis

# Convert a {name: count} dict into the list-of-dicts format TreeMap expects
def getTreeData(self, treedata):
    treemap_data = []
    for key in treedata:
        if key != '重庆':
            treemap_data.append({"value": treedata[key], "name": key})
    return treemap_data

# Location analysis
def LocAnalysis(self, items):
    loca_data = items['location']
    list_data = set(loca_data)
    treemap_data = {}
    for item in list_data:
        treemap_data[item] = loca_data.count(item)
    print(treemap_data)
    data = self.getTreeData(treemap_data)  # convert to TreeMap data
    print(data)
    treemap = TreeMap("重庆医疗健康位置分布图", width=1200, height=600, title_pos="center")
    treemap.add("位置数据", data, is_label_show=True, label_pos='inside',
                label_text_color='#000', is_legend_show=False)
    treemap.render()

# Education-requirement analysis
def educaAnalysis(self, items):
    educa_data = items['education']
    educalist_data = set(educa_data)
    print(educalist_data)
    edupie_list = []
    edupie_data = []
    for item in educalist_data:
        edupie_list.append(item)
        edupie_data.append(educa_data.count(item))
    print(edupie_list)
    print(edupie_data)
    pie = Pie("重庆医疗健康招聘学历要求", title_pos='center')
    pie.add("学历", edupie_list, edupie_data, center=[50, 50], is_random=True,
            radius=[30, 75], rosetype="area", is_legend_show=False, is_label_show=True)
    pie.render()

# Company analysis
def CompanyAnalysis(self, items):
    loca_data = items['company']
    list_data = set(loca_data)
    treemap_data = {}
    for item in list_data:
        treemap_data[item] = loca_data.count(item)
    print(treemap_data)
    data = self.getTreeData(treemap_data)  # convert to TreeMap data
    print(data)
    treemap = TreeMap("重庆医疗相关公司分布图", width=1500, height=900, title_pos="center")
    treemap.add("公司数据", data, is_label_show=True, label_pos='inside',
                label_text_color='#000', is_legend_show=False)
    treemap.render()

# Salary analysis
def SalaryAnalysis(self, items):
    axis_data = items['name']
    print(axis_data)
    ayis_data = items['salaryMid']
    print(ayis_data)
    bar = Bar("重庆医疗职位平均工资图", width=1500, height=450, title_pos="center")
    bar.add("工资数据", axis_data, ayis_data, mark_line=["average"],
            mark_point=["max", "min"], legend_pos='right', is_xaxis_show=False)
    bar.render()

def SalaryTreeAnalysis(self, items):
    salary_name = items['name']
    salary_data = items['salaryMid']
    treemap_data = {}
    # map each job title to its average salary
    for name, salary in zip(salary_name, salary_data):
        treemap_data[name] = salary
    print(treemap_data)
    data = self.getTreeData(treemap_data)  # convert to TreeMap data
    print(data)
    treemap = TreeMap("重庆医疗职位工资分布图", width=1500, height=900, title_pos="center")
    treemap.add("职位数据", data, is_label_show=True, label_pos='inside',
                label_text_color='#000', is_legend_show=False)
    treemap.render()
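
These chart classes follow the pyecharts 0.5.x API, where a chart is built with a title in the constructor, filled with .add(), and written to render.html with .render(); pyecharts 1.x moved the classes to pyecharts.charts and is not compatible with these calls. The imports the analysis methods rely on would be roughly:

# Assumes pyecharts 0.5.x; the 1.x API is not compatible with the calls above
from pyecharts import Bar, Pie, TreeMap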

Invoking the analysis

self.LocAnalysis(items)
self.educaAnalysis(items)
self.CompanyAnalysis(items)
self.SalaryAnalysis(items)
self.SalaryTreeAnalysis(items)

3. Data Storage

settings.py

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
    'CQJob.pipelines.CqjobPipeline': 300,
}

pipelines.py

import pymysql


class CqjobPipeline(object):
    def process_item(self, item, spider):
        '''Save the scraped records to MySQL.'''
        connection = pymysql.connect(host='localhost', user='root', password='xxxx',
                                     db='scrapydb', charset='utf8mb4')
        try:
            with connection.cursor() as cursor:
                for i in range(len(item['name'])):
                    sql = ("insert into `cqjobs`(`name`,`company`,`location`,`welfare`,"
                           "`salaryMin`,`salaryMax`,`salaryMid`,`experience`,`education`,"
                           "`companyType`,`companyLevel`,`companySize`)"
                           "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
                    cursor.execute(sql, (
                        item['name'][i], item['company'][i], item['location'][i],
                        item['welfare'][i], item['salaryMin'][i], item['salaryMax'][i],
                        item['salaryMid'][i], item['experience'][i], item['education'][i],
                        item['companyType'][i], item['companyLevel'][i], item['companySize'][i]))
                connection.commit()
        # except pymysql.err.IntegrityError:
        #     print('重复数据,勿再次插入!')
        finally:
            connection.close()
        return item
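
The pipeline assumes a cqjobs table already exists in the scrapydb database. The article does not show the schema, so the one-off setup script below is a plausible reconstruction: the column names follow the insert statement, but the types are my assumption.

# Hypothetical setup script: create the table the pipeline writes to (schema assumed)
import pymysql

connection = pymysql.connect(host='localhost', user='root', password='xxxx',
                             db='scrapydb', charset='utf8mb4')
with connection.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS `cqjobs` (
            `id` INT AUTO_INCREMENT PRIMARY KEY,
            `name` VARCHAR(255), `company` VARCHAR(255), `location` VARCHAR(255),
            `welfare` VARCHAR(255), `salaryMin` INT, `salaryMax` INT, `salaryMid` FLOAT,
            `experience` VARCHAR(64), `education` VARCHAR(64), `companyType` VARCHAR(128),
            `companyLevel` VARCHAR(64), `companySize` VARCHAR(64)
        ) DEFAULT CHARSET = utf8mb4
    """)
connection.commit()
connection.close()

With the table in place, running scrapy crawl cqjobs from the project root drives the whole flow: Selenium pages through the listings, parse() assembles and yields the item, and the pipeline writes one row per scraped position.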

Adapted from https://mp.weixin.qq.com/s/hqdtE1aid3UjjhEfe8hfbw
