项目来源 漂亮学姐因为工作需要,得根据已有的学校名单,采集每所学校的英文名称和描述,有简称的话也一并带上。共有 2740 条学校名称数据,如果一条条去搜索、再把结果复制粘贴到表格里,一天下来估计人都傻了。 于是我花几分钟写了个爬虫小程序,帮小姐姐解放了双手,成功表现了一波,得到了学姐的夸奖,学姐还请我一起喝了 11 月的第一杯奶茶,美滋滋。
共有 2740 条学校名称数据
在百度百科中搜索清华大学并查看搜索结果
查看网页源代码,可以惊喜地发现,简要描述的数据醒目地躺在开头!
经过分析发现,网页结构简单,可以通过构造URL来请求,获取网页源码,然后从中提取出我们想要的数据即可
当当当,下面该我们的爬虫上场了
导入需要用到的库
import requests
import pandas as pd
from random import choice
from lxml import etree
import openpyxl
import logging
基本配置参数
# Logging setup: emit INFO and above as "timestamp - LEVEL: message".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s: %(message)s',
    level=logging.INFO,
)
# Output workbook; rows are written to its default (active) worksheet.
wb = openpyxl.Workbook()
sheet = wb.active
# The first row carries the column titles.
sheet.append(['学校名称', '中文简称', '学校名称(英文)', '描述', '百度百科链接'])
# Pool of browser User-Agent strings; one is picked at random per request
# so that consecutive requests do not all present the same header.
user_agent = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]
读取数据、爬取网页
# Load the school names from the '学校名称' column of the input spreadsheet.
df = pd.read_excel('学校名称.xlsx')['学校名称']
items = df.values
# Fetch the Baidu Baike entry of every school and record one row per school.
for item in items:
    try:
        # Pick a random User-Agent for this request.
        headers = {
            'User-Agent': choice(user_agent)
        }
        # Entry pages are addressed directly by the entry name.
        url = f'https://baike.baidu.com/item/{item}'
        # A timeout keeps one stalled connection from hanging the whole run.
        rep = requests.get(url, headers=headers, timeout=10)
        # Parse the page and extract the fields with XPath.
        html = etree.HTML(rep.text)
        # Description: content of the 4th <meta> tag in <head>.
        description = ''.join(html.xpath('/html/head/meta[4]/@content'))
        # Foreign-language name: 2nd <dd> of the left basic-info block.
        en_name = ','.join(html.xpath('//dl[@class="basicInfo-block basicInfo-left"]/dd[2]/text()')).strip()
        # Chinese short name, when present: 3rd <dd> of the same block.
        simple_name = ''.join(html.xpath('//dl[@class="basicInfo-block basicInfo-left"]/dd[3]/text()')).strip()
        # The row must follow the header order (name, short name, English
        # name, description, link); the description was previously left out,
        # which shifted the link into the description column.
        row = [item, simple_name, en_name, description, url]
        sheet.append(row)
        logging.info(row)
    except Exception as e:
        # Best effort: report which school failed and continue with the next.
        logging.warning('Failed to process %s: %r', item, e)
# Persist everything collected so far.
wb.save('成果.xlsx')
运行效果如下:
一共有 2740 个页面需要请求爬取,为了提高爬取效率,下面改用多线程。
# -*- coding: UTF-8 -*-
"""
@File :帮学姐.py
@Author :叶庭云
@CSDN :https://yetingyun.blog.csdn.net/
"""
import logging
import threading
from concurrent.futures import ThreadPoolExecutor
from random import choice
from random import random
from time import sleep

import openpyxl
import pandas as pd
import requests
from lxml import etree
# print(random())
# Logging setup: emit INFO and above as "timestamp - LEVEL: message".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s: %(message)s',
    level=logging.INFO,
)
# Output workbook; the header row includes the original position of each
# school so the results can be re-sorted afterwards.
wb = openpyxl.Workbook()
sheet = wb.active
sheet.append(['学校名称', '简称', '英文名称', '描述', '百度百科链接', '原始序号'])
# School names from the '学校名称' column, kept as a plain list so that
# list.index() can be used to look up a name's position.
df = pd.read_excel('学校名称.xlsx')['学校名称']
items = list(df.values)
# Pool of browser User-Agent strings; one is picked at random per request.
user_agent = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]
# Serialises writes to the shared worksheet: get_data is executed by several
# pool threads at once, and unsynchronised appends to one sheet can lose or
# interleave rows.
# NOTE(review): openpyxl gives no thread-safety guarantee for
# Worksheet.append as far as I can tell - confirm against its docs.
_sheet_lock = threading.Lock()


def get_data(item):
    """Fetch the Baidu Baike entry for one school and append it to the sheet.

    The appended row is: school name, short name, English name, description,
    entry URL, and the position of ``item`` in the module-level ``items``
    list. ``list.index`` returns the first match, so duplicate names all
    receive the position of their first occurrence.

    Any exception is logged and swallowed so that one failing school does
    not stop the rest of the crawl; in that case no row is written.

    Args:
        item: School name, used verbatim as the entry name in the URL.
    """
    try:
        # Remember the original position so the output can be re-sorted.
        sort_num = items.index(item)
        # Pick a random User-Agent for this request.
        headers = {
            'User-Agent': choice(user_agent)
        }
        # Entry pages are addressed directly by the entry name.
        url = f'https://baike.baidu.com/item/{item}'
        # A timeout keeps one stalled connection from blocking a worker forever.
        rep = requests.get(url, headers=headers, timeout=10)
        # Random pause of up to one second between requests on this worker.
        sleep(random())
        # Parse the page and extract the fields with XPath.
        html = etree.HTML(rep.text)
        # Description: content of the 4th <meta> tag in <head>.
        description = ''.join(html.xpath('/html/head/meta[4]/@content'))
        # Foreign-language name: 2nd <dd> of the left basic-info block.
        en_name = ','.join(html.xpath('//dl[@class="basicInfo-block basicInfo-left"]/dd[2]/text()')).strip()
        # Chinese short name, when present: 3rd <dd> of the same block.
        simple_name = ''.join(html.xpath('//dl[@class="basicInfo-block basicInfo-left"]/dd[3]/text()')).strip()
        row = [item, simple_name, en_name, description, url, sort_num]
        # Only one thread at a time may touch the worksheet.
        with _sheet_lock:
            sheet.append(row)
        logging.info(row)
    except Exception as e:
        # Best effort: report which school failed and carry on.
        logging.warning('Failed to process %s: %r', item, e)
# Entry point: fan the per-school work out over a thread pool.
def run():
    """Crawl every school with 5 worker threads, then save the workbook."""
    pool = ThreadPoolExecutor(max_workers=5)
    try:
        # Schedule get_data for every school name; results are not collected.
        pool.map(get_data, items)
    finally:
        # Block until all scheduled tasks have finished before saving.
        pool.shutdown(wait=True)
    wb.save('成果.xlsx')
    print('===================== 数据成功下载完成 ======================')


run()
运行效果如下:
等程序运行一会儿,数据就全部爬取下来啦。
因为开多线程,爬取的顺序会打乱,再按照学姐给的学校名称的表格里的顺序排列好,就可以交差啦。
# Excerpt repeated from get_data: each school's position in the original
# list is recorded and written to Excel together with the scraped fields.
sort_num = items.index(item)
import pandas as pd
# Load the scraped results.
df = pd.read_excel('成果.xlsx')
# Sort by the recorded original position ('原始序号' column) to restore the
# order of the input list.
df1 = df.sort_values(by='原始序号')
# Preview the first 8 rows of the sorted data. The value is only displayed
# automatically in an interactive session - presumably this was run in a
# notebook; in a plain script, wrap it in print().
df1.head(8)
运行效果如下:
作者:叶庭云 微信公众号:修炼Python CSDN:https://yetingyun.blog.csdn.net/ 本文仅用于交流学习,未经作者允许,禁止转载,更勿做其他用途,违者必究。 觉得文章对你有帮助、让你有所收获的话,期待你的点赞呀,不足之处,也可以在评论区多多指正。