In the online recruitment industry, 51job (前程无忧) is one of China's leading job platforms and hosts a huge volume of corporate job postings. For job seekers, headhunters, and data analysts, getting the latest postings promptly matters.
Collecting this data by hand is slow. With an automated Python crawler, you can fetch the newest 51job postings on a schedule and store them in a database or a local file for later analysis. This article shows how to combine the Python requests library with a scheduled task to automatically scrape 51job recruitment data.
A 51job search results page URL typically looks like this:
https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,1.html
The key parameters are:
- python: the search keyword
- the last number before .html: the page number, with roughly 50 postings per page
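As a quick sanity check of that URL scheme, here is a minimal sketch (not part of the full script below; the helper name build_search_url is just illustrative) that builds the URL for a keyword and page and confirms the page responds:
import requests

def build_search_url(keyword, page):
    # Fill the keyword and page number into the 51job search URL template
    return f"https://search.51job.com/list/000000,000000,0000,00,9,99,{keyword},2,{page}.html"

url = build_search_url("python", 1)
resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
print(url, resp.status_code)  # a 200 status only means the page loaded, not that parsing will succeed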
The basic crawler below builds these URLs page by page, parses each listing, and writes the results to a CSV file:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from apscheduler.schedulers.blocking import BlockingScheduler
import time

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

def fetch_jobs(keyword="python", pages=1):
    base_url = "https://search.51job.com/list/000000,000000,0000,00,9,99,{},2,{}.html"
    job_list = []
    for page in range(1, pages + 1):
        url = base_url.format(keyword, page)
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        jobs = soup.find_all('div', class_='j_joblist')  # adjust the selectors to the page's actual HTML structure
        for job in jobs:
            title = job.find('span', class_='jname').text.strip()
            company = job.find('a', class_='cname').text.strip()
            location = job.find('span', class_='d at').text.strip()
            salary = job.find('span', class_='sal').text.strip()
            pub_date = job.find('span', class_='time').text.strip()
            link = job.find('a', class_='el')['href'].strip()
            job_list.append({
                "职位名称": title,      # job title
                "公司名称": company,    # company name
                "工作地点": location,   # location
                "薪资范围": salary,     # salary range
                "发布时间": pub_date,   # posting date
                "详情链接": link        # detail page URL
            })
        time.sleep(2)  # throttle requests so we don't get blocked
    return job_list

def save_to_csv(jobs, filename="51job_jobs.csv"):
    df = pd.DataFrame(jobs)
    df.to_csv(filename, index=False, encoding='utf_8_sig')  # utf_8_sig keeps the Chinese text readable, e.g. in Excel
    print(f"Data saved to {filename}")

def scheduled_job():
    print("Starting to scrape the latest 51job postings...")
    jobs = fetch_jobs(keyword="python", pages=3)  # scrape 3 pages
    save_to_csv(jobs)
    print("Scrape finished!")

if __name__ == "__main__":
    scheduler = BlockingScheduler()
    scheduler.add_job(scheduled_job, 'interval', days=1)  # run once a day
    print("Scheduled crawler started, press Ctrl+C to exit")
    try:
        scheduler.start()
    except KeyboardInterrupt:
        scheduler.shutdown()
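Note that with an 'interval' trigger the first run only fires one full interval after the scheduler starts. If you want an immediate run plus a fixed daily time, a variation like the sketch below works (it reuses scheduled_job from the script above; the 08:30 time is just an example):
from apscheduler.schedulers.blocking import BlockingScheduler

if __name__ == "__main__":
    scheduled_job()  # run once immediately at startup
    scheduler = BlockingScheduler()
    # 'cron' trigger: run every day at 08:30 instead of a rolling 24-hour interval
    scheduler.add_job(scheduled_job, 'cron', hour=8, minute=30)
    print("Scheduled crawler started, press Ctrl+C to exit")
    try:
        scheduler.start()
    except (KeyboardInterrupt, SystemExit):
        scheduler.shutdown()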
51job may detect and block clients that send requests too frequently, so the crawler needs a few countermeasures (a small throttling sketch follows this list, and the improved script afterwards applies all of them):
- rotate the User-Agent for each request (e.g. with the fake_useragent library);
- route requests through proxy IPs (requests + the proxies argument);
- space out requests (time.sleep).
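As one way to implement the throttling point, here is a minimal, illustrative helper (the function name polite_get and the delay bounds are assumptions, not part of the original scripts) that adds a randomized pause and a couple of retries around requests.get:
import random
import time
import requests

def polite_get(url, headers=None, proxies=None, retries=3, timeout=10):
    # Retry a few times, sleeping a random 1-3 seconds before each attempt
    for attempt in range(1, retries + 1):
        time.sleep(random.uniform(1, 3))  # a randomized delay looks less bot-like than a fixed sleep
        try:
            response = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
            response.raise_for_status()
            return response
        except requests.RequestException as e:
            print(f"Attempt {attempt} failed for {url}: {e}")
    return None  # caller should skip this page if all attempts fail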
The improved crawler below combines random User-Agents, proxy IPs, and request throttling, plus basic error handling:
from fake_useragent import UserAgent
import requests
from bs4 import BeautifulSoup
import pandas as pd
from apscheduler.schedulers.blocking import BlockingScheduler
import time

# Proxy configuration (sample values from a commercial proxy service; replace with your own)
proxyHost = "www.16yun.cn"
proxyPort = "5445"
proxyUser = "16QMSOML"
proxyPass = "280651"
proxyMeta = f"http://{proxyUser}:{proxyPass}@{proxyHost}:{proxyPort}"
proxies = {
    "http": proxyMeta,
    "https": proxyMeta,
}

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

def get_random_ua():
    ua = UserAgent()
    return ua.random

def fetch_jobs(keyword="python", pages=1):
    base_url = "https://search.51job.com/list/000000,000000,0000,00,9,99,{},2,{}.html"
    job_list = []
    for page in range(1, pages + 1):
        url = base_url.format(keyword, page)
        try:
            # Use a random User-Agent and the proxy for every request
            headers["User-Agent"] = get_random_ua()
            response = requests.get(url, headers=headers, proxies=proxies, timeout=10)
            response.raise_for_status()  # raise if the request failed
            soup = BeautifulSoup(response.text, 'html.parser')
            jobs = soup.find_all('div', class_='j_joblist')  # adjust the selectors to the page's actual HTML structure
            for job in jobs:
                title = job.find('span', class_='jname').text.strip()
                company = job.find('a', class_='cname').text.strip()
                location = job.find('span', class_='d at').text.strip()
                salary = job.find('span', class_='sal').text.strip()
                pub_date = job.find('span', class_='time').text.strip()
                link = job.find('a', class_='el')['href'].strip()
                job_list.append({
                    "职位名称": title,      # job title
                    "公司名称": company,    # company name
                    "工作地点": location,   # location
                    "薪资范围": salary,     # salary range
                    "发布时间": pub_date,   # posting date
                    "详情链接": link        # detail page URL
                })
            time.sleep(2)  # throttle requests so we don't get blocked
        except Exception as e:
            print(f"Request for page {page} failed: {e}")
            continue
    return job_list

def save_to_csv(jobs, filename="51job_jobs.csv"):
    df = pd.DataFrame(jobs)
    df.to_csv(filename, index=False, encoding='utf_8_sig')  # utf_8_sig keeps the Chinese text readable, e.g. in Excel
    print(f"Data saved to {filename}")

def scheduled_job():
    print("Starting to scrape the latest 51job postings...")
    jobs = fetch_jobs(keyword="python", pages=3)  # scrape 3 pages
    save_to_csv(jobs)
    print("Scrape finished!")

if __name__ == "__main__":
    scheduler = BlockingScheduler()
    scheduler.add_job(scheduled_job, 'interval', days=1)  # run once a day
    print("Scheduled crawler started, press Ctrl+C to exit")
    try:
        scheduler.start()
    except KeyboardInterrupt:
        scheduler.shutdown()
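The scripts above store results in CSV; if you prefer a database, as mentioned at the start, a minimal sketch like the following could replace save_to_csv. It uses Python's built-in sqlite3 through pandas.DataFrame.to_sql; the file name 51job.db and table name jobs are arbitrary choices:
import sqlite3
import pandas as pd

def save_to_sqlite(jobs, db_file="51job.db", table="jobs"):
    # Append this batch of postings to a local SQLite database
    if not jobs:
        print("No postings to save")
        return
    df = pd.DataFrame(jobs)
    with sqlite3.connect(db_file) as conn:
        df.to_sql(table, conn, if_exists="append", index=False)
    print(f"Saved {len(df)} rows to {db_file}")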
This article showed how to build an automated crawler with Python requests + BeautifulSoup + APScheduler that scrapes the latest 51job postings on a schedule and stores them in a CSV file.
With an automated crawler you can track the recruitment market efficiently, which is useful for job-search analysis, competitor research, and studying industry trends.
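As a quick illustration of the analysis side, assuming the CSV produced above with its Chinese column names, a few lines of pandas give a first overview of the collected postings:
import pandas as pd

df = pd.read_csv("51job_jobs.csv", encoding="utf_8_sig")
print(df.shape)                                # how many postings were collected
print(df["工作地点"].value_counts().head(10))  # top hiring locations
print(df["薪资范围"].value_counts().head(10))  # most common salary ranges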
Original-content statement: this article was published on the Tencent Cloud developer community with the author's authorization and may not be reproduced without permission.
In case of infringement, please contact cloudcommunity@tencent.com for removal.