from bs4 import BeautifulSoup
import requests
import pymongo
import re
# Thin wrapper around requests: fetch a web page with browser-like headers.
def get_web_data(url, timeout=10):
    """Fetch *url* with a desktop-Chrome header set and return the Response.

    :param url: absolute URL to fetch.
    :param timeout: seconds before the request is aborted (new, defaults to 10
        so a stalled server no longer hangs the crawler forever).
    :return: the ``requests.Response`` object (caller checks ``status_code``).
    :raises requests.RequestException: on network failure or timeout.
    """
    # Mimic a real browser so 58.com does not reject the request outright.
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'max-age=0',
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    data = requests.get(url, headers=headers, timeout=timeout)
    return data
# Scrape one listing page and collect the detail-page URLs found on it.
def get_urls(page_url):
    """Extract detail-page URLs from a 58.com listing page.

    Links come in two flavours, distinguished by host position in the href:
    protocol-relative ``//hhpcpost.58.com/...`` (resale platform) and absolute
    ``https://gy.58.com/...`` (local listings).

    :param page_url: URL of one paginated listing page.
    :return: tuple ``(url_gy, url_hhpcpost)`` of URL lists; both empty on
        any failure (network error or non-200 status).
    """
    # Initialize BEFORE the try block: if get_web_data() raises, the
    # original code hit a NameError on return because the lists only
    # existed inside the try body.
    url_hhpcpost = []
    url_gy = []
    try:
        data = get_web_data(page_url)
        # Only parse when the page was actually served.
        if data.status_code == 200:
            soup = BeautifulSoup(data.text, 'lxml')
            urls = soup.select('td.t > a')
            for a_tag in urls:
                href = a_tag.get('href')
                # <a> without href would make the slices below crash.
                if not href:
                    continue
                # '//hhpcpost...' → chars 2:10 are 'hhpcpost'.
                if href[2:10] == 'hhpcpost':
                    url_hhpcpost.append('https:' + href)
                # 'https://gy...' → chars 8:10 are 'gy'.
                if href[8:10] == 'gy':
                    url_gy.append(href)
        else:
            print(page_url + 'access failed!' + 'status_code:' + str(data.status_code))
    except Exception as e:
        # Best-effort crawl: log and fall through with whatever was collected.
        print(e)
    # Plain return (not inside `finally`) so SystemExit/KeyboardInterrupt
    # are no longer silently swallowed.
    return url_gy, url_hhpcpost
# Parse one gy.58.com detail page into a flat record.
def get_shouji_info_gy(url):
    """Scrape a second-hand phone detail page (gy.58.com layout).

    :param url: detail-page URL.
    :return: dict with keys ``title``, ``prices``, ``area``, ``description``,
        ``pics`` on success; an empty list on any failure (callers only
        test truthiness/len, so the mixed type is preserved).
    """
    info = []
    try:
        data = get_web_data(url)
        # Proceed only when the page was served successfully.
        if data.status_code == 200:
            soup = BeautifulSoup(data.text, 'lxml')
            # Title, stripped of surrounding whitespace/newlines.
            title = soup.select('div.detail-title > h1')[0].get_text().strip()
            # Raw price text, e.g. '¥1200'.
            price = soup.select('div.infocard__container__item__main > span')[0].get_text().strip()
            # Extract the leading digits (raw string for the regex).
            prices = re.search(r'\d+', price)[0]
            # Area is split across several <a> tags; concatenate them.
            areas = soup.select(
                'div.infocard__container.haveswitch > div:nth-of-type(2) > div.infocard__container__item__main>a')
            area = ''.join(i.get_text().strip() for i in areas)
            # Seller's free-text description.
            description = soup.select('div.foldingbox > article.description_con')[0].get_text().strip()
            # Collect image URLs; skip <img> tags with no src so one bad
            # tag no longer discards the whole record via the except.
            imgs = soup.select('li > span > img')
            pics = []
            for pic in imgs:
                src = pic.get('src')
                if src:
                    pics.append('https:' + src)
            info = {
                'title': title,
                'prices': prices,
                'area': area,
                'description': description,
                'pics': pics
            }
    except Exception as e:
        # Page layout varies; log the failure and the offending URL.
        print(e)
        print(url)
    # Plain return (not in `finally`) so non-Exception exits propagate.
    return info
# Parse one hhpcpost.58.com detail page into a flat record.
def get_shouji_info_hhpcpost(url):
    """Scrape a second-hand phone detail page (hhpcpost.58.com layout).

    :param url: detail-page URL.
    :return: dict with keys ``title``, ``prices``, ``area``, ``description``,
        ``pics`` on success; an empty list on any failure (callers only
        test truthiness/len, so the mixed type is preserved).
    """
    info = []
    try:
        data = get_web_data(url)
        # Proceed only when the page was served successfully.
        if data.status_code == 200:
            soup = BeautifulSoup(data.text, 'lxml')
            # Title, stripped of surrounding whitespace/newlines.
            title = soup.select('div.detail-info-hd > div.detail-info-tit')[0].get_text().strip()
            # Raw price text, e.g. '¥1200'.
            price = soup.select('div.detail-info-hd > div.detail-info-price > span.info-price-money')[
                0].get_text().strip()
            # Extract the leading digits (raw string for the regex).
            prices = re.search(r'\d+', price)[0]
            # Area is a single span on this layout.
            area = soup.select('ul > li:nth-of-type(3) > span.info-bd-text')[0].get_text().strip()
            # Seller's free-text description.
            description = soup.select('div.hh-detail-desc-box > div.hh-detail-desc')[0].get_text().strip()
            # Images are lazy-loaded: the URL lives in data-src. Skip tags
            # without it so one bad tag no longer discards the whole record.
            imgs = soup.select('div.hh-detail-small-pic > ul > li')
            pics = []
            for pic in imgs:
                src = pic.get('data-src')
                if src:
                    pics.append('https:' + src)
            info = {
                'title': title,
                'prices': prices,
                'area': area,
                'description': description,
                'pics': pics
            }
    except Exception as e:
        # Page layout varies; log the failure and the offending URL.
        print(e)
        print(url)
    # Plain return (not in `finally`) so non-Exception exits propagate.
    return info
def main():
    """Crawl 28 listing pages of gy.58.com/shouji and store every parsed
    detail record into the local MongoDB collection ``rs.data_58``.
    """
    # Pre-declare so the finally-close below is safe even if the client
    # constructor raises.
    myclient = None
    try:
        # Connect to the local MongoDB instance.
        myclient = pymongo.MongoClient("mongodb://localhost:27017/")
        mydb = myclient['rs']
        collection = mydb['data_58']
        uri = 'https://gy.58.com/shouji/pn'
        # Listing pages are numbered 1..28.
        for i in range(1, 29):
            page_url = uri + str(i)
            url_gy, url_hhpcpost = get_urls(page_url)
            for url in url_gy:
                data = get_shouji_info_gy(url)
                # Persist only successfully parsed records.
                if len(data) > 0:
                    collection.insert_one(data)
                    print(data)
            for url in url_hhpcpost:
                data = get_shouji_info_hhpcpost(url)
                if len(data) > 0:
                    collection.insert_one(data)
                    print(data)
    except Exception as e:
        print(e)
    finally:
        # Release the connection pool (was leaked in the original).
        if myclient is not None:
            myclient.close()
# Standalone smoke checks for the two detail-page parsers:
# print(get_shouji_info_gy('https://gy.58.com/shouji/37378994974604x.shtml?iuType=p_1&PGTID=0d300024-007d-f012-64a4-30fe82910e2d&ClickID=14'))
# print(get_shouji_info_hhpcpost('https://hhpcpost.58.com/shouji/37350593977988x.shtml?iuType=p_1&PGTID=0d300024-007d-f012-64a4-30fe82910e2d&ClickID=12'))

# Guard the entry point so importing this module no longer kicks off a
# full crawl as a side effect.
if __name__ == '__main__':
    main()