Fetching Every Car-Model Score on xcar.com.cn (爱卡网) Plus the First Ten Pages of Purchase Purposes, in Python

Introduction

I still haven't gotten around to studying any crawler framework, and honestly I've been afraid to, worried I couldn't learn it. Heh.

This project is implemented as a targeted (focused) crawler that fetches with multiple threads.

1. Environment

Python 3, with the following modules:

requests

re

threading

pymongo

gevent

bs4
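
Note that re and threading ship with the standard library; the third-party packages install with pip install requests pymongo gevent beautifulsoup4 lxml. (bs4 is distributed as beautifulsoup4, and lxml is assumed here because the code asks BeautifulSoup for the 'lxml' parser.)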

2. Approach

Open the model listing page and regex-match every car-model ID. From each ID, build a detail-page link, then pull the scores and purchase purposes from that page, as in the sketch below.
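
A minimal sketch of this ID step, reusing the pattern from geturls() in the full listing; the href snippet is a hypothetical sample of the listing markup:

import re

listing_html = '<a href="/2390/" target="_blank">some model</a>'  # made-up sample
ids = re.findall(r'a href="(/\d+/)" target="_blank"', listing_html)
detail_urls = ['http://newcar.xcar.com.cn' + i for i in ids]
print(detail_urls)  # ['http://newcar.xcar.com.cn/2390/']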

Inspecting the site shows that each page of reviews is loaded via Ajax, so a separate function handles the first ten pages of reviews; the request it issues is sketched below.
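
For reference, this is roughly what one of those Ajax requests looks like. The endpoint and parameter names are the ones used in getmoinfo() below; the pserid value is a made-up sample:

import requests

params = {'r': 'reputation/reputation/GetAjaxKbList3',
          'page': 2,          # review page number, 2-10
          'pserid': '2390',   # car-model ID (hypothetical sample)
          'jh': '0',
          'wd': '0'}
ajax_url = requests.Request('GET', 'http://newcar.xcar.com.cn/auto/index.php',
                            params=params).prepare().url
print(ajax_url)  # the URL the crawler fetches for one page of reviews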

The purchase purposes are tallied in a dictionary, counting how often each purpose appears across all pages.
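
The tally itself is simple; here is a tiny sketch with made-up values, using collections.Counter as an equivalent of the plain-dict counting done in main() below:

from collections import Counter

purposes = [['代步', '自驾游'], ['代步']]   # hypothetical scraped values
counts = Counter(p for page in purposes for p in page)
print(dict(counts))  # {'代步': 2, '自驾游': 1}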

The results are stored in MongoDB; a minimal sketch of that step follows.
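
Storage is a local MongoDB on the default port, as in main() below; the document here is a made-up sample:

from pymongo import MongoClient

cars = MongoClient('127.0.0.1', 27017).cars.cars
cars.insert_one({'车型': '示例车型', '综合评分': '4.59'})  # sample document
print(cars.count_documents({}))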

3. Code example

The inline comments explain each step.

# -*- coding: utf-8 -*-
# gevent must patch the standard library before the other modules open
# any sockets, so patch first, at import time.
from gevent import monkey
monkey.patch_all()

import re
import threading

import gevent
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}

def gethtml(url):
    '''Fetch a URL and return its HTML text.'''
    # HTTPS proxy from the original post; it is very likely dead by now,
    # so drop the proxies argument if every request fails.
    proex = {'https': 'https://59.37.132.30:8080'}
    try:
        html = requests.get(url, headers=headers, timeout=15, proxies=proex)
        html.raise_for_status()
        html.encoding = html.apparent_encoding
        return html.text
    except Exception as f:
        print('Failed to fetch the page, error:', f)

def geturls(html):
    '''Match every car-model URL ID on the listing page.'''
    try:
        patten = re.compile(r'a href="(/\d+/)" target="_blank"')
        return re.findall(patten, html)
    except Exception as f:
        print('Failed to extract the URL IDs, error:', f)

def getpages(url):
    '''Get the review page count for one car model.'''
    try:
        html = gethtml(url)
        soup = BeautifulSoup(html, 'lxml')
        page = soup.find(class_='unify_page mt20')
        pat_f = re.compile(r'rel="nofollow">(\d+)')
        return re.findall(pat_f, str(page))
    except Exception as f:
        print('Failed to get the page count, error:', f)

def getall(pages, url, purposes):
    '''Collect the scores plus up to ten pages of purchase purposes.'''
    try:
        date = getinfo(url, purposes)
        if len(pages) == 0:
            return date
        id = url.split('/', 4)[3]  # slice the car-model ID out of the URL
        # Fetch at most ten pages: pages 2-10 here, on top of the first
        # page already parsed by getinfo().
        if int(pages[-1]) > 10:
            listone = list(range(2, 11))
        else:
            listone = list(range(2, int(pages[-1]) + 1))
        threads = [threading.Thread(target=getmoinfo, args=(listone[:3], id, purposes)),
                   threading.Thread(target=getmoinfo, args=(listone[3:6], id, purposes)),
                   threading.Thread(target=getmoinfo, args=(listone[6:], id, purposes))]
        for t in threads:
            t.start()
        # Join the workers, otherwise main() would count the purposes
        # before the threads have finished appending them.
        for t in threads:
            t.join()
        return date
    except Exception as f:
        print('Failed to collect model data, error:', f)

def getmoinfo(pages, id, purposes):
    '''Fetch the given Ajax review pages and collect purchase purposes.'''
    f_url = 'http://newcar.xcar.com.cn/auto/index.php'
    for page in pages:
        data = {'r': 'reputation/reputation/GetAjaxKbList3',
                'page': page,
                'pserid': id,
                'jh': '0',
                'wd': '0'}
        # Build the Ajax URL locally instead of issuing a throwaway request.
        url = requests.Request('GET', f_url, params=data).prepare().url
        try:
            html = gethtml(url)
            soup = BeautifulSoup(html, 'lxml')
            # Purchase purposes; the tag markup inside the original pattern
            # was lost when the post was published, so <em>(.*?)</em> is a
            # guess at what it matched.
            purposee = soup.find_all(class_='purpose clearfix')
            pat_s = re.compile(r'<em>(.*?)</em>')
            purpose = re.findall(pat_s, str(purposee))
            purposes.append(purpose)
        except Exception as f:
            print('Failed to fetch review data, error:', f)

def getinfo(url, purposes):
    '''Get the overall scores of one car model.'''
    try:
        data = {}
        html = gethtml(url)
        soup = BeautifulSoup(html, 'lxml')
        # Overall score; any markup around the original pattern was lost
        # when the post was published, so match the bare text.
        pat_o = re.compile('综合评分:(.*?)分')
        synthesis = re.findall(pat_o, str(soup))
        # Model name
        title = soup.find(class_='tt_h1').get_text()
        # Per-category scores
        infos = soup.find(class_='column')
        pat_t = re.compile(r'\d+\.\d+分')
        info = re.findall(pat_t, str(infos))
        # Purchase purposes; <em>(.*?)</em> is a guess, see getmoinfo()
        purposee = soup.find_all(class_='purpose clearfix')
        pat_s = re.compile(r'<em>(.*?)</em>')
        purpose = re.findall(pat_s, str(purposee))
        purposes.append(purpose)
        # Field names stay in Chinese to match the site's labels: model,
        # overall score, exterior, interior, space, comfort, fuel use,
        # power, handling, value for money.
        data['车型'] = title.strip()
        data['综合评分'] = synthesis[0]
        data['外观'] = info[0]
        data['内饰'] = info[1]
        data['空间'] = info[2]
        data['舒适'] = info[3]
        data['耗油'] = info[4]
        data['动力'] = info[5]
        data['操控'] = info[6]
        data['性价比'] = info[7]
        return data
    except Exception as f:
        print('Failed to fetch score data, error:', f)

def main(urls, num):
    '''Worker entry point: crawl one slice of car-model IDs.'''
    conn = MongoClient('127.0.0.1', 27017)
    cars = conn.cars.cars
    for url1 in urls:
        num += 1
        try:
            # Build the detail link from the ID, as described above.
            url = 'http://newcar.xcar.com.cn' + url1
            # A model without a rating page redirects elsewhere.
            if requests.get(url, headers=headers).url != url:
                print('No score or purchase-purpose data for', url)
                continue
            purposes = []
            print('Processing link {}: {}'.format(num, url))
            pages = getpages(url)
            data = getall(pages, url, purposes)
            if not data:
                continue
            clearfix = {}
            call = []
            # Flatten the per-page purpose lists, then count occurrences.
            for a in purposes:
                for b in a:
                    call.append(b)
            for c in set(call):
                clearfix[c] = call.count(c)
            data['购车用途'] = clearfix
            cars.insert_one(data)
            print('Saved record {}'.format(num))
        except Exception as f:
            print('main() failed on {}, error: {}'.format(url1, f))

if __name__ == '__main__':
    url = "http://newcar.xcar.com.cn/price/"
    html = gethtml(url)
    urls = geturls(html)
    jobs = []
    num = 0
    # Spread the IDs over 190 greenlets, ten IDs per worker; the listing
    # yields roughly 1890 matches, and the first one is skipped as in the
    # original post.
    for x in range(190):
        start = x * 10 + 1
        end = min(start + 10, 1891)
        jobs.append(gevent.spawn(main, urls[start:end], num))
    gevent.joinall(jobs)
    print('---------- All records saved ----------')
