
Scraping hotel information from Qunar (去哪儿网)

Author: 川川菜鸟
Published: 2021-10-18 10:16:28

Without too much preamble, just one note: if you want to scrape a different place, change the location, and change the dates as well; to change how much gets scraped, adjust the parameters and code yourself, the changes are small (see the sketch after the code). If you run into problems, leave a comment; I won't repeat the analysis here. In this example I scrape recent hotel information for Shanghai.

Code language: python
# coding=utf-8
import csv  # CSV module (not actually used below; pandas handles the CSV output)
import time
import requests
import json
import pandas as pd  # used to build DataFrames and write the CSV output

# Fetch hotel listings (shop id / ct_Poi / cateName) for an area; the argument is the area (city) name
def crow_id(city):
    url = 'https://wxapp.qunar.com/api/hotel/hotellist'  # target API endpoint
    headers = {
        "wx-v": "",
        "content-type": "application/json",
        "Connection": "Keep-Alive",
        "Accept-Encoding": "gzip",
        "wx-q": "",
        "unionid": "ovaMOwE6dQvbGOmZjLLPaGSM5ZtU",
        "openid": "oIjYJ0TuQcTF_WTWsKcUPR1cRJI0",
        "wx-t": "",
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0.1; OPPO A57 Build/MMB29M; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/55.0.2883.91 Mobile Safari/537.36 MicroMessenger/6.7.2.1340(0x2607023A) NetType/WIFI Language/zh_CN",
        "charset": "utf-8",
        "referer": "https://servicewechat.com/wx799d4d93a341b368/114/page-frame.html",
        "Host": "wxapp.qunar.com",
        "Cookie": "QN48=tc_437f21c62a765ca0_165c198a408_e56b; QN1=qunar; QN66=smart_app; QN1=O5cv+luWLPthsvB1BKl0Ag==",
        "Content-Length": "0",
    }
    # request headers and cookie (likely captured from the Qunar WeChat mini-program; replace the unionid/openid/Cookie values with your own if they expire)
    p0 = {'http': 'http://101.132.122.230:3128'}
    p1 = {'http': 'http://114.113.126.83:80'}
    p2 = {'http': 'http://210.45.123.127:9999'}
    p3 = {'http': 'http://118.190.217.182:80'}
    p4 = {'http': 'http://120.27.14.125:80'}
    p5 = {'http': 'http://118.31.223.194:3128'}
    p6 = {'http': 'http://101.37.79.125:3128'}
    p7 = {'http': 'http://125.62.26.197:3128'}
    p8 = {'http': 'http://218.60.8.98:3129'}
    p9 = {'http': 'http://114.215.95.188:3128'}
    p10 = {'http': 'http://218.60.8.99:3129'}
    p11 = {'http': 'http://218.60.8.83:3129'}
    p12 = {'http': 'http://118.190.217.61:80'}
    p13 = {'http': 'http://203.86.26.9:3128'}
    p14 = {'http': 'http://114.113.126.87:80'}
    p15 = {'http': 'http://106.12.32.43:3128'}
    # choose a proxy from the pool above and start from page 1; later pages are requested in the loop below
    p = p1
    page = 1
    # request payload: the fields the API expects
    data = {
        "city": city,
        "cityUrl": "",
        "page": page,
        "extra": "{}",
        "sort": "",
        "keywords": "",
        "checkOutDate": "2020-10-29",
        "checkInDate": "2020-10-29",
        "locationAreaFilter": "",
        "comprehensiveFilter": "[]",
        "fixedComprehensiveFilter": "[]",
        "SDKVersion": "2.2.4",
        "wxUnionId": "ovaMOwE6dQvbGOmZjLLPaGSM5ZtU",
        "wxOpenId": "oIjYJ0TuQcTF_WTWsKcUPR1cRJI0",
        "bd_source": "smart_app",
        "bd_origin": "pt-onl-ots-ggjd",
    }
    r = requests.post(url, headers=headers, params=data, proxies=p)
    result = json.loads(r.text)
    pages = result['data']['totalPage']
    # pages=586
    hotel = result['data']
    # attrs = hotel['attrs']
    print("当前总页数:",pages)
    print("Page:%d" %page)
    print(len(hotel), pages)
    df = pd.DataFrame(data=hotel['hotels'])
    df.to_csv('qunaer9.csv', mode='a', header=False)
    df.drop(df.index, inplace=True)



    if pages > 1:
        pages = pages - page
        page +=1
        while pages > 0:  # fetch the remaining pages (2 through totalPage)
            data2 = {
                "city": city,
                "cityUrl": "",
                "page": page,
                "extra": "{}",
                "sort": "",
                "keywords": "",
                "checkOutDate": "2020-11-2",
                "checkInDate": "2020-11-1",
                "locationAreaFilter": "",
                "comprehensiveFilter": "[]",
                "fixedComprehensiveFilter": "[]",
                "SDKVersion": "2.2.4",
                "wxUnionId": "ovaMOwE6dQvbGOmZjLLPaGSM5ZtU",
                "wxOpenId": "oIjYJ0TuQcTF_WTWsKcUPR1cRJI0",
                "bd_source": "smart_app",
                "bd_origin": "pt-onl-ots-ggjd",
            }
            try:
                r = requests.post(url, headers=headers, params=data2, proxies=p)
                print(len(hotel), pages)
                print(page)
                result = json.loads(r.text)
                hotel = result['data']
                # attrs = hotel['attrs']
                df = pd.DataFrame(data=hotel['hotels'])
                df.to_csv('qunaer9.csv',mode='a',header=False)
                df.drop(df.index,inplace=True)
            except Exception as e:
                print(e)
            finally:
                print("Page:%d" %page)
                pages -= 1
                page = page+1
                time.sleep(3.1)



if __name__ == '__main__':
    a = {"areaObj": {
                        "上海": [{"city": '上海'}]
                     }}


    datas = a['areaObj']
    b = datas.values()
    area_list = []
    for data in b:
        for d in data:
            area_list.append(d)
    for i in range(len(area_list)):
        print("开始抓取%s区域:" % (area_list[i]['city']))
        crow_id(area_list[i]['city'])
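
As the intro says, adapting the script to another area mostly means editing a few hard-coded values. Below is a minimal sketch of one way to drive crow_id for several cities; it assumes the crow_id function defined above is in scope, and the city names shown are placeholder examples. The check-in/check-out dates and the output file name (qunaer9.csv) are hard-coded inside crow_id, so they still have to be edited there.

Code language: python

# Sketch only: reuses crow_id() defined in the script above.
# The city names below are placeholder examples; checkInDate / checkOutDate
# and the output file name are hard-coded inside crow_id() and must be
# edited there.

cities = ["上海", "北京"]  # areas to scrape; add or replace as needed

for city in cities:
    print("Start scraping area %s:" % city)
    crow_id(city)  # appends rows to qunaer9.csv (header=False)

Because to_csv is called with mode='a' and header=False, the output file has no column names; if you want a header row, write it once before the first append.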
Originally published 2020/11/06 on the author's personal blog.