# Example request URL: http://m.maoyan.com/mmdb/comments/movie/341737.json?_v_=yes&offset=15&startTime=2018-09-02%2013%3A33%3A14
# -*- coding:utf-8 -*-
import requests
import json
from datetime import datetime
import time
from tqdm import tqdm
from random import random
class MaoYan(object):
    """Download the audience comments of one movie from m.maoyan.com.

    The comments endpoint is paged with a ``startTime`` cursor: every request
    asks for comments older than ``startTime``, and the ``startTime`` of the
    last comment received becomes the cursor of the next request.

    The code is written to run unchanged on Python 2.7 and Python 3.
    """

    # Comments per response that the page-count arithmetic assumes.
    PAGE_SIZE = 15
    # Seconds to wait before re-sending a request that failed.
    RETRY_DELAY = 100
    # Seconds to wait for the server before giving up on a single request.
    REQUEST_TIMEOUT = 5

    def __init__(self, movie_id):
        """Print the banner and prepare the request parameters.

        :param movie_id: Maoyan movie identifier, interpolated into the URL.
        """
        print('*******MaoYan_spider******')
        print('Author : Awesome_Tang')
        print('Date : 2018-09-01')
        print('Version: Python2.7')
        print('**************************\n')
        self.movie_id = movie_id
        self.starttime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        self.starturl = self.FormatUrl(self.starttime, offset=0)
        # requests needs a mapping of header name -> value; the previous
        # set literal carried the user-agent string without its header name.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1',
        }

    def GetCommentNum(self):
        """Query the total number of comments (used to size the page loop).

        :returns: the ``total`` field of the JSON answer to ``self.starturl``.
        :raises requests.RequestException: if the request fails or times out.
        """
        response = requests.get(
            self.starturl, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        num = response.json()['total']
        print('>>>>查询时间:%s\n>>>>评论数量:%s' % (self.starttime, num))
        return num

    def FormatUrl(self, starttime, offset=30):
        """Build the comments URL for the given ``startTime`` cursor.

        :param starttime: cursor timestamp, formatted ``%Y-%m-%d %H:%M:%S``.
        :param offset: value of the ``offset`` query parameter.
            NOTE(review): 30 is the value the page loop has always used;
            confirm against the API whether 0 is intended when paging by
            ``startTime``.
        :returns: the request URL as a string.
        """
        return ('http://m.maoyan.com/mmdb/comments/movie/%s.json'
                '?_v_=yes&offset=%s&startTime=%s'
                % (self.movie_id, offset, starttime))

    @staticmethod
    def _FormatComments(comments):
        """Turn a list of comment dicts into text lines plus the next cursor.

        Each complete comment becomes one line
        ``startTime|nickName|cityName|score|content``.  A comment missing any
        of these keys is skipped on its own, without discarding the lines
        already collected.

        :param comments: list of dicts, as found under ``cmts`` in the answer.
        :returns: ``(text, last_start_time)``; ``last_start_time`` is the
            ``startTime`` of the last comment that carried one, or ``''``
            when the page had none (e.g. an empty page).
        """
        lines = []
        last_start_time = ''
        for comment in comments:
            # Keep the cursor moving even if this record is skipped below.
            last_start_time = comment.get('startTime') or last_start_time
            try:
                lines.append(u'%s|%s|%s|%s|%s\n' % (
                    comment['startTime'],
                    comment['nickName'],
                    comment['cityName'],
                    comment['score'],
                    comment['content'],
                ))
            except KeyError:
                continue
        return u''.join(lines), last_start_time

    def QueryComent(self, url):
        """Request one page of comments.

        Fields extracted per comment:
        nickName: user nickname
        cityName: city
        content: comment text
        score: user rating
        startTime: comment time; the earliest one is fed to the next request

        :param url: URL produced by :meth:`FormatUrl`.
        :returns: ``(text, start_time, ok)``.  On success ``text`` holds the
            formatted comment lines and ``start_time`` the next cursor (``''``
            if the page was empty).  On failure ``text`` holds the error
            message or response body, ``start_time`` is ``''`` and ``ok`` is
            ``False``.
        """
        try:
            response = requests.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            if response.status_code != 200:
                print('>>>>查询过于频繁,请休息几分钟♨️♨️')
                return response.text, '', False
            comments = response.json()['cmts']
        # Network failures, undecodable JSON, or a body without 'cmts'.
        # Deliberately not BaseException, so Ctrl+C still stops the script.
        except (requests.RequestException, ValueError, KeyError, TypeError) as e:
            message = str(e)
            print('>>>>请检查网络...🔗🔗\n')
            print(message)
            return message, '', False
        attrs, startTime = self._FormatComments(comments)
        return attrs, startTime, True

    def SaveComent(self):
        """Append every comment of the movie to ``comment.txt``.

        Pages are fetched one after another; a failed request is retried
        every ``RETRY_DELAY`` seconds until it succeeds.  tqdm renders the
        progress bar.  The loop stops early when a page yields no cursor,
        i.e. the server has no more comments to return.
        """
        import io  # local import: io.open gives an encoding-aware file on Py2 and Py3

        num = self.GetCommentNum()
        # Round up so the final, partially filled page is fetched as well.
        pages = (num + self.PAGE_SIZE - 1) // self.PAGE_SIZE
        starttime = self.starttime
        with io.open('comment.txt', 'a', encoding='utf-8') as f:
            for _ in tqdm(range(pages)):
                url = self.FormatUrl(starttime)
                attrs, next_start, ok = self.QueryComent(url)
                while not ok:
                    time.sleep(self.RETRY_DELAY)
                    attrs, next_start, ok = self.QueryComent(url)
                f.write(attrs)
                if not next_start:
                    break
                starttime = next_start
        print('>>>>评论保存完毕...')
if __name__ == '__main__':
    # Script entry point: scrape and save the comments of movie 341737.
    MaoYan('341737').SaveComent()
评论算保存完了,近期会再做一个关于此次数据的可视化分析。另外阿汤哥真心太帅了,全程打肾上腺素,各位还没去看的赶紧~
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。