python学习教程:模拟登录知乎

大家都知道知乎是比较专业的知识问答平台。里面不仅具有专业的问答,还有社交属性,吸引了一大部分人使用Python来爬取答案,社交关系等。今天我们来进行爬虫操作的第一步:模拟登陆

项目源码

# -*- coding:UTF-8 -*-

import  requests , time
import  hmac ,json
from bs4 import BeautifulSoup
from hashlib import sha1


def get_captcha(data,need_cap):
    ''' 处理验证码 '''
    if need_cap is False:
        return
    with open('captcha.gif','wb') as fb:
        fb.write(data)
    return input('captcha:')
    
def get_signature(grantType,clientId,source,timestamp):
    ''' 处理签名 '''
  
    hm = hmac.new(b'd1b964811afb40118a12068ff74a12f4',None,sha1)
    hm.update(str.encode(grantType))
    hm.update(str.encode(clientId))
    hm.update(str.encode(source))
    hm.update(str.encode(timestamp))

    return  str(hm.hexdigest())



def login(username,password,oncaptcha,sessiona,headers):
    ''' 处理登录 '''
    
    resp1 = sessiona.get('https://www.zhihu.com/signin',headers=headers)  # 拿cookie:_xsrf
    resp2 = sessiona.get('https://www.zhihu.com/api/v3/oauth/captcha?lang=cn',headers=headers)  # 拿cookie:capsion_ticket 
    need_cap = json.loads(resp2.text)["show_captcha"]  # {"show_captcha":false} 表示不用验证码

    grantType = 'password'
    clientId = 'c3cef7c66a1843f8b3a9e6a1e3160e20'
    source ='com.zhihu.web'
    timestamp = str((time.time()*1000)).split('.')[0]  # 签名只按这个时间戳变化
       
    captcha_content = sessiona.get('https://www.zhihu.com/captcha.gif?r=%d&type=login'%(time.time()*1000),headers=headers).content
    
    data = {
        "client_id":clientId,
        "grant_type":grantType,
        "timestamp":timestamp,
        "source":source,
        "signature": get_signature(grantType,clientId,source,timestamp), # 获取签名
        "username":username,
        "password":password,
        "lang":"cn",
        "captcha":oncaptcha(captcha_content,need_cap), # 获取图片验证码
        "ref_source":"other_",
        "utm_source":""
    }
    
    print("**2**: "+str(data))
    print("-"*50)
    resp = sessiona.post('https://www.zhihu.com/api/v3/oauth/sign_in',data,headers=headers).content
    print(BeautifulSoup(resp,'html.parser'))
    
    print("-"*50)
    return resp 

if __name__ == "__main__":
    sessiona = requests.Session()
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0','authorization':'oauth c3cef7c66a1843f8b3a9e6a1e3160e20'}

    login('12345678@qq.com','12345678',get_captcha,sessiona,headers) # 用户名密码换自己的就好了
    resp = sessiona.get('https://www.zhihu.com/inbox',headers=headers)  # 登录进去了,可以看私信了
    print(BeautifulSoup(resp.content ,'html.parser'))

本文分享自微信公众号 - python教程(pythonjc)

原文出处及转载信息见文内详细说明,如有侵权,请联系 yunjia_community@tencent.com 删除。

原始发表时间:2019-10-21

本文参与腾讯云自媒体分享计划,欢迎正在阅读的你也加入,一起分享。

发表于

我来说两句

0 条评论
登录 后参与评论

扫码关注云+社区

领取腾讯云代金券