import scrapy
import re
class GithubSpider(scrapy.Spider):
    """Log in to GitHub by scraping the hidden fields from the login form,
    POSTing them (plus credentials) to the session endpoint, then checking
    the landing page for the "Issues" marker."""

    name = 'github'
    allowed_domains = ['github.com']
    # Login page URL
    start_urls = ['https://github.com/login']

    def parse(self, response):
        """Collect the login form's hidden inputs and submit the login POST.

        FormRequest rejects None values in formdata, so every missing
        hidden input defaults to "" (the original only guarded ga_id,
        leaving the other fields free to break the request).
        """
        def _field(name):
            # Read one hidden <input> value; "" when the input is absent.
            value = response.xpath(
                "//input[@name='%s']/@value" % name).extract_first()
            return value if value is not None else ""

        # Build the POST parameters from the scraped form fields.
        post_data = {
            "commit": _field("commit"),
            "utf8": _field("utf8"),
            "authenticity_token": _field("authenticity_token"),
            "ga_id": _field("ga_id"),
            "login": "xxx@qq.com",
            "password": "xxx",
            "webauthn-support": _field("webauthn-support"),
            "webauthn-iuvpaa-support": _field("webauthn-iuvpaa-support"),
            # "required_field_4ed5": _field("required_field_4ed5"),
            "timestamp": _field("timestamp"),
            "timestamp_secret": _field("timestamp_secret"),
        }
        # NOTE(review): post_data contains the plaintext password — log at
        # debug level instead of print() so it stays out of normal output.
        self.logger.debug("login form data: %s", post_data)
        # Send the login POST request.
        yield scrapy.FormRequest(
            "https://github.com/session",  # login endpoint
            formdata=post_data,
            callback=self.after_login
        )

    # Runs after the login POST completes.
    def after_login(self, response):
        """Print every occurrence of "Issues" found on the landing page."""
        print(re.findall("Issues", response.body.decode()))
# -*- coding: utf-8 -*-
import scrapy
import re
class Github2Spider(scrapy.Spider):
    """Log in to GitHub by letting Scrapy locate and pre-fill the login
    <form> on the page, then scan the landing page for "Issues"."""

    name = 'github2'
    allowed_domains = ['github.com']
    start_urls = ['http://github.com/login']

    def parse(self, response):
        # from_response finds the form in the response and carries its
        # hidden fields over automatically; we only add the credentials.
        credentials = {"login": "xxx@qq.com", "password": "xxx"}
        login_request = scrapy.FormRequest.from_response(
            response,
            formdata=credentials,
            callback=self.after_login,
        )
        yield login_request

    # Runs after the login POST completes.
    def after_login(self, response):
        # Print every occurrence of "Issues" found on the page.
        page_text = response.body.decode()
        print(re.findall("Issues", page_text))
# -*- coding: utf-8 -*-
import scrapy
import re
class RenrenSpider(scrapy.Spider):
    """Fetch a Renren profile page using cookies captured from a logged-in
    browser session, then check the page for the expected nickname."""

    name = 'renren'
    allowed_domains = ['renren.com']
    # Personal-centre (profile) page URL
    start_urls = ['http://www.renren.com/972990680/profile']

    def start_requests(self):
        # Cookies copied from Chrome's debug tools after a manual login.
        cookiesstr = "anonymid=k3miegqc-hho317; depovince=ZGQT; _r01_=1; JSESSIONID=abcDdtGp7yEtG91r_U-6w; ick_login=d2631ff6-7b2d-4638-a2f5-c3a3f46b1595; ick=5499cd3f-c7a3-44ac-9146-60ac04440cb7; t=d1b681e8b5568a8f6140890d4f05c30f0; societyguester=d1b681e8b5568a8f6140890d4f05c30f0; id=972990680; xnsid=404266eb; XNESSESSIONID=62de8f52d318; jebecookies=4205498d-d0f7-4757-acd3-416f7aa0ae98|||||; ver=7.0; loginfrom=null; jebe_key=8800dc4d-e013-472b-a6aa-552ebfc11486%7Cb1a400326a5d6b2877f8c884e4fe9832%7C1575175011619%7C1%7C1575175011639; jebe_key=8800dc4d-e013-472b-a6aa-552ebfc11486%7Cb1a400326a5d6b2877f8c884e4fe9832%7C1575175011619%7C1%7C1575175011641; wp_fold=0"
        # Split each "name=value" pair on the FIRST '=' only: cookie values
        # may themselves contain '=' (the original unbounded split would
        # silently truncate such values).
        cookies = dict(pair.split("=", 1) for pair in cookiesstr.split("; "))
        # Request carrying the captured cookies.
        yield scrapy.Request(
            self.start_urls[0],
            callback=self.parse,
            cookies=cookies
        )

    def parse(self, response):
        # Search the profile page for the nickname "闲欢" and print matches.
        print(re.findall("闲欢", response.body.decode()))
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。