1、scrapy startproject si
2、vi settings.py USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5' ROBOTSTXT_OBEY = False COOKIES_ENABLED = True DOWNLOADER_MIDDLEWARES = { 'loginscrapy.middlewares.LoginscrapyDownloaderMiddleware': 543, } 3、vi middlewares.py from scrapy import signals from scrapy.http import HtmlResponse from selenium import webdriver import os,sys from PIL import Image import time import tesserocr import requests class LoginscrapyDownloaderMiddleware(object):
@classmethod
def from_crawler(cls, crawler):
    """Scrapy factory hook: build the middleware and hook it up to
    the crawler's signal bus so spider_opened events reach it."""
    middleware = cls()
    crawler.signals.connect(
        middleware.spider_opened, signal=signals.spider_opened
    )
    return middleware
def yzm(self, path):
    """OCR the captcha image stored at *path*.

    The image is enlarged, converted to grayscale, binarized with a
    fixed threshold, and fed to tesserocr. Returns the first 5
    recognized characters (the captcha length).
    """
    ima = Image.open(path)
    # Image.LANCZOS replaces Image.ANTIALIAS (deprecated in Pillow 9,
    # removed in Pillow 10); it is the same resampling filter.
    image = ima.resize((480, 200), Image.LANCZOS)
    image = image.convert('L')  # grayscale
    # Binarization threshold: pixels below it become black (0),
    # the rest white (1).
    threshold = 160
    table = [0 if i < threshold else 1 for i in range(256)]
    # Map through the lookup table into a 1-bit image; without the "1"
    # mode the result would be all black.
    image = image.point(table, "1")
    return tesserocr.image_to_text(image)[:5]  # captcha is 5 chars
def dl(self, browser, username, password):
    """Fill in the login form in *browser*, solve the captcha, submit.

    Returns the error message text shown by the page when login
    failed, or None when no error element is present (treated as
    success by the caller).
    """
    un = browser.find_element_by_id("username")
    un.clear()
    un.send_keys(username)
    pw = browser.find_element_by_id("password")
    pw.clear()
    pw.send_keys(password)
    # Drop stale screenshots from a previous attempt.
    if os.path.exists("Login_page.png"):
        os.remove("Login_page.png")
    if os.path.exists("Verification.png"):
        os.remove("Verification.png")
    browser.save_screenshot('Login_page.png')
    photo = Image.open('Login_page.png')
    # Hard-coded captcha region; presumably tied to the 1200x900
    # window size set in process_request — TODO confirm on layout change.
    box = (980, 378, 1076, 417)
    photo.crop(box).save('Verification.png')
    yz = browser.find_element_by_id("jcaptcha")
    captcha_text = self.yzm("Verification.png")
    print(captcha_text)
    yz.send_keys(captcha_text)
    lg = browser.find_element_by_id("login-btn")
    lg.click()
    time.sleep(1)
    try:
        err = browser.find_element_by_class_name("err-vaild")
        return err.text.strip()
    except Exception:
        # Narrowed from a bare except: a missing error element means
        # the login (apparently) succeeded.
        return None
def process_request(self, request, spider):
    """Intercept downloads for the 'login' spider.

    Login URLs are driven through a real Chrome browser (selenium) so
    the captcha-protected form can be submitted; the session cookies
    are then stored on the spider. Every other URL is fetched with
    requests, replaying those cookies. Returning an HtmlResponse
    short-circuits Scrapy's own downloader; returning None lets the
    request proceed normally.
    """
    if spider.name != "login":
        return None
    if "login" in request.url:
        chromedriver = "chromedriver.exe"
        options = webdriver.ChromeOptions()
        options.add_argument('--ignore-certificate-errors')
        spider.driver = webdriver.Chrome(chromedriver, chrome_options=options)
        spider.driver.set_window_size(1200, 900)
        spider.driver.get(request.url)
        username = "yourname"
        password = "yourpass"
        # The OCR'd captcha is often wrong: retry up to 10 times as
        # long as the page keeps showing an error message.
        err = self.dl(spider.driver, username, password)
        attempts = 0
        while err is not None and attempts < 10:
            attempts += 1
            err = self.dl(spider.driver, username, password)
        if err is not None:
            print("无法登录,用户名或密码或验证码错误!需要重新执行程序.")
            sys.exit()
        time.sleep(2)
        spider.cookies = spider.driver.get_cookies()
        # spider.driver.close()  # deliberately left open in original
        return HtmlResponse(url=spider.driver.current_url,  # post-login URL
                            body=spider.driver.page_source,  # rendered HTML
                            encoding='utf-8')
    # Non-login URL: fetch with requests using the captured cookies.
    session = requests.Session()
    for cookie in spider.cookies:
        session.cookies.set(cookie['name'], cookie["value"])
    session.headers.clear()  # strip requests' default headers
    newpage = session.get(request.url)
    return HtmlResponse(url=request.url,
                        body=newpage.text,
                        encoding="utf-8")
4、vi si/si/main.py from scrapy.cmdline import execute import sys import os sys.path.append(os.path.dirname(os.path.abspath(__file__))) execute(['scrapy', 'crawl', 'login']) 5、vi si/si/spiders/login.py
import scrapy
class LoginSpider(scrapy.Spider):
    """Spider whose login is performed by the downloader middleware,
    which sets ``self.driver`` (the selenium browser) and
    ``self.cookies`` (the post-login cookie jar)."""

    name = 'login'
    allowed_domains = ['chinanetcenter']
    start_urls = ['https://portal.chinanetcenter.com/cas/login?service=https%3A%2F%2Fsi.chinanetcenter.com%2Fr_sec_login&appcode=serviceinsight'
        ,'https://si.chinanetcenter.com/']

    def __init__(self):
        super().__init__()
        # Bug fix: the original assigned to *local* variables
        # (``driver=None``), leaving the instance attributes unset;
        # the middleware reads/writes spider.driver and spider.cookies.
        self.driver = None
        self.cookies = None

    def parse(self, response):
        # Placeholder callback: just log the fetched URL.
        print(response.url)