

在当今互联网时代,数据抓取(爬虫)技术广泛应用于数据分析、市场调研、自动化测试等领域。然而,许多网站采用动态加载技术(如Ajax、React、Vue.js等框架)来渲染页面,传统的 `requests` 库无法直接获取动态生成的内容。这时,Selenium成为解决动态页面爬取的重要工具。
豆瓣作为一个典型的动态加载网站,其登录页面涉及表单提交、动态验证码、Ajax请求等复杂交互。本文将通过Python + Selenium,详细介绍如何模拟登录豆瓣,并处理动态加载的登录页面。
传统的HTTP请求库(如 `requests`)无法执行JS,而Selenium可以完整加载动态内容。
环境准备:安装Selenium(`pip install selenium`)。
访问豆瓣登录页(https://accounts.douban.com/passport/login),可以发现:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
# 配置Chrome选项(无头模式、禁用自动化提示)
options = webdriver.ChromeOptions()
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
# 启动浏览器
driver = webdriver.Chrome(options=options)
driver.get("https://accounts.douban.com/passport/login")默认页面是二维码登录,需点击切换:
# Wait until the "password login" tab is clickable, then switch to it.
# (The page opens on the QR-code tab.) Waiting for clickability rather than
# mere presence avoids clicking an element that is not yet interactable.
switch_login = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, '//*[@class="account-tab-account"]'))
)
switch_login.click()

# Locate the input fields and fill in the credentials. The form is shown
# only after the tab switch, so wait for the username field to be visible
# instead of looking it up immediately.
username = WebDriverWait(driver, 10).until(
    EC.visibility_of_element_located((By.ID, "username"))
)
password = driver.find_element(By.ID, "password")
username.send_keys("your_email@example.com")  # replace with your Douban account
password.send_keys("your_password")  # replace with your password

# Click the login button.
login_button = driver.find_element(By.XPATH, '//a[@class="btn btn-account"]')
login_button.click()

# Wait for the login to succeed: the URL should move to www.douban.com.
# WebDriverWait raises TimeoutException if that does not happen in 10 s.
WebDriverWait(driver, 10).until(EC.url_contains("www.douban.com"))
print("登录成功!当前页面:", driver.current_url)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time
import random
# Proxy settings. These look like sample values; replace them with your own
# proxy host, port and credentials before running.
proxyHost = "www.16yun.cn"
proxyPort = "5445"
proxyUser = "16QMSOML"
proxyPass = "280651"

# Browser options: hide the automation hints.
options = webdriver.ChromeOptions()
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_experimental_option("excludeSwitches", ["enable-automation"])

# Route traffic through the proxy. Only host and port are passed here; the
# user name and password are not part of this argument.
options.add_argument(f'--proxy-server=http://{proxyHost}:{proxyPort}')
# NOTE: the original line `options.add_extension = None` was removed. It did
# not disable extensions; it only replaced the ChromeOptions.add_extension
# method with None, so any later attempt to add an extension would fail.

# Start the browser.
driver = webdriver.Chrome(options=options)
# Proxy authentication helper. The original comment mentions an "AutoAuth"
# plugin, but no plugin is loaded anywhere in this code.
def enable_proxy_auth(proxyUser, proxyPass):
    """Build a manual proxy configuration and add it to a capabilities dict.

    Args:
        proxyUser: Proxy account name. Currently unused by this function.
        proxyPass: Proxy password. Currently unused by this function.

    The proxy address is taken from the module-level ``proxyHost`` and
    ``proxyPort``; the capabilities come from the module-level ``options``.

    NOTE(review): the dict returned by ``options.to_capabilities()`` is not
    stored, and this function is called after ``webdriver.Chrome(...)`` has
    already been created, so it presumably has no effect on the running
    browser -- confirm. Also confirm that ``Proxy.add_to_capabilities`` is
    still available in the installed Selenium version.
    """
    from selenium.webdriver.common.proxy import Proxy, ProxyType

    proxy = Proxy({
        'proxyType': ProxyType.MANUAL,
        'httpProxy': f'{proxyHost}:{proxyPort}',
        'sslProxy': f'{proxyHost}:{proxyPort}',
        'noProxy': '',
    })
    proxy.add_to_capabilities(options.to_capabilities())


enable_proxy_auth(proxyUser, proxyPass)
try:
# 访问登录页
driver.get("https://accounts.douban.com/passport/login")
# 切换至账号登录
WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.XPATH, '//*[@class="account-tab-account"]'))
).click()
# 模拟人类输入
def human_type(element, text):
"""模拟人类输入(带随机延迟)"""
for char in text:
element.send_keys(char)
time.sleep(random.uniform(0.1, 0.3))
username = driver.find_element(By.ID, "username")
password = driver.find_element(By.ID, "password")
ActionChains(driver).move_to_element(username).click().perform()
human_type(username, "your_email@example.com") # 替换为你的账号
ActionChains(driver).move_to_element(password).click().perform()
human_type(password, "your_password") # 替换为你的密码
# 处理验证码(如有)
try:
captcha = WebDriverWait(driver, 5).until(
EC.presence_of_element_located((By.ID, "captcha_image")))
if captcha:
print("检测到验证码,请手动处理或调用OCR")
time.sleep(15) # 留出时间手动输入
except:
print("无验证码,继续执行")
# 提交登录
login_btn = driver.find_element(By.XPATH, '//a[@class="btn btn-account"]')
ActionChains(driver).move_to_element(login_btn).click().perform()
# 等待登录成功
try:
WebDriverWait(driver, 10).until(
EC.url_contains("www.douban.com"))
print("登录成功!当前URL:", driver.current_url)
except:
print("登录失败,可能触发反爬")
# 获取Cookies
cookies = driver.get_cookies()
print("登录后的Cookies:", cookies)
finally:
driver.quit() # 关闭浏览器本文通过Selenium实现了豆瓣动态登录页面的自动化操作,涵盖:
适用场景:
进一步优化方向:
结合 `requests` + `cookies` 提高效率(避免每次启动浏览器)。