
On today's web, more and more sites render their content dynamically with JavaScript, leaving traditional static scrapers with nothing to parse. Sogou Image Search is a typical example: its waterfall-style image loading, infinite-scroll page design, and complex AJAX requests make the usual Requests + BeautifulSoup combination largely ineffective.
To address this, this article shows how to use Splash, a powerful JavaScript rendering service, together with the Scrapy framework to crawl Sogou's dynamically rendered image pages efficiently. We will walk through the approach from the underlying principles to working code.
Splash is a lightweight browser with an HTTP API, designed specifically for web page rendering. It is built on the WebKit engine and supports JavaScript execution, page rendering, and screenshots. Compared with Selenium, Splash is lighter weight: it runs as a standalone HTTP service that can render many pages concurrently, is scriptable in Lua, and integrates with Scrapy through the scrapy-splash plugin instead of driving a full desktop browser.
Sogou Image Search pages exhibit exactly the traits described above: results load lazily in a waterfall layout, new images appear only as the page scrolls, and the data arrives through AJAX requests rather than in the initial HTML.
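Before wiring Splash into Scrapy, it helps to see its HTTP API in isolation. The minimal sketch below calls Splash's render.html endpoint, which returns the page HTML after JavaScript has executed; it assumes a Splash instance is already listening on localhost:8050 (started in the next step):

```python
import requests

# Ask Splash to load the page, run its JavaScript, wait 2 seconds,
# and hand back the fully rendered HTML.
resp = requests.get(
    'http://localhost:8050/render.html',
    params={'url': 'https://pic.sogou.com/pics?query=风景', 'wait': 2},
    timeout=60,
)
resp.raise_for_status()
print(len(resp.text))  # rendered markup, including lazily loaded nodes
```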
```bash
# Pull the Splash image
docker pull scrapinghub/splash

# Run the Splash container
docker run -p 8050:8050 scrapinghub/splash

# Verify the installation
curl http://localhost:8050
```

```bash
# Create the Scrapy project
scrapy startproject sogou_image_crawler
cd sogou_image_crawler
```
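Scrapy talks to Splash through the scrapy-splash plugin, which the configuration below assumes is installed:

```bash
pip install scrapy scrapy-splash
```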
Next, wire scrapy-splash into settings.py (the middleware classes are referenced by their string paths, so no import is needed):

```python
# settings.py — scrapy-splash configuration
SPLASH_URL = 'http://localhost:8050'

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
```

With the plumbing in place, the spider issues SplashRequests instead of plain Requests:
```python
# spiders/sogou_image.py
import scrapy
from scrapy_splash import SplashRequest
import json
import time
from urllib.parse import quote


class SogouImageSpider(scrapy.Spider):
    name = 'sogou_image'
    allowed_domains = ['pic.sogou.com']

    def __init__(self, keyword='风景', max_pages=10, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.keyword = keyword
        # -a spider arguments arrive as strings, so normalize max_pages
        self.max_pages = int(max_pages)
        self.base_url = f'https://pic.sogou.com/pics?query={quote(keyword)}'

    def start_requests(self):
        """Generate the initial paginated requests."""
        for page in range(1, self.max_pages + 1):
            url = f'{self.base_url}&start={page * 48}'
            yield SplashRequest(
                url,
                self.parse_image_page,
                args={
                    'wait': 2,
                    'timeout': 90,
                    'images': 0,          # skip image downloads during render
                    'resource_timeout': 10,
                },
                meta={'page': page},
            )
    def parse_image_page(self, response):
        """Follow up with a Lua script that scrolls and extracts images."""
        page = response.meta['page']
        self.logger.info(f'Crawling page {page}')
        # The execute endpoint runs this Lua script; the script must load
        # the page itself via splash:go() before querying the DOM.
        script = """
        function main(splash, args)
            assert(splash:go(args.url))
            assert(splash:wait(2))

            local urls = {}
            local seen = {}

            local function collect()
                local elements = splash:select_all('.img-box a')
                for _, element in ipairs(elements) do
                    local style = element.node:getAttribute('style')
                    if style then
                        local url_match = style:match('url%((.-)%)')
                        if url_match and not seen[url_match] then
                            seen[url_match] = true
                            table.insert(urls, {
                                url = url_match,
                                title = element.node:getAttribute('title') or ''
                            })
                        end
                    end
                end
            end

            -- Images visible on first load
            collect()

            -- Scroll to trigger lazy loading, then collect the new batch
            splash:runjs("window.scrollTo(0, document.body.scrollHeight);")
            splash:wait(1.5)
            collect()

            return {
                urls = urls,
                html = splash:html()
            }
        end
        """
        yield SplashRequest(
            response.url,
            self.extract_images,
            endpoint='execute',
            args={
                'lua_source': script,
                'timeout': 90,
            },
            meta={'page': page},
        )
    def extract_images(self, response):
        """Parse the JSON returned by the Lua script into items."""
        try:
            data = json.loads(response.text)
            images = data.get('urls', [])
            for img in images:
                if not img.get('url'):
                    continue
                item = {
                    'page': response.meta['page'],
                    'image_url': img['url'],
                    'title': img.get('title', ''),
                    'keyword': self.keyword,
                    'crawl_time': time.strftime('%Y-%m-%d %H:%M:%S'),
                    'referer': response.url,
                }
                # Request the detail page to obtain a high-resolution version
                if 'thumb' in img['url']:
                    hd_url = img['url'].replace('thumb', 'hd')
                    yield SplashRequest(
                        hd_url,
                        self.parse_hd_image,
                        meta={'item': item},
                        args={'wait': 1},
                    )
                else:
                    yield item
        except json.JSONDecodeError as e:
            self.logger.error(f'JSON decode error: {e}')

    def parse_hd_image(self, response):
        """Minimal callback: record the high-resolution URL on the item."""
        item = response.meta['item']
        item['image_url'] = response.url
        yield item
```
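With the spider in place, a typical run passes the search keyword and page count through the -a arguments defined in __init__ (the output file name is just an example):

```bash
scrapy crawl sogou_image -a keyword=风景 -a max_pages=5 -o images.json
```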
For pages that need more interaction, the whole load-scroll-extract cycle can live in a standalone Lua script executed through Splash's execute endpoint:

```lua
-- advanced_image_extractor.lua
function main(splash, args)
    -- Set the user agent
    splash:set_user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")

    -- Load the page
    assert(splash:go(args.url))
    assert(splash:wait(args.wait or 2))

    -- Result table
    local results = {
        images = {},
        page_info = {}
    }

    -- Scroll several times to trigger lazy loading
    local scroll_attempts = args.scroll_attempts or 3
    for i = 1, scroll_attempts do
        -- Extract the images currently in the DOM; evaljs evaluates an
        -- expression, so the extraction logic is wrapped in an IIFE.
        local js_code = [[
            (function() {
                var images = [];
                var items = document.querySelectorAll('.img-box, .pic-item');
                items.forEach(function(item) {
                    var img = item.querySelector('img');
                    var link = item.querySelector('a');
                    if (img && img.src) {
                        var imageInfo = {
                            src: img.src,
                            alt: img.alt || '',
                            width: img.naturalWidth,
                            height: img.naturalHeight,
                            data_src: img.getAttribute('data-src') || ''
                        };
                        if (link) {
                            imageInfo.link = link.href;
                            imageInfo.title = link.title || link.getAttribute('data-title') || '';
                        }
                        images.push(imageInfo);
                    }
                });
                return images;
            })()
        ]]
        local current_images = splash:evaljs(js_code)

        -- Append to the results
        for _, img in ipairs(current_images) do
            table.insert(results.images, img)
        end

        -- Scroll down for the next batch
        if i < scroll_attempts then
            splash:runjs("window.scrollBy(0, window.innerHeight * 1.5);")
            assert(splash:wait(1.5))
        end
    end

    -- Page-level metadata
    results.page_info = {
        url = splash:url(),
        title = splash:evaljs("document.title"),
        image_count = #results.images,
        scroll_height = splash:evaljs("document.body.scrollHeight")
    }

    return results
end
```
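The script can be exercised against a live Splash instance before it is wired into Scrapy. A minimal sketch, assuming the script was saved under the file name in its header comment (extra JSON fields such as scroll_attempts are exposed to the script via the args table):

```python
import requests

# POST the Lua script to Splash's execute endpoint and inspect the result.
with open('advanced_image_extractor.lua', encoding='utf-8') as f:
    lua_source = f.read()

resp = requests.post(
    'http://localhost:8050/execute',
    json={
        'lua_source': lua_source,
        'url': 'https://pic.sogou.com/pics?query=风景',
        'wait': 2,
        'scroll_attempts': 3,
    },
    timeout=90,
)
resp.raise_for_status()
print(resp.json()['page_info']['image_count'])
```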
Scraped items then flow through an item pipeline that downloads, validates, and stores each image:

```python
# pipelines.py
import os
import hashlib
from io import BytesIO

import requests
from PIL import Image


class SogouImagePipeline:
    def __init__(self, storage_path='./images'):
        self.storage_path = storage_path
        if not os.path.exists(storage_path):
            os.makedirs(storage_path)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            storage_path=crawler.settings.get('IMAGES_STORE', './images')
        )

    def process_item(self, item, spider):
        """Download the image, validate it, and save it to disk."""
        try:
            # Derive a unique file name from the URL
            img_url = item['image_url']
            file_hash = hashlib.md5(img_url.encode()).hexdigest()[:8]
            file_name = f"{item['keyword']}_{file_hash}.jpg"
            file_path = os.path.join(self.storage_path, file_name)

            # Download the image
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                'Referer': item.get('referer', 'https://pic.sogou.com/'),
            }
            response = requests.get(img_url, headers=headers, timeout=10)
            response.raise_for_status()

            # Validate the image before writing it
            try:
                img = Image.open(BytesIO(response.content))
                img.verify()  # raises if the data is corrupt

                with open(file_path, 'wb') as f:
                    f.write(response.content)

                # Enrich the item with file metadata
                item['file_name'] = file_name
                item['file_path'] = file_path
                item['file_size'] = len(response.content)
                item['image_format'] = img.format
                item['image_mode'] = img.mode
                item['download_status'] = 'success'
                spider.logger.info(f"Saved image: {file_name}")
            except Exception as e:
                spider.logger.error(f"Invalid image data: {e}")
                item['download_status'] = 'failed'
        except requests.RequestException as e:
            spider.logger.error(f"Download failed: {e}")
            item['download_status'] = 'failed'
        return item
```
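The pipeline only runs once it is registered in settings.py. A sketch, assuming the module path produced by the scrapy startproject layout above:

```python
# settings.py (additions)
ITEM_PIPELINES = {
    'sogou_image_crawler.pipelines.SogouImagePipeline': 300,
}
IMAGES_STORE = './images'  # read by SogouImagePipeline.from_crawler
```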
Finally, a rotating User-Agent middleware makes the crawler's requests look less uniform:

```python
# middlewares.py
import random

from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware


class RotateUserAgentMiddleware(UserAgentMiddleware):
    """Attach a randomly chosen User-Agent to every request."""

    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
    ]

    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(self.user_agents)
        request.headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        request.headers['Accept-Language'] = 'zh-CN,zh;q=0.9,en;q=0.8'
```
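Like the pipeline, the middleware must be registered before it takes effect. A sketch merging it into the DOWNLOADER_MIDDLEWARES configured earlier (the module path and the priority value 400 are assumptions):

```python
# settings.py (replaces the earlier DOWNLOADER_MIDDLEWARES dict)
DOWNLOADER_MIDDLEWARES = {
    'sogou_image_crawler.middlewares.RotateUserAgentMiddleware': 400,
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
```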
This article has walked through a Splash-based approach to crawling Sogou's dynamically rendered image pages. Splash's JavaScript rendering capability lets us handle the dynamic content loading that modern web applications rely on. The key techniques covered include:

- Deploying Splash as a Docker service and verifying it through its HTTP API
- Wiring the scrapy-splash middlewares, dupefilter, and cache storage into a Scrapy project
- Lua scripts that scroll the page and extract lazily loaded image data
- An item pipeline that downloads images, validates them with Pillow, and records file metadata
- A rotating User-Agent middleware to reduce the chance of being blocked
This approach is not limited to Sogou Images; it generalizes to other sites that render content with JavaScript. In practice, adjust the crawling strategy to your specific needs, and always comply with applicable laws and the target site's terms of service.
Original statement: This article is published on the Tencent Cloud Developer Community with the author's authorization. Reproduction without permission is prohibited.
In case of infringement, please contact cloudcommunity@tencent.com for removal.