现在在维护一个反反爬虫技术的项目,有朋友提交了一个采集网站,普通请求和scrapy都无法拿到数据,我来尝试了一波
仓库地址在:https://github.com/KCPClub/Anti-Anti-Spider 本次代码在phantomjs目录下
确保你安装好了phantomjs 工具,采集目标是:http://www.shilladfs.com/estore/kr/zh/Domestic-Brand/Skin-Care/Basic-Skin-Care/p/359582
首先用代码一来尝试发起具备js解析能力的请求: https://github.com/KCPClub/Anti-Anti-Spider/blob/master/phantomjs/get_page_Source_Code/request.js
运行:
phantomjs request.js http://www.shilladfs.com/estore/kr/zh/Domestic-Brand/Skin-Care/Basic-Skin-Care/p/359582
/***********************************
code:javascript
system:win || linux
author: luyi
mail : **@qq.com
github: luyishisi
blog: https://www.urlteam.org
date:2016.9.12
逻辑说明:使用phantomjs无界面浏览器作为操作平台,破解对方针对js解析的反爬虫辨别
************************************/
// Load a URL in the PhantomJS headless browser and print the resulting page
// source to stdout.
//
// Usage: phantomjs request.js URL
// Exit code: 0 when the load status is 'success', 1 otherwise (including a
// missing URL argument).
var page = require('webpage').create();
var system = require('system');
var address = system.args[1];

if (system.args.length < 2) {
    // Without a URL there is nothing to load; fail fast instead of calling
    // page.open(undefined).
    console.log('Usage: request.js URL');
    phantom.exit(1);
} else {
    // init and settings
    page.settings.resourceTimeout = 30000;
    page.settings.XSSAuditingEnabled = true;
    //page.viewportSize = { width: 1000, height: 1000 };
    // Send a desktop-Chrome user agent and matching request headers.
    page.settings.userAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36';
    page.customHeaders = {
        "Connection" : "keep-alive",
        "Cache-Control" : "max-age=0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6"
    };

    // NOTE: the handler below is intentionally assigned after page.open(),
    // exactly as before, so the relative order of the two callbacks (and
    // therefore of the log output) is unchanged.
    page.open(address, function () {
        console.log(address);
        console.log('begin');
    });

    // Runs when a page load finishes: print the load status and the page
    // source, then quit. The exit code reflects the load status so that
    // calling scripts can detect a failed fetch.
    page.onLoadFinished = function (status) {
        console.log('Status: ' + status);
        console.log(page.content);
        phantom.exit(status === 'success' ? 0 : 1);
    };
}
返回的情况不乐观,还是一样很短的数据。
再使用截图功能加上延时
使用代码二:https://github.com/KCPClub/Anti-Anti-Spider/blob/master/phantomjs/get_page_printscreen/rasterize.js
phantomjs rasterize.js "http://www.shilladfs.com/estore/kr/zh/Domestic-Brand/Skin-Care/Basic-Skin-Care/p/359582" 800px*800px > wait.html
代码如下:
// Load a URL in PhantomJS and render it to a file, re-rendering on timers so
// that the page is captured after it has had time to change.
//
// Usage: rasterize.js URL filename [paperwidth*paperheight|paperformat] [zoom]
//
// This part never ends the process on success; the countdown timer that
// follows in this script is what eventually calls phantom.exit().
var page = require('webpage').create(),
    system = require('system'),
    address, output, size,
    // Declared here so the size-parsing code below no longer creates
    // implicit globals.
    pageWidth, pageHeight;

// Template for presetting a cookie before the page is opened:
//phantom.addCookie({
//'name': '',
//'value': '',
//'domain': ''
//});

if (system.args.length < 3 || system.args.length > 5) {
    console.log('Usage: rasterize.js URL filename [paperwidth*paperheight|paperformat] [zoom]');
    console.log(' paper (pdf output) examples: "5in*7.5in", "10cm*20cm", "A4", "Letter"');
    console.log(' image (png/jpg output) examples: "1920px" entire page, window width 1920px');
    console.log(' "800px*600px" window, clipped to 800x600');
    phantom.exit(1);
}
else {
    address = system.args[1];
    output = system.args[2];
    //page.viewportSize = { width: 1000, height: 1000 };
    if (system.args.length > 3 && system.args[2].substr(-4) === ".pdf") {
        // PDF output: the third argument is "width*height" or a paper format name.
        size = system.args[3].split('*');
        page.paperSize = size.length === 2 ? { width: size[0], height: size[1], margin: '0px' }
                                           : { format: system.args[3], orientation: 'portrait', margin: '1cm' };
    } else if (system.args.length > 3 && system.args[3].substr(-2) === "px") {
        // Image output: the third argument is "WIDTHpx*HEIGHTpx" or "WIDTHpx".
        size = system.args[3].split('*');
        if (size.length === 2) {
            pageWidth = parseInt(size[0], 10);
            pageHeight = parseInt(size[1], 10);
            page.viewportSize = { width: pageWidth, height: pageHeight };
            // Clip the rendered image to exactly the requested window.
            page.clipRect = { top: 0, left: 0, width: pageWidth, height: pageHeight };
        } else {
            console.log("size:", system.args[3]);
            pageWidth = parseInt(system.args[3], 10);
            pageHeight = parseInt(pageWidth * 3/4, 10); // it's as good an assumption as any
            console.log ("pageHeight:",pageHeight);
            page.viewportSize = { width: pageWidth, height: pageHeight };
        }
    }
    if (system.args.length > 4) {
        page.zoomFactor = system.args[4];
    }
    page.open(address, function (status) {
        if (status !== 'success') {
            console.log('Unable to load the address!');
            phantom.exit(1);
        }
        else {
            console.log('able to load the address!');
            // One second after the load: render to the requested output file
            // and to a fixed-name screenshot. No exit here; the countdown
            // timer ends the process.
            window.setTimeout(function () {
                page.render(output);
                page.render('jietu_6.png');
                console.log('asd')
            }, 1000);
            // Every five seconds: render again so later page states are
            // captured too. No exit here either.
            window.setInterval(function () {
                page.render(output);
                page.render('jietu_8.png');
                console.log('111asd')
            }, 5000);
        }
    });
}
// Countdown (one tick per second, starting when the script starts) that saves
// screenshots at fixed points and finally prints the page source and exits.
// `page` is the webpage object created earlier in this script.
//
// Each tick logs the current value of `t` and then decrements it, so the
// checks below see the value AFTER the decrement:
//   t === 10 -> jietu_1.png
//   t === 5  -> jietu_3.png
//   t === 4  -> jietu_4.png
//   t === 2  -> jietu_5.png
//   t === 0  -> jietu_6.png, print page source, exit
var t = 17;
var interval = setInterval(function () {
    if (t > 0) {
        console.log(t--);
    }
    // Save a different screenshot at different points of the countdown.
    if (t === 0) {
        console.log("jietu_6");
        page.render('jietu_6.png');
        // Print the page source.
        console.log(page.content);
        // Stop ticking before quitting so no further tick can run while
        // PhantomJS shuts down.
        clearInterval(interval);
        phantom.exit();
        return;
    }
    if (t === 2) {
        console.log("jietu_5");
        page.render('jietu_5.png');
    }
    if (t === 4) {
        console.log("jietu_4");
        page.render('jietu_4.png');
    }
    if (t === 5) {
        console.log("jietu_3");
        page.render('jietu_3.png');
    }
    if (t === 10) {
        console.log("jietu——1");
        page.render('jietu_1.png');
        console.log('click_begin');
    }
}, 1000);
完成采集,页面的截图如下:
原创文章,转载请注明: 转载自URl-team
本文链接地址: 使用phantomjs采集运用了强制跳转与页面等待等反爬技术的网站