使用phantomjs采集运用了强制跳转与页面等待等反爬技术的网站

十四君

发布于 2019-11-27 14:54:49

1.1K0

发布于 2019-11-27 14:54:49

文章被收录于专栏：Urlteam

现在在维护一个反反爬虫技术的项目，有朋友提交了一个采集网站，普通请求和scrapy都无法拿到数据，我来尝试了一波

仓库地址在：https://github.com/KCPClub/Anti-Anti-Spider 本次代码在phantomjs目录下

确保你安装好了phantomjs 工具，采集目标是：http://www.shilladfs.com/estore/kr/zh/Domestic-Brand/Skin-Care/Basic-Skin-Care/p/359582

首先用代码一来尝试具备js解析能力的请求： https://github.com/KCPClub/Anti-Anti-Spider/blob/master/phantomjs/get_page_Source_Code/request.js

运行：

phantomjs request.js http://www.shilladfs.com/estore/kr/zh/Domestic-Brand/Skin-Care/Basic-Skin-Care/p/359582

/***********************************
code: javascript
system: win  ||  linux
author: luyi
mail : **@qq.com
github: luyishisi
blog: https://www.urlteam.org
date: 2016.9.12
Purpose: use the PhantomJS headless browser as the request platform so that
the target page's JavaScript is executed, in order to get past anti-crawler
checks that rely on JS parsing. The page source is printed to stdout.
Usage: phantomjs request.js URL
************************************/
var page = require('webpage').create(),
    system = require('system'),
    address;

if (system.args.length < 2) {
    // No URL given: report usage instead of trying to open `undefined`.
    // The rest of the script lives in the else-branch because phantom.exit()
    // does not stop the statements that follow it from running.
    console.log('Usage: request.js URL');
    phantom.exit(1);
} else {
    address = system.args[1];

    // init and settings
    page.settings.resourceTimeout = 30000;
    page.settings.XSSAuditingEnabled = true;
    //page.viewportSize = { width: 1000, height: 1000  };
    // Present a desktop-browser user agent and request headers.
    page.settings.userAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36';
    page.customHeaders = {
        "Connection": "keep-alive",
        "Cache-Control": "max-age=0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6"
    };

    page.open(address, function () {
        console.log(address);
        console.log('begin');
    });

    // Runs when the page has finished loading: print the load status and the
    // page source, then quit.
    // NOTE(review): this handler is intentionally assigned AFTER page.open(),
    // as in the original; the firing order of the two callbacks may depend on
    // registration order - confirm before reordering.
    page.onLoadFinished = function (status) {
        console.log('Status: ' + status);
        console.log(page.content);
        phantom.exit();
    };
}

返回的情况不乐观，还是一样很短的数据。

再使用截图功能加上延时

使用代码二：https://github.com/KCPClub/Anti-Anti-Spider/blob/master/phantomjs/get_page_printscreen/rasterize.js

phantomjs rasterize.js "http://www.shilladfs.com/estore/kr/zh/Domestic-Brand/Skin-Care/Basic-Skin-Care/p/359582" 800px*800px > wait.html

代码如下：

// rasterize.js: load a URL in PhantomJS and render it to the requested output
// file plus fixed-name screenshots taken after a delay.
// Arguments: URL filename [paperwidth*paperheight|paperformat] [zoom]
var page = require('webpage').create(),
    system = require('system'),
    address, output, size,
    pageWidth, pageHeight; // declared here so they do not leak as implicit globals

// Optional: uncomment and fill in to add a cookie before the page is opened.
//phantom.addCookie({
  //'name': '',
  //'value': '',
  //'domain': ''
//});

if (system.args.length < 3 || system.args.length > 5) {
    console.log('Usage: rasterize.js URL filename [paperwidth*paperheight|paperformat] [zoom]');
    console.log('  paper (pdf output) examples: "5in*7.5in", "10cm*20cm", "A4", "Letter"');
    console.log('  image (png/jpg output) examples: "1920px" entire page, window width 1920px');
    console.log('                                   "800px*600px" window, clipped to 800x600');
    phantom.exit(1);
}
else {
    address = system.args[1];
    output = system.args[2];
    //page.viewportSize = { width: 1000, height: 1000 };
    if (system.args.length > 3 && system.args[2].substr(-4) === ".pdf") {
        // PDF output: the third argument is either "width*height" or a paper format name.
        size = system.args[3].split('*');
        page.paperSize = size.length === 2 ? { width: size[0], height: size[1], margin: '0px' }
                                           : { format: system.args[3], orientation: 'portrait', margin: '1cm' };
    } else if (system.args.length > 3 && system.args[3].substr(-2) === "px") {
        // Image output: the third argument is "WIDTHpx*HEIGHTpx" or just "WIDTHpx".
        size = system.args[3].split('*');
        if (size.length === 2) {
            pageWidth = parseInt(size[0], 10);
            pageHeight = parseInt(size[1], 10);
            page.viewportSize = { width: pageWidth, height: pageHeight };
            // Clip the rendered image to exactly the requested window.
            page.clipRect = { top: 0, left: 0, width: pageWidth, height: pageHeight };
        } else {
            console.log("size:", system.args[3]);
            pageWidth = parseInt(system.args[3], 10);
            pageHeight = parseInt(pageWidth * 3/4, 10); // it's as good an assumption as any
            console.log ("pageHeight:",pageHeight);
            page.viewportSize = { width: pageWidth, height: pageHeight };
        }
    }
    if (system.args.length > 4) {
        page.zoomFactor = system.args[4];
    }
    page.open(address, function (status) {
        if (status !== 'success') {
            console.log('Unable to load the address!');
            phantom.exit(1);
        }
        else {
            console.log('able to load the address!');
            // One-shot render one second after the load callback fires.
            // phantom.exit() is deliberately NOT called here: the countdown
            // timer at the end of the script ends the process.
            window.setTimeout(function () {
                page.render(output);
                page.render('jietu_6.png');
                console.log('asd');
            }, 1000);
            // Re-render every five seconds; this also does not exit.
            window.setInterval(function () {
                page.render(output);
                page.render('jietu_8.png');
                console.log('111asd');
            }, 5000);
        }
    });
}
// Countdown in one-second ticks, starting at 17. It starts as soon as the
// script runs (it does not wait for the page load), saves screenshots at
// selected ticks, and finally prints the page source and exits.
var t = 17;
var interval = setInterval(function () {
    if (t > 0) {
        console.log(t--);
    }
    // Keep a different screenshot for each point in time.
    if (t === 0) {
        console.log("jietu_6");
        page.render('jietu_6.png');
        // Print the page source.
        console.log(page.content);
        // Stop ticking so the final step cannot run a second time while
        // PhantomJS shuts down.
        clearInterval(interval);
        phantom.exit();
    }
    if (t === 2) {
        console.log("jietu_5");
        page.render('jietu_5.png');
    }
    if (t === 4) {
        console.log("jietu_4");
        page.render('jietu_4.png');
    }
    if (t === 5) {
        console.log("jietu_3");
        page.render('jietu_3.png');
    }

    if (t === 10) {
        console.log("jietu——1");
        page.render('jietu_1.png');
        console.log('click_begin');
    }
}, 1000);

完成采集，页面的截图如下：