反反爬虫系列(一)

这篇文章来自知乎大佬——不吃夹生饭,一位Python爬虫工程师。


前言

笔者决定写一个系列反反爬虫,目的是站在生产角度如何绕过各类网站的反爬虫,提供反反爬虫思路。

关于工程化,这里笔者暂不提及。希望各位看官能复现我的思路来完成反反爬虫过程,既提升了自己的技术和思路,同时也促使网站迭代自己的反爬虫策略(手动狗头)。

首先我们来解决的网站是 同程旅游的酒店部分。这里设计的反爬虫是 antitoken,一个全局的token。

难度:中等

开发环境:

  1. MacOS 10.14.2
  2. python: python3.5.2
  3. Sublime 支持JavaScript

假设当前需求是:获取该酒店的评论数据

那么我们需要做的事情:

  1. 调研
  2. 开发
  3. 部署并维护

我们着重研究调研部分,进入任意酒店页面可见长这个样的

然后评论部分长这个样的,通过ajax加载

接下来应该做的是观察这个api

确定:

  1. url
  2. 请求方式 get/post
  3. 参数及格式 params/payloads
  4. 请求头 headers

然后这个api长酱色的:

提取出信息

# Comment-list endpoint observed for the AJAX-loaded review section.
url = 'https://www.ly.com/hotel/api/tmapi/comment/list'
# Request headers copied from the captured call.
headers = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Connection': 'keep-alive',
    'Referer': 'https://www.ly.com/HotelInfo-50101461.html?spm0=10002.2001.1.0.1.4.11',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest'
}
# Query parameters of the captured call (all values are sent as strings).
params = {
    'hotelid': '50101461',
    'page': '2',
    'pageSize': '10',
    'commentType': '0',
    'roomTypeId': '',
    'tripPurposeId': '',
    'RankType': '1',
    'mainTagId': '',
    'subTagId': '',
    # Anti-crawler token; this literal is only the sample value captured from
    # one request. The rest of the article shows how to generate it.
    'antitoken': '9f5c5f8c288f4687e965d600e5115808'
}

可以观察到参数里有个antitoken 特别碍眼,接下来我们要开始撸这个antitoken

在调试的时候首先遇到的问题就是断点问题

同程的反爬虫策略:在打开开发者工具时候 debug,同时不断向内存写东西, 一会儿浏览器就卡的不行。大概长这个样

追踪源码可以看到是 leonid-tq-jq-v3-min.js 在耍怪:

好了,各位看官看好了,接下来我们要做的是,就是将这个网页保存下来,然后在leonid-tq-jq-v3-min.js 把我上图里红框里的这个函数内容全部删掉

注意,不要给这段js重新排版

好了,再打开本地的html,再打开开发者工具,不会再debug

接下来,咱们开始研究这个antitoken

首先是找到antitoken,我们先搜索

这里笔者打开的是本地保存后的html页面哈

然后我们可以看见 antitoken 是在 last.js这个文件里,那我们接下来去last.js里看看

嗯,起手就看到antitoken 的生成方法了

接下来是打断点调试

嗯?什么是打断点调试? 呃,这是爬虫工程师的基本功。

我们可以看到先从cookie拿到一个参数的值,没错就是这个 wangba ~ 网吧?王八?

可以看到,下面如果wangba为空,则重新创建一个

var e = $.cookie("wangba");
        e && void 0 !== e || (e = (new Date).getTime().toString()

其实这个e就是个时间戳

好了,我打断点的地方就是这个antitoken的值,现在我们要去看看这个antitoken的生成过程

首先把框里的函数重写,自己定一个e,然后长酱色的

然后调试,两步转跳入antitoken的生成函数

// Bundle module from last.js that produces the hash used as `antitoken`.
// `e` is the module object (its `exports` is assigned at the end), `a` is the
// loader used to fetch modules 117, 56 and 118, and `t` is unused by this
// outer function. The initial state words, per-step constants and round
// functions below are those of MD5.
function(e, t, a) {
    var n, i, o, s, r;
    // n: word/byte helpers (bytesToWords, endian, wordsToBytes, bytesToHex).
    // i / s: UTF-8 and binary string<->byte converters from module 56.
    // o: called as a predicate on non-string input — presumably an
    //    "is this a Buffer?" check; confirm against module 118.
    n = a(117),
    i = a(56).utf8,
    o = a(118),
    s = a(56).bin,
    // r(e, t): hash core. Takes the message `e` and optional options `t`
    // and returns the four 32-bit state words after the final endian swap.
    (r = function(e, t) {
        // Normalise the message to a byte array: strings are converted with the
        // binary or UTF-8 encoder, inputs accepted by o() are copied into a plain
        // array, arrays pass through, anything else is replaced by its toString().
        e.constructor == String ? e = t && "binary" === t.encoding ? s.stringToBytes(e) : i.stringToBytes(e) : o(e) ? e = Array.prototype.slice.call(e, 0) : Array.isArray(e) || (e = e.toString());
        // a: message words, l: message length in bits, c/d/p/u: running state
        // (initialised to 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476).
        // The loop body byte-swaps every message word.
        for (var a = n.bytesToWords(e), l = 8 * e.length, c = 1732584193, d = -271733879, p = -1732584194, u = 271733878, m = 0; m < a.length; m++)
            a[m] = 16711935 & (a[m] << 8 | a[m] >>> 24) | 4278255360 & (a[m] << 24 | a[m] >>> 8);
        // Padding: set the 0x80 marker right after the message and store the bit
        // length in word 14 of the last 16-word block.
        a[l >>> 5] |= 128 << l % 32,
        a[14 + (l + 64 >>> 9 << 4)] = l;
        var f = r._ff
          , h = r._gg
          , v = r._hh
          , g = r._ii;
        // Process the message 16 words at a time.
        for (m = 0; m < a.length; m += 16) {
            // Remember the state at block entry; it is added back after the rounds.
            var y = c
              , _ = d
              , b = p
              , $ = u;
            // 64 steps written as one nested expression: 16 calls each of
            // f (_ff), h (_gg), v (_hh) and g (_ii), evaluated innermost-first.
            d = g(d = g(d = g(d = g(d = v(d = v(d = v(d = v(d = h(d = h(d = h(d = h(d = f(d = f(d = f(d = f(d, p = f(p, u = f(u, c = f(c, d, p, u, a[m + 0], 7, -680876936), d, p, a[m + 1], 12, -389564586), c, d, a[m + 2], 17, 606105819), u, c, a[m + 3], 22, -1044525330), p = f(p, u = f(u, c = f(c, d, p, u, a[m + 4], 7, -176418897), d, p, a[m + 5], 12, 1200080426), c, d, a[m + 6], 17, -1473231341), u, c, a[m + 7], 22, -45705983), p = f(p, u = f(u, c = f(c, d, p, u, a[m + 8], 7, 1770035416), d, p, a[m + 9], 12, -1958414417), c, d, a[m + 10], 17, -42063), u, c, a[m + 11], 22, -1990404162), p = f(p, u = f(u, c = f(c, d, p, u, a[m + 12], 7, 1804603682), d, p, a[m + 13], 12, -40341101), c, d, a[m + 14], 17, -1502002290), u, c, a[m + 15], 22, 1236535329), p = h(p, u = h(u, c = h(c, d, p, u, a[m + 1], 5, -165796510), d, p, a[m + 6], 9, -1069501632), c, d, a[m + 11], 14, 643717713), u, c, a[m + 0], 20, -373897302), p = h(p, u = h(u, c = h(c, d, p, u, a[m + 5], 5, -701558691), d, p, a[m + 10], 9, 38016083), c, d, a[m + 15], 14, -660478335), u, c, a[m + 4], 20, -405537848), p = h(p, u = h(u, c = h(c, d, p, u, a[m + 9], 5, 568446438), d, p, a[m + 14], 9, -1019803690), c, d, a[m + 3], 14, -187363961), u, c, a[m + 8], 20, 1163531501), p = h(p, u = h(u, c = h(c, d, p, u, a[m + 13], 5, -1444681467), d, p, a[m + 2], 9, -51403784), c, d, a[m + 7], 14, 1735328473), u, c, a[m + 12], 20, -1926607734), p = v(p, u = v(u, c = v(c, d, p, u, a[m + 5], 4, -378558), d, p, a[m + 8], 11, -2022574463), c, d, a[m + 11], 16, 1839030562), u, c, a[m + 14], 23, -35309556), p = v(p, u = v(u, c = v(c, d, p, u, a[m + 1], 4, -1530992060), d, p, a[m + 4], 11, 1272893353), c, d, a[m + 7], 16, -155497632), u, c, a[m + 10], 23, -1094730640), p = v(p, u = v(u, c = v(c, d, p, u, a[m + 13], 4, 681279174), d, p, a[m + 0], 11, -358537222), c, d, a[m + 3], 16, -722521979), u, c, a[m + 6], 23, 76029189), p = v(p, u = v(u, c = v(c, d, p, u, a[m + 9], 4, -640364487), d, p, a[m + 12], 11, -421815835), c, d, a[m + 15], 
16, 530742520), u, c, a[m + 2], 23, -995338651), p = g(p, u = g(u, c = g(c, d, p, u, a[m + 0], 6, -198630844), d, p, a[m + 7], 10, 1126891415), c, d, a[m + 14], 15, -1416354905), u, c, a[m + 5], 21, -57434055), p = g(p, u = g(u, c = g(c, d, p, u, a[m + 12], 6, 1700485571), d, p, a[m + 3], 10, -1894986606), c, d, a[m + 10], 15, -1051523), u, c, a[m + 1], 21, -2054922799), p = g(p, u = g(u, c = g(c, d, p, u, a[m + 8], 6, 1873313359), d, p, a[m + 15], 10, -30611744), c, d, a[m + 6], 15, -1560198380), u, c, a[m + 13], 21, 1309151649), p = g(p, u = g(u, c = g(c, d, p, u, a[m + 4], 6, -145523070), d, p, a[m + 11], 10, -1120210379), c, d, a[m + 2], 15, 718787259), u, c, a[m + 9], 21, -343485551),
            // Add the block-entry state back in, wrapping to unsigned 32 bits.
            c = c + y >>> 0,
            d = d + _ >>> 0,
            p = p + b >>> 0,
            u = u + $ >>> 0
        }
        return n.endian([c, d, p, u])
    }
    // The four step helpers share one shape: combine `e` with a bitwise mix of
    // (t, a, n), the message word `i` and the constant `s`, rotate the sum left
    // by `o` bits, then add `t`. Only the bitwise mix differs between them.
    )._ff = function(e, t, a, n, i, o, s) {
        var r = e + (t & a | ~t & n) + (i >>> 0) + s;
        return (r << o | r >>> 32 - o) + t
    }
    ,
    r._gg = function(e, t, a, n, i, o, s) {
        var r = e + (t & n | a & ~n) + (i >>> 0) + s;
        return (r << o | r >>> 32 - o) + t
    }
    ,
    r._hh = function(e, t, a, n, i, o, s) {
        var r = e + (t ^ a ^ n) + (i >>> 0) + s;
        return (r << o | r >>> 32 - o) + t
    }
    ,
    r._ii = function(e, t, a, n, i, o, s) {
        var r = e + (a ^ (t | ~n)) + (i >>> 0) + s;
        return (r << o | r >>> 32 - o) + t
    }
    ,
    r._blocksize = 16,
    r._digestsize = 16,
    // Public entry point: rejects null/undefined, then returns the digest as a
    // byte array (t.asBytes), a binary string (t.asString) or, by default, hex.
    e.exports = function(e, t) {
        if (e === undefined || null === e)
            throw new Error("Illegal argument " + e);
        var a = n.wordsToBytes(r(e, t));
        return t && t.asBytes ? a : t && t.asString ? s.bytesToString(a) : n.bytesToHex(a)
    }
}

不可避免的,我们需要知道这个函数里各个参数(n,i,o,s)的含义,那只有继续打断点调试咯

一个一个看

首先是 n

n = a(117)

这个a又是啥,在该处打断点,刷新,在console里看

原来 a(117)是一个函数,那么我们就去回到last.js拿到这个函数

然后n这个参数长这个样

    // Word/byte helper object returned by a(117). "Word" means a 32-bit integer.
    n = {
        // Rotate the 32-bit value e left by t bits.
        rotl: function(e, t) {
            return e << t | e >>> 32 - t
        },
        // Rotate the 32-bit value e right by t bits.
        rotr: function(e, t) {
            return e << 32 - t | e >>> t
        },
        // Reverse the byte order of a number; for an array, swap every element
        // in place and return the same array.
        endian: function(e) {
            if (e.constructor == Number)
                return 16711935 & n.rotl(e, 8) | 4278255360 & n.rotl(e, 24);
            for (var t = 0; t < e.length; t++)
                e[t] = n.endian(e[t]);
            return e
        },
        // Return an array of e pseudo-random integers in 0..255 (Math.random).
        randomBytes: function(e) {
            for (var t = []; e > 0; e--)
                t.push(Math.floor(256 * Math.random()));
            return t
        },
        // Pack bytes into words, first byte in the most significant position.
        // Note: the loop variable `n` shadows the helper object inside this method.
        bytesToWords: function(e) {
            for (var t = [], a = 0, n = 0; a < e.length; a++,
            n += 8)
                t[n >>> 5] |= e[a] << 24 - n % 32;
            return t
        },
        // Unpack words into bytes, most significant byte first.
        wordsToBytes: function(e) {
            for (var t = [], a = 0; a < 32 * e.length; a += 8)
                t.push(e[a >>> 5] >>> 24 - a % 32 & 255);
            return t
        },
        // Render each byte as two hexadecimal digits (lowercase).
        bytesToHex: function(e) {
            for (var t = [], a = 0; a < e.length; a++)
                t.push((e[a] >>> 4).toString(16)),
                t.push((15 & e[a]).toString(16));
            return t.join("")
        },
        // Parse a hex string, two characters per byte.
        hexToBytes: function(e) {
            for (var t = [], a = 0; a < e.length; a += 2)
                t.push(parseInt(e.substr(a, 2), 16));
            return t
        },
        // Encode bytes as base64 with "=" padding.
        // NOTE(review): `a` is not defined in this excerpt; it is presumably the
        // 64-character base64 alphabet string from the enclosing module scope —
        // confirm before calling this helper outside the bundle.
        bytesToBase64: function(e) {
            for (var t = [], n = 0; n < e.length; n += 3)
                for (var i = e[n] << 16 | e[n + 1] << 8 | e[n + 2], o = 0; o < 4; o++)
                    8 * n + 6 * o <= 8 * e.length ? t.push(a.charAt(i >>> 6 * (3 - o) & 63)) : t.push("=");
            return t.join("")
        },
        // Decode base64 after stripping characters outside A-Z, a-z, 0-9, "+", "/".
        // NOTE(review): relies on the same external `a` as bytesToBase64.
        base64ToBytes: function(e) {
            e = e.replace(/[^A-Z0-9+\/]/gi, "");
            for (var t = [], n = 0, i = 0; n < e.length; i = ++n % 4)
                0 != i && t.push((a.indexOf(e.charAt(n - 1)) & Math.pow(2, -2 * i + 8) - 1) << 2 * i | a.indexOf(e.charAt(n)) >>> 6 - 2 * i);
            return t
        }
    }

然后是 i和s这两个参数

    i = a(56).utf8,
    s = a(56).bin,

我们需要去看看a(56)又是啥玩意

可见a(56)也是一个函数

这时候拿出a(56),然后我们单独定义一个a56

// Stand-alone copy of the object returned by a(56): string <-> byte-array
// converters. `bin` maps each character to one byte (low 8 bits of its char
// code); `utf8` encodes/decodes UTF-8 on top of `bin`.
var a56 = {
    utf8: {
        // String -> UTF-8 byte array.
        stringToBytes: function(e) {
            // Inside the bundle this object was called `a`; after renaming it to
            // `a56` the self-reference has to follow, otherwise `a` is undefined.
            return a56.bin.stringToBytes(unescape(encodeURIComponent(e)))
        },
        // UTF-8 byte array -> string.
        bytesToString: function(e) {
            return decodeURIComponent(escape(a56.bin.bytesToString(e)))
        }
    },
    bin: {
        // String -> byte array, keeping only the low 8 bits of every char code.
        stringToBytes: function(e) {
            for (var t = [], a = 0; a < e.length; a++)
                t.push(255 & e.charCodeAt(a));
            return t
        },
        // Byte array -> string, one character per byte.
        bytesToString: function(e) {
            for (var t = [], a = 0; a < e.length; a++)
                t.push(String.fromCharCode(e[a]));
            return t.join("")
        }
    }
};
// The two names the hash function expects for these converters.
var i = a56.utf8;
var s = a56.bin;

好了,接下来就是o哦这个参数

o = a(118)

嗯,我们再看 a(118)

可见 o这个参数值为null就行了(因为输入是字符串时,代码根本不会调用 o(e);如果传入的不是字符串,这里就会报错)

// NOTE(review): the hash function calls `o(e)` whenever its input is not a
// string, so a null placeholder only works for string input.
var o = null

接下来,我们把这几个参数替换回去

笔者这里是写到Sublime里 然后运行

Sublime得自己去配置支持JavaScript哈

报错了诶

还有个t参数

那么t参数又是个啥呢

打断点调试咯

可见

t = undefined

那么在脚本里添加一个

// `t` is the optional options argument of the hash function; a falsy value
// selects UTF-8 input handling and the default hex-string output.
var t = null

再运行

OK!,拿到antitoken了

接下来,我们去做个验证

随便打开一个同程的酒店,随便看一页评论,在cookie拿到 wangba 的值,同时看antitoken 的值

然后把wangba=1547187485089里的值带入改写后的脚本里运行

好了,咱们一起就这么把同程的反爬虫攻克了。

脚本代码在文末

// Seed for the token: the current time in milliseconds, as a string. Any
// value can be supplied instead. Declared with `var` so the script also runs
// in strict mode / as an ES module (a bare `e = ...` throws there).
var e = (new Date()).getTime().toString();
// To reproduce the verification from the article, use the captured cookie value:
// e = "1547187485089"

/**
 * Build the `antitoken` request parameter: the lowercase hexadecimal MD5
 * digest of `e`.
 *
 * The helpers are trimmed copies of what last.js loads through a(56), a(117)
 * and a(118); only the functions the hash actually needs are kept.
 *
 * @param {string|number|number[]|Buffer} e - Value to hash, normally the
 *     `wangba` cookie (a millisecond timestamp). Strings are hashed as UTF-8,
 *     byte arrays and Buffers are hashed as-is, and any other value is hashed
 *     via the UTF-8 bytes of its string form.
 * @returns {string} 32-character lowercase hex digest.
 * @throws {Error} "Illegal argument …" when `e` is null or undefined.
 */
function antitoken(e){
    // Same guard as the exported function of the original module.
    if (e === undefined || e === null) {
        throw new Error("Illegal argument " + e);
    }

    // --- string -> bytes (from a(56)) ---
    // One byte per character: the low 8 bits of each char code.
    function binStringToBytes(str) {
        for (var out = [], k = 0; k < str.length; k++)
            out.push(255 & str.charCodeAt(k));
        return out;
    }
    // UTF-8 encode first so non-ASCII characters become multi-byte sequences.
    function utf8StringToBytes(str) {
        return binStringToBytes(unescape(encodeURIComponent(str)));
    }

    // --- stand-in for a(118) ---
    // The hash core calls this as a predicate on non-string input, so it must be
    // a function (a null placeholder crashes for anything but strings).
    function isBuffer(value) {
        return !!value.constructor &&
            typeof value.constructor.isBuffer === "function" &&
            value.constructor.isBuffer(value);
    }

    // --- word/byte helpers (from a(117)) ---
    function rotl(word, bits) {
        return word << bits | word >>> 32 - bits;
    }
    // Reverse byte order of a number, or of every element of an array in place.
    function endian(value) {
        if (value.constructor == Number)
            return 16711935 & rotl(value, 8) | 4278255360 & rotl(value, 24);
        for (var k = 0; k < value.length; k++)
            value[k] = endian(value[k]);
        return value;
    }
    // Pack bytes into 32-bit words, first byte most significant.
    function bytesToWords(bytes) {
        for (var words = [], k = 0, bit = 0; k < bytes.length; k++, bit += 8)
            words[bit >>> 5] |= bytes[k] << 24 - bit % 32;
        return words;
    }
    // Unpack 32-bit words into bytes, most significant byte first.
    function wordsToBytes(words) {
        for (var out = [], bit = 0; bit < 32 * words.length; bit += 8)
            out.push(words[bit >>> 5] >>> 24 - bit % 32 & 255);
        return out;
    }
    function bytesToHex(bytes) {
        for (var out = [], k = 0; k < bytes.length; k++) {
            out.push((bytes[k] >>> 4).toString(16));
            out.push((15 & bytes[k]).toString(16));
        }
        return out.join("");
    }

    // --- the four MD5 step functions ---
    // Shared shape: acc + mix(x, y, z) + message word + constant, rotated left by
    // `shift`, plus x. Only the bitwise mix differs.
    function ff(acc, x, y, z, word, shift, k) {
        var sum = acc + (x & y | ~x & z) + (word >>> 0) + k;
        return (sum << shift | sum >>> 32 - shift) + x;
    }
    function gg(acc, x, y, z, word, shift, k) {
        var sum = acc + (x & z | y & ~z) + (word >>> 0) + k;
        return (sum << shift | sum >>> 32 - shift) + x;
    }
    function hh(acc, x, y, z, word, shift, k) {
        var sum = acc + (x ^ y ^ z) + (word >>> 0) + k;
        return (sum << shift | sum >>> 32 - shift) + x;
    }
    function ii(acc, x, y, z, word, shift, k) {
        var sum = acc + (y ^ (x | ~z)) + (word >>> 0) + k;
        return (sum << shift | sum >>> 32 - shift) + x;
    }

    // --- normalise the input to a byte array ---
    var bytes;
    if (e.constructor == String) {
        bytes = utf8StringToBytes(e);
    } else if (isBuffer(e)) {
        bytes = Array.prototype.slice.call(e, 0);
    } else if (Array.isArray(e)) {
        bytes = e;
    } else {
        // e.g. a numeric timestamp: hash the bytes of its decimal string form.
        bytes = utf8StringToBytes(e.toString());
    }

    // --- hash core ---
    // The single-letter names are kept from the extracted code so the round
    // chain and its constants can stay verbatim:
    //   a = message words, l = message length in bits, m = word index,
    //   c/d/p/u = running state (0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476).
    // The first loop byte-swaps every message word.
    for (var a = bytesToWords(bytes), l = 8 * bytes.length, c = 1732584193, d = -271733879, p = -1732584194, u = 271733878, m = 0; m < a.length; m++)
        a[m] = 16711935 & (a[m] << 8 | a[m] >>> 24) | 4278255360 & (a[m] << 24 | a[m] >>> 8);
    // Padding: 0x80 marker after the message, bit length in word 14 of the last block.
    a[l >>> 5] |= 128 << l % 32;
    a[14 + (l + 64 >>> 9 << 4)] = l;
    var f = ff
      , h = gg
      , v = hh
      , g = ii;
    for (m = 0; m < a.length; m += 16) {
        // State at block entry; added back after the 64 steps.
        var y = c
          , _ = d
          , b = p
          , $ = u;
        // 64 steps as one nested expression (16 each of f, h, v, g), evaluated
        // innermost-first.
        d = g(d = g(d = g(d = g(d = v(d = v(d = v(d = v(d = h(d = h(d = h(d = h(d = f(d = f(d = f(d = f(d, p = f(p, u = f(u, c = f(c, d, p, u, a[m + 0], 7, -680876936), d, p, a[m + 1], 12, -389564586), c, d, a[m + 2], 17, 606105819), u, c, a[m + 3], 22, -1044525330), p = f(p, u = f(u, c = f(c, d, p, u, a[m + 4], 7, -176418897), d, p, a[m + 5], 12, 1200080426), c, d, a[m + 6], 17, -1473231341), u, c, a[m + 7], 22, -45705983), p = f(p, u = f(u, c = f(c, d, p, u, a[m + 8], 7, 1770035416), d, p, a[m + 9], 12, -1958414417), c, d, a[m + 10], 17, -42063), u, c, a[m + 11], 22, -1990404162), p = f(p, u = f(u, c = f(c, d, p, u, a[m + 12], 7, 1804603682), d, p, a[m + 13], 12, -40341101), c, d, a[m + 14], 17, -1502002290), u, c, a[m + 15], 22, 1236535329), p = h(p, u = h(u, c = h(c, d, p, u, a[m + 1], 5, -165796510), d, p, a[m + 6], 9, -1069501632), c, d, a[m + 11], 14, 643717713), u, c, a[m + 0], 20, -373897302), p = h(p, u = h(u, c = h(c, d, p, u, a[m + 5], 5, -701558691), d, p, a[m + 10], 9, 38016083), c, d, a[m + 15], 14, -660478335), u, c, a[m + 4], 20, -405537848), p = h(p, u = h(u, c = h(c, d, p, u, a[m + 9], 5, 568446438), d, p, a[m + 14], 9, -1019803690), c, d, a[m + 3], 14, -187363961), u, c, a[m + 8], 20, 1163531501), p = h(p, u = h(u, c = h(c, d, p, u, a[m + 13], 5, -1444681467), d, p, a[m + 2], 9, -51403784), c, d, a[m + 7], 14, 1735328473), u, c, a[m + 12], 20, -1926607734), p = v(p, u = v(u, c = v(c, d, p, u, a[m + 5], 4, -378558), d, p, a[m + 8], 11, -2022574463), c, d, a[m + 11], 16, 1839030562), u, c, a[m + 14], 23, -35309556), p = v(p, u = v(u, c = v(c, d, p, u, a[m + 1], 4, -1530992060), d, p, a[m + 4], 11, 1272893353), c, d, a[m + 7], 16, -155497632), u, c, a[m + 10], 23, -1094730640), p = v(p, u = v(u, c = v(c, d, p, u, a[m + 13], 4, 681279174), d, p, a[m + 0], 11, -358537222), c, d, a[m + 3], 16, -722521979), u, c, a[m + 6], 23, 76029189), p = v(p, u = v(u, c = v(c, d, p, u, a[m + 9], 4, -640364487), d, p, a[m + 12], 11, -421815835), c, d, a[m + 15], 
16, 530742520), u, c, a[m + 2], 23, -995338651), p = g(p, u = g(u, c = g(c, d, p, u, a[m + 0], 6, -198630844), d, p, a[m + 7], 10, 1126891415), c, d, a[m + 14], 15, -1416354905), u, c, a[m + 5], 21, -57434055), p = g(p, u = g(u, c = g(c, d, p, u, a[m + 12], 6, 1700485571), d, p, a[m + 3], 10, -1894986606), c, d, a[m + 10], 15, -1051523), u, c, a[m + 1], 21, -2054922799), p = g(p, u = g(u, c = g(c, d, p, u, a[m + 8], 6, 1873313359), d, p, a[m + 15], 10, -30611744), c, d, a[m + 6], 15, -1560198380), u, c, a[m + 13], 21, 1309151649), p = g(p, u = g(u, c = g(c, d, p, u, a[m + 4], 6, -145523070), d, p, a[m + 11], 10, -1120210379), c, d, a[m + 2], 15, 718787259), u, c, a[m + 9], 21, -343485551),
        // Fold the block-entry state back in, wrapping to unsigned 32 bits.
        c = c + y >>> 0,
        d = d + _ >>> 0,
        p = p + b >>> 0,
        u = u + $ >>> 0;
    }

    // Swap the state words back to output byte order and render as hex.
    return bytesToHex(wordsToBytes(endian([c, d, p, u])));
}

// Summon the beast: print the antitoken generated for e
console.log(antitoken(e))

看到这里,都是真爱

谢谢捧场

阅读原文可以关注大佬

原文发布于微信公众号 - Python爬虫与算法进阶(zhangslob)

原文发表时间:2019-01-23

本文参与腾讯云自媒体分享计划,欢迎正在阅读的你也加入,一起分享。

发表于

我来说两句

0 条评论
登录 后参与评论

扫码关注云+社区

领取腾讯云代金券