I'm not a big fan of this library, so don't expect much affection for it in this article. For crawlers I still think Python 3 is the way to go, and that's that...
https://nmap.org/nsedoc/lib/httpspider.html
httpspider is a smallish library that provides basic spidering capabilities. According to the nsedoc linked above, it consists of the Options, LinkExtractor, URL, UrlQueue, and Crawler classes.
Below is a simple usage example (taken from the nsedoc):
local httpspider = require "httpspider"

local crawler = httpspider.Crawler:new( host, port, '/', { scriptname = SCRIPT_NAME } )
crawler:set_timeout(10000)

local result
while(true) do
  local status, r = crawler:crawl()
  if ( not(status) ) then
    break
  end
  -- str_match is a Lua pattern defined elsewhere; stop on the first
  -- page whose body matches it
  if ( r.response.body:match(str_match) ) then
    crawler:stop()
    result = r.url
    break
  end
end

return result
Below is an example where we override the default withinhost method and only allow crawling of resources within the host that are not "js" or "css":
crawler.options.withinhost = function(url)
  if crawler:iswithinhost(url)
    and not crawler:isresource(url, "js")
    and not crawler:isresource(url, "css") then
    return true
  end
end
The options table passed to the new method can contain the following parameters (per the nsedoc): noblacklist, maxdepth, maxpagecount, withinhost, withindomain, doscraping, base_url, timeout, and redirect_ok. A sketch of a call that sets the common ones follows.
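Here is a minimal sketch (the option names follow the nsedoc; the values are purely illustrative, not defaults you must use):

local httpspider = require "httpspider"

local crawler = httpspider.Crawler:new(host, port, '/', {
  scriptname   = SCRIPT_NAME, -- lets users tune options per script
  maxdepth     = 3,           -- directories below the start url to follow
  maxpagecount = 20,          -- stop after visiting this many pages
  withinhost   = true,        -- stay on the scanned host
  withindomain = false,       -- don't widen the scope to the whole domain
})
crawler:set_timeout(10000)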
Although I work with Nmap, I still don't recommend this library for crawling.
Judging from how the various scripts call it, the options argument is rarely used; most calls simply fill in host, port, and url and leave everything else at its default.
Let's see how many scripts actually use httpspider; one way to count is sketched below.
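A minimal counting sketch, assuming a Unix-like system with the scripts installed under /usr/share/nmap/scripts (that path is an assumption; adjust for your install):

-- Count http* scripts and how many of them require httpspider.
-- The script directory below is an assumption; adjust as needed.
local dir = "/usr/share/nmap/scripts"
local total, spiders = 0, 0
local ls = io.popen("ls " .. dir .. "/http*.nse")
for path in ls:lines() do
  total = total + 1
  local f = io.open(path, "r")
  if f then
    local body = f:read("*a")
    f:close()
    if body:match('require%s+"httpspider"') then
      spiders = spiders + 1
    end
  end
end
ls:close()
print(("http scripts: %d, using httpspider: %d"):format(total, spiders))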
As you can see, there are 134 HTTP-related scripts in total, and 22 of them call httpspider. Let's pick one to look at.
Take the http-backup-finder script as an example; it checks for leaked backup copies of files.
local coroutine = require "coroutine"
local http = require "http"
local httpspider = require "httpspider"
local shortport = require "shortport"
local stdnse = require "stdnse"
local table = require "table"
local url = require "url"
description = [[
Spiders a website and attempts to identify backup copies of discovered files.
It does so by requesting a number of different combinations of the filename (eg. index.bak, index.html~, copy of index.html).
]]
---
-- @usage
-- nmap --script=http-backup-finder <target>
--
-- @output
-- PORT STATE SERVICE REASON
-- 80/tcp open http syn-ack
-- | http-backup-finder:
-- | Spidering limited to: maxdepth=3; maxpagecount=20; withindomain=example.com
-- | http://example.com/index.bak
-- | http://example.com/login.php~
-- | http://example.com/index.php~
-- |_ http://example.com/help.bak
--
-- @args http-backup-finder.maxdepth the maximum amount of directories beneath
-- the initial url to spider. A negative value disables the limit.
-- (default: 3)
-- @args http-backup-finder.maxpagecount the maximum amount of pages to visit.
-- A negative value disables the limit (default: 20)
-- @args http-backup-finder.url the url to start spidering. This is a URL
-- relative to the scanned host eg. /default.html (default: /)
-- @args http-backup-finder.withinhost only spider URLs within the same host.
-- (default: true)
-- @args http-backup-finder.withindomain only spider URLs within the same
-- domain. This widens the scope from <code>withinhost</code> and can
-- not be used in combination. (default: false)
--
author = "Patrik Karlsson"
license = "Same as Nmap--See https://nmap.org/book/man-legal.html"
categories = {"discovery", "safe"}
portrule = shortport.http
local function backupNames(filename)
  local function createBackupNames()
    local dir = filename:match("^(.*/)") or ""
    local basename, suffix = filename:match("([^/]*)%.(.*)$")

    local backup_names = {}
    if basename then
      table.insert(backup_names, "{basename}.bak") -- generic bak file
    end
    if basename and suffix then
      table.insert(backup_names, "{basename}.{suffix}~") -- emacs
      table.insert(backup_names, "{basename} copy.{suffix}") -- mac copy
      table.insert(backup_names, "Copy of {basename}.{suffix}") -- windows copy
      table.insert(backup_names, "Copy (2) of {basename}.{suffix}") -- windows second copy
      table.insert(backup_names, "{basename}.{suffix}.1") -- generic backup
      table.insert(backup_names, "{basename}.{suffix}.~1~") -- bzr --revert residue
    end

    local replace_patterns = {
      ["{filename}"] = filename,
      ["{basename}"] = basename,
      ["{suffix}"] = suffix,
    }

    for _, name in ipairs(backup_names) do
      local backup_name = name
      for p, v in pairs(replace_patterns) do
        backup_name = backup_name:gsub(p,v)
      end
      coroutine.yield(dir .. backup_name)
    end
  end
  return coroutine.wrap(createBackupNames)
end
action = function(host, port)
  local crawler = httpspider.Crawler:new(host, port, nil, { scriptname = SCRIPT_NAME } )
  crawler:set_timeout(10000)

  -- Identify servers that answer 200 to invalid HTTP requests and exit as these would invalidate the tests
  local status_404, result_404, known_404 = http.identify_404(host,port)
  if ( status_404 and result_404 == 200 ) then
    stdnse.debug1("Exiting due to ambiguous response from web server on %s:%s. All URIs return status 200.", host.ip, port.number)
    return nil
  end

  -- Check if we can use HEAD requests
  local use_head = http.can_use_head(host, port, result_404)

  local backups = {}
  while(true) do
    local status, r = crawler:crawl()
    -- if the crawler fails it can be due to a number of different reasons
    -- most of them are "legitimate" and should not be reason to abort
    if ( not(status) ) then
      if ( r.err ) then
        return stdnse.format_output(false, r.reason)
      else
        break
      end
    end

    -- parse the returned url
    local parsed = url.parse(tostring(r.url))

    -- handle case where only hostname was provided
    if ( parsed.path == nil ) then
      parsed.path = '/'
    end

    -- only pursue links that have something looking as a file
    if ( parsed.path:match(".*%.*.$") ) then
      -- iterate over possible backup files
      for link in backupNames(parsed.path) do
        local host = parsed.host
        local port = parsed.port or url.get_default_port(parsed.scheme)

        -- the url.escape doesn't work here as it encodes / to %2F
        -- which results in 400 bad request, so we simple do a space
        -- replacement instead.
        local escaped_link = link:gsub(" ", "%%20")

        local response
        if(use_head) then
          response = http.head(host, port, escaped_link, {redirect_ok=false})
        else
          response = http.get(host, port, escaped_link, {redirect_ok=false})
        end

        if http.page_exists(response, result_404, known_404, escaped_link, false) then
          if ( not(parsed.port) ) then
            table.insert(backups,
              ("%s://%s%s"):format(parsed.scheme, host, link))
          else
            table.insert(backups,
              ("%s://%s:%d%s"):format(parsed.scheme, host, port, link))
          end
        end
      end
    end
  end

  if ( #backups > 0 ) then
    backups.name = crawler:getLimitations()
    return stdnse.format_output(true, backups)
  end
end
In the action function you can see a crawler being instantiated, with the timeout set to 10000 ms:
local crawler = httpspider.Crawler:new(host, port, nil, { scriptname = SCRIPT_NAME } )
crawler:set_timeout(10000)
After that, the crawl method is called in a loop:
local status, r = crawler:crawl()
The return values are a status and a table; let's dump the table to see what's inside.
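A minimal sketch of such a dump (done here with stdnse.debug1, the same logging helper the script already uses; a plain print would work in a standalone test):

local status, r = crawler:crawl()
if status then
  -- list every top-level key of the returned table
  for k, v in pairs(r) do
    stdnse.debug1("key=%s value=%s", k, tostring(v))
  end
end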
You can see it has two keys: response and url.
Finally, the getLimitations method is called to retrieve the crawl's limit settings.
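In http-backup-finder that string ends up as the group name of the results table; a sketch (the example string mirrors the @output block shown above):

-- getLimitations() returns a printable summary of the active limits
backups.name = crawler:getLimitations()
-- e.g. "Spidering limited to: maxdepth=3; maxpagecount=20; withindomain=example.com"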
And that's about all there is to using it.