I am trying to follow all of a site's internal links while recording both its internal and external links. I have only just started working with Scrapy and I cannot figure out how to crawl a site while following all of its internal links.
It only fetches the links at depth 1 but never follows them.
import os
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import Rule, CrawlSpider
from scrapy.linkextractors import LinkExtractor
from brs.items import BrsItem

class BRS(CrawlSpider):
    name = "brs"
    rules = (Rule(SgmlLinkExtractor(allow=()), callback='parse_obj', follow=True),)

    def __init__(self):
        global start_urls
        #settings.overrides['DEPTH_LIMIT'] = 10
        path = os.path.dirname(os.path.abspath(__file__))
        with open(os.path.join(path, "urls.txt"), "rt") as f:
            self.start_urls = filter(None, [url.strip() for url in f.readlines()])
        start_urls = self.start_urls

    def parse(self, response):
        brsitem = BrsItem()
        brsitem['url'] = response.url
        internal = LinkExtractor(allow_domains=[response.url])
        external = LinkExtractor(deny_domains=[response.url])
        links = internal.extract_links(response)
        internal = []
        fd = open('output.txt', 'a+')
        for link in links:
            internal.append(link.url)
        links = external.extract_links(response)
        external = []
        for link in links:
            external.append(link.url)
        for link in internal:
            fd.write(link + "\tinternal\n")
        for link in external:
            fd.write(link + "\texternal\n")
        return brsitem

As of now, my urls.txt contains: http://www.stackoverflow.com
Any help is greatly appreciated.
Posted on 2017-07-04 20:56:14
I got it working using the reference in this link, and I also got my IP blocked by stackoverflow when I forgot to set the DEPTH_LIMIT parameter; some things are learned the hard way. The root problem was that the original spider overrides parse(), which CrawlSpider itself uses to apply its rules, so the rules were never followed; the version below uses parse_items as the callback instead and sets the rules in __init__.
import scrapy
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import Rule, CrawlSpider
from scrapy.linkextractors import LinkExtractor
from scrapy.conf import settings
from requests.auth import HTTPBasicAuth
import urllib2, requests, os, sys
from urlparse import urlparse
from brs.items import BrsItem

class BRS(CrawlSpider):
    name = "brs"

    def __init__(self):
        global start_urls, rules
        settings.overrides['DEPTH_LIMIT'] = 10
        path = os.path.dirname(os.path.abspath(__file__))
        with open(os.path.join(path, "urls.txt"), "r+") as f:
            self.start_urls = filter(None, [url.strip() for url in f.readlines()])
        start_urls = self.start_urls
        # Build the rules at runtime and store them on _rules, the attribute
        # CrawlSpider actually consults when deciding which links to follow.
        self.rules = (Rule(SgmlLinkExtractor(allow=()), callback=self.parse_items, follow=True),)
        rules = self.rules
        self._rules = rules

    def extract_domain(self, url):
        # Keep only the host part of the URL, e.g. "stackoverflow.com".
        return urlparse(url).netloc

    def parse_items(self, response):
        internal = LinkExtractor(allow_domains=[self.extract_domain(response.url)])
        external = LinkExtractor(deny_domains=[self.extract_domain(response.url)])
        links = internal.extract_links(response)
        internal = []
        fd = open('output.txt', 'a+')
        for link in links:
            internal.append(link.url)
        for link in internal:
            fd.write(link + "\tinternal\n")
        links = external.extract_links(response)
        external = []
        for link in links:
            external.append(link.url)
        for link in external:
            fd.write(link + "\texternal\n")
        # Keep crawling the internal links explicitly as well.
        for link in internal:
            yield scrapy.Request(link.strip(), callback=self.parse_attr)

    def parse_attr(self, response):
        brsitem = BrsItem()
        brsitem['url'] = response.url.strip()
        return brsitem

https://stackoverflow.com/questions/44905472
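The imports used above (scrapy.contrib.*, scrapy.conf.settings.overrides, urllib2/urlparse) only exist in old Scrapy releases running on Python 2. As a rough, untested sketch of the same idea on a current Scrapy/Python 3 install: the rules can stay as a class attribute, per-spider settings go in custom_settings, and the default parse() is simply left alone. The DOWNLOAD_DELAY value and the dict item yielded at the end are my own additions, not part of the original answer.

import os
from urllib.parse import urlparse

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class BRS(CrawlSpider):
    name = "brs"
    # custom_settings replaces the old settings.overrides mechanism.
    custom_settings = {
        "DEPTH_LIMIT": 10,      # stop following links beyond this depth
        "DOWNLOAD_DELAY": 1.0,  # assumed value; slows the crawl to help avoid bans
    }
    # One catch-all rule: extract every link and keep following them.
    rules = (Rule(LinkExtractor(), callback="parse_items", follow=True),)

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        path = os.path.dirname(os.path.abspath(__file__))
        with open(os.path.join(path, "urls.txt")) as f:
            self.start_urls = [line.strip() for line in f if line.strip()]

    def parse_items(self, response):
        domain = urlparse(response.url).netloc
        internal = LinkExtractor(allow_domains=[domain]).extract_links(response)
        external = LinkExtractor(deny_domains=[domain]).extract_links(response)
        with open("output.txt", "a") as fd:
            for link in internal:
                fd.write(link.url + "\tinternal\n")
            for link in external:
                fd.write(link.url + "\texternal\n")
        yield {"url": response.url}

Running it is the usual scrapy crawl brs from the project directory; using the FEEDS setting or an item pipeline instead of appending to output.txt by hand would be more idiomatic, but the sketch keeps the original output format.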