I have a broad-crawl CSV file of about 20,000 rows. The file has Name, start_url and allowed_domain columns, and looks like this:

Name    start_url             allowed_domain
place1  https://place1.co.uk  place1.co.uk
place2  https://place2.co.uk  place2.co.uk
place3  https://place3.co.uk  place3.co.uk

Here is an example of my crawler code:
import pandas as pd
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class FinalSpider(CrawlSpider):
    name = "final"

    df = pd.read_csv("places.csv")
    start_urls = df["start_url"].values.tolist()

    custom_settings = {
        'DOWNLOAD_DELAY': 3,
        'DOWNLOADER_MIDDLEWARES': {
            'my_spider.middlewares.MySpiderDownloaderMiddleware': 543,
        },
        'SCHEDULER_PRIORITY_QUEUE': 'scrapy.pqueues.DownloaderAwarePriorityQueue',
        'CONCURRENT_REQUESTS': 100,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'RETRY_ENABLED': False,
        'AJAXCRAWL_ENABLED': True,
    }

    # One global whitelist: every response may link out to any of the
    # 20,000 domains, not only its own.
    rules = (
        Rule(LinkExtractor(allow_domains=df["allowed_domain"].values.tolist()),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # do stuff
        pass

The problem is that my spider is allowed to follow links into any of the allowed domains, not only the domain associated with the start_url the link was found on.
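To see why, here is a minimal sketch (using the placeholder domains from the CSV sample above, and assuming Scrapy is installed): allow_domains is a single whitelist shared by every start_url, so a link from one whitelisted site to a different whitelisted site still passes the extractor.

from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor

# One extractor shared by every start_url, as in the spider above
le = LinkExtractor(allow_domains=["place1.co.uk", "place2.co.uk"])

# A page on place1.co.uk that links out to place2.co.uk
body = b'<html><body><a href="https://place2.co.uk/page">x</a></body></html>'
response = HtmlResponse(url="https://place1.co.uk/", body=body, encoding="utf-8")

print([link.url for link in le.extract_links(response)])
# ['https://place2.co.uk/page'] -- the cross-site link is still followed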
Answered 2020-04-27 19:28:15
You can't assign a separate allowed_domain to each link in start_urls.
You have to filter the URLs in the process_request method of a DownloaderMiddleware instead.
Here is your spider code:
import pandas as pd
from scrapy import Request
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class FinalSpider(CrawlSpider):
    name = "final"

    custom_settings = {
        'DOWNLOAD_DELAY': 3,
        'DOWNLOADER_MIDDLEWARES': {
            'my_spider.middlewares.MySpiderDownloaderMiddleware': 543,
        },
        'SCHEDULER_PRIORITY_QUEUE': 'scrapy.pqueues.DownloaderAwarePriorityQueue',
        'CONCURRENT_REQUESTS': 100,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'RETRY_ENABLED': False,
        'AJAXCRAWL_ENABLED': True,
    }

    # Follow every link; the middleware filters offsite requests. The Rule's
    # process_request hook copies allowed_domain onto each follow-up request,
    # since requests built by a Rule do not inherit the response's meta.
    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True,
             process_request='propagate_allowed_domain'),
    )

    def propagate_allowed_domain(self, request, response):  # Scrapy >= 2.0
        request.meta['allowed_domain'] = response.meta.get('allowed_domain')
        return request

    def start_requests(self):
        df = pd.read_csv("places.csv")
        for _, row in df.iterrows():
            # No callback override here, so CrawlSpider's default parse()
            # still applies the rules to the start pages.
            yield Request(url=row['start_url'],
                          meta={'allowed_domain': row['allowed_domain']})

    def parse_item(self, response):
        # do stuff
        pass

And here is the middleware code:
import tldextract
from scrapy.exceptions import IgnoreRequest


class MySpiderDownloaderMiddleware(object):

    def process_request(self, request, spider):
        # Reduce the request URL to its registered domain, e.g. place1.co.uk.
        ext = tldextract.extract(request.url)
        site = "{}.{}".format(ext.domain, ext.suffix)
        # Drop the request if it leaves the domain assigned to its start_url.
        if request.meta.get('allowed_domain') != site:
            raise IgnoreRequest("Filtered offsite request to %s" % site)
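As a quick illustration of what the middleware compares (assuming tldextract is installed; it may fetch the public suffix list on first use), tldextract reduces any URL, including subdomains, to its registered domain:

import tldextract

for url in ("https://place1.co.uk/jobs", "https://sub.place2.co.uk/a"):
    ext = tldextract.extract(url)
    print("{}.{}".format(ext.domain, ext.suffix))
# place1.co.uk
# place2.co.uk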
https://stackoverflow.com/questions/61455543