我已经创建了一个使用scrapy.The爬虫爬虫的网站和抓取链接。**所使用的技术:**Python,Scrapy Error抓取相对urls,因为刮刀器无法抓取网页。我要爬虫只取无源网址。救命啊!!
import scrapy
import os
class MySpider(scrapy.Spider):
name = 'feed_exporter_test'
# this is equivalent to what you would set in settings.py file
custom_settings = {
'FEED_FORMAT': 'csv',
'FEED_URI': 'file1.csv'
}
filePath='file1.csv'
if os.path.exists(filePath):
os.remove(filePath)
else:
print("Can not delete the file as it doesn't exists")
start_urls = ['https://www.jamoona.com/']
def parse(self, response):
titles = response.xpath("//a/@href").extract()
for title in titles:
yield {'title': title}
发布于 2021-06-29 15:25:37
这是答案。
import scrapy
import os
class MySpider(scrapy.Spider):
name = 'feed_exporter_test'
# this is equivalent to what you would set in settings.py file
custom_settings = {
'FEED_FORMAT': 'csv',
'FEED_URI': 'file1.csv'
}
filePath = 'file1.csv'
if os.path.exists(filePath):
os.remove(filePath)
else:
print("Can not delete the file as it doesn't exists")
start_urls = ['https://www.jamoona.com/']
def parse(self, response):
urls = response.xpath("//a/@href").extract()
for url in urls:
abs_url = response.urljoin(url)
yield {'title': abs_url}https://stackoverflow.com/questions/68178958
复制相似问题