# Packages searched for spider classes, and the package new spiders are
# generated into.
SPIDER_MODULES = ['Amazon.spiders']
NEWSPIDER_MODULE = 'Amazon.spiders'

# Project-wide User-Agent string.
# NOTE(review): the URL looks like the project-template placeholder -- confirm
# whether it should point at a real contact page.
USER_AGENT = 'Amazon (+http://www.yourdomain.com)'

# robots.txt rules are not honoured by this project.
ROBOTSTXT_OBEY = False

# Cookie handling is switched on.
COOKIES_ENABLED = True

# Telnet console: disabled. Host and port are still declared so the console
# only needs the flag flipped to come back on the loopback interface.
TELNETCONSOLE_ENABLED = False
TELNETCONSOLE_HOST = '127.0.0.1'
TELNETCONSOLE_PORT = [6023]
# Headers attached to requests by default: accepted content types and the
# preferred response language.
DEFAULT_REQUEST_HEADERS = dict([
    ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"),
    ("Accept-Language", "en"),
])
# Intended to be set inside a spider class (as its ``custom_settings``
# attribute) rather than used from this module: it overrides the default
# request headers and adds a desktop-Chrome User-Agent.
custom_settings = dict(
    DEFAULT_REQUEST_HEADERS={
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36",
    },
)
# --- Concurrency -----------------------------------------------------------
# Overall number of concurrent requests (default: 16).
CONCURRENT_REQUESTS = 32

# Concurrent requests allowed per domain (default: 8).
CONCURRENT_REQUESTS_PER_DOMAIN = 16

# Concurrent requests allowed per IP address.
# Per the Scrapy settings documentation, a non-zero value here takes
# precedence over CONCURRENT_REQUESTS_PER_DOMAIN.
CONCURRENT_REQUESTS_PER_IP = 16

# --- Download delay --------------------------------------------------------
# Default is 0, meaning no limit. This is a fixed value: the number of
# seconds to wait between requests to the same website. While AutoThrottle is
# enabled it also serves as the minimum delay.
DOWNLOAD_DELAY = 3

# --- AutoThrottle ----------------------------------------------------------
# Set to True to enable; the default is False.
AUTOTHROTTLE_ENABLED = True

# Initial delay, in seconds.
AUTOTHROTTLE_START_DELAY = 5

# Maximum delay, in seconds. (The minimum delay is DOWNLOAD_DELAY.)
AUTOTHROTTLE_MAX_DELAY = 10

# Average number of requests to keep in flight in parallel. Must not be
# higher than CONCURRENT_REQUESTS_PER_DOMAIN or CONCURRENT_REQUESTS_PER_IP.
AUTOTHROTTLE_TARGET_CONCURRENCY = 16.0

# Throttling debug output is switched on.
AUTOTHROTTLE_DEBUG = True