I'm getting HTTP 429 errors even though my script visits fewer than 50 pages. How can I adjust the script to avoid them? The User-Agent matches my Chrome browser. I tried putting time.sleep(10) after each request, but that didn't help. I'm running the script in a Jupyter notebook on Windows 10.
import re
import time

import bs4
import requests
from textblob import TextBlob


def get_wiki_list_italian_movies(year):
    # Scrape the Wikipedia list page for Italian films of the given year.
    p = r'https://en.wikipedia.org/wiki/List_of_Italian_films_of_' + str(year)
    list_wiki, list_wiki_links = [], []
    header = {'User-agent':
              'mozilla/5.0 (windows nt 10.0; win64; x64) applewebkit/537.36 (khtml, like gecko) chrome/86.0.4240.111 safari/537.36'}
    request = requests.get(p, headers=header)
    time.sleep(10)
    soup = bs4.BeautifulSoup(request.text, 'lxml')
    table = soup.find_all('table', class_='wikitable')
    for el in table:
        td = el.find_all('td')
        for t in td:
            i = t.find('i')
            if i:
                for a in i.find_all('a', href=True):
                    result = re.sub(r'[()]', '', a['href'])
                    if '/w/index' in result:
                        # Red link (no article yet): take the title text directly.
                        list_wiki.append(re.sub(r'\(.*', '', a['title']).strip() + ' ' + str(year))
                    else:
                        list_wiki_links.append('https://en.wikipedia.org' + result)
    # Visit each film article and collect titles detected as Italian.
    for link in list_wiki_links:
        request = requests.get(link, headers=header)
        time.sleep(10)
        soup = bs4.BeautifulSoup(request.text, 'lxml')
        i_list = soup.find_all('i')
        for i in i_list:
            b = i.find('b')
            if b:
                t = b.text
                if len(t) > 4 and TextBlob(t).detect_language() == 'it':
                    list_wiki.append(t.strip() + ' ' + str(year))
    return sorted(list(set(list_wiki)))


def movies_wiki_list(years_span):
    ll = []
    for year in years_span:
        ll += get_wiki_list_italian_movies(year)
        time.sleep(10)
    return ll


italian_movies_1932_1933 = movies_wiki_list(range(1932, 1934))
italian_movies_1932_1933
Here is the error:
HTTPError Traceback (most recent call last)
<ipython-input-12-6a4bc670faa6> in <module>
53 return ll
54
---> 55 italian_movies_1932_1933 = movies_wiki_list(range(1932, 1934))
56 italian_movies_1932_1933
<ipython-input-12-6a4bc670faa6> in movies_wiki_list(years_span)
49 ll = []
50 for year in years_span:
---> 51 ll += get_wiki_list_italian_movies(year)
52 time.sleep(10)
53 return ll
<ipython-input-12-6a4bc670faa6> in get_wiki_list_italian_movies(year)
41 if b:
42 t= b.text
---> 43 if len(t) > 4 and TextBlob(t).detect_language() == 'it':
44 list_wiki.append(t.strip() + ' ' + str(year))
45
~\anaconda3\lib\site-packages\textblob\blob.py in detect_language(self)
566 :rtype: str
567 """
--> 568 return self.translator.detect(self.raw)
569
570 def correct(self):
~\anaconda3\lib\site-packages\textblob\translate.py in detect(self, source, host, type_)
70 data = {"q": source}
71 url = u'{url}&sl=auto&tk={tk}'.format(url=self.url, tk=_calculate_tk(source))
---> 72 response = self._request(url, host=host, type_=type_, data=data)
73 result, language = json.loads(response)
74 return language
~\anaconda3\lib\site-packages\textblob\translate.py in _request(self, url, host, type_, data)
90 if host or type_:
91 req.set_proxy(host=host, type=type_)
---> 92 resp = request.urlopen(req)
93 content = resp.read()
94 return content.decode('utf-8')
~\anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
220 else:
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
224 def install_opener(opener):
~\anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
529 for processor in self.process_response.get(protocol, []):
530 meth = getattr(processor, meth_name)
--> 531 response = meth(req, response)
532
533 return response
~\anaconda3\lib\urllib\request.py in http_response(self, request, response)
638 # request was successfully received, understood, and accepted.
639 if not (200 <= code < 300):
--> 640 response = self.parent.error(
641 'http', request, response, code, msg, hdrs)
642
~\anaconda3\lib\urllib\request.py in error(self, proto, *args)
561 http_err = 0
562 args = (dict, proto, meth_name) + args
--> 563 result = self._call_chain(*args)
564 if result:
565 return result
~\anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
500 for handler in handlers:
501 func = getattr(handler, meth_name)
--> 502 result = func(*args)
503 if result is not None:
504 return result
~\anaconda3\lib\urllib\request.py in http_error_302(self, req, fp, code, msg, headers)
753 fp.close()
754
--> 755 return self.parent.open(new, timeout=req.timeout)
756
757 http_error_301 = http_error_303 = http_error_307 = http_error_302
~\anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
529 for processor in self.process_response.get(protocol, []):
530 meth = getattr(processor, meth_name)
--> 531 response = meth(req, response)
532
533 return response
~\anaconda3\lib\urllib\request.py in http_response(self, request, response)
638 # request was successfully received, understood, and accepted.
639 if not (200 <= code < 300):
--> 640 response = self.parent.error(
641 'http', request, response, code, msg, hdrs)
642
~\anaconda3\lib\urllib\request.py in error(self, proto, *args)
567 if http_err:
568 args = (dict, 'default', 'http_error_default') + orig_args
--> 569 return self._call_chain(*args)
570
571 # XXX probably also want an abstract factory that knows when it makes
~\anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
500 for handler in handlers:
501 func = getattr(handler, meth_name)
--> 502 result = func(*args)
503 if result is not None:
504 return result
~\anaconda3\lib\urllib\request.py in http_error_default(self, req, fp, code, msg, hdrs)
647 class HTTPDefaultErrorHandler(BaseHandler):
648 def http_error_default(self, req, fp, code, msg, hdrs):
--> 649 raise HTTPError(req.full_url, code, msg, hdrs, fp)
650
651 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 429: Too Many Requests
Posted on 2020-10-29 04:07:08
According to Wikipedia's API guidelines, API:Etiquette and API:FAQ:
There is no hard and fast limit on read requests, but we ask that you be considerate and try not to take a site down. Most system administrators reserve the right to unceremoniously block you if you do endanger the stability of their site.
If you make your requests in series rather than in parallel (i.e. you wait for one request to finish before sending a new one, so that you are never making more than one request at the same time), then you should definitely be fine. Also try to combine things into one request where you can (e.g. use multiple titles in a titles parameter instead of making a new request for each title).
This suggests you can fetch up to 50 pages per API request.
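A minimal sketch of that batching idea, assuming it is acceptable for your use case to query the MediaWiki action API (api.php) directly with a pipe-separated titles parameter; the endpoint and parameters below are standard MediaWiki API usage, while the film titles and the User-Agent contact are just placeholders:

import requests

API = 'https://en.wikipedia.org/w/api.php'
HEADERS = {'User-Agent': 'italian-films-scraper/0.1 (contact: you@example.com)'}  # hypothetical contact

def fetch_pages_batch(titles):
    """Fetch information for up to 50 pages in a single API request."""
    params = {
        'action': 'query',
        'format': 'json',
        'prop': 'info',
        'titles': '|'.join(titles[:50]),  # the API accepts at most 50 titles per request
    }
    resp = requests.get(API, params=params, headers=HEADERS)
    resp.raise_for_status()
    return resp.json().get('query', {}).get('pages', {})

# One request instead of one request per film article:
pages = fetch_pages_batch(['Example film title 1', 'Example film title 2', 'Example film title 3'])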
If you need offline content, you can also use the Data Dumps (they may be slightly out of date, which I don't think is a problem in your case).
If you do hit any limit, you can use these status messages to handle errors and warnings in your API calls.
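As a rough sketch of that, the action API reports problems in 'error' and 'warnings' keys of the JSON payload; what you do with them (log, back off, retry) is your own policy, so the wait time below is only an example:

import time

def check_api_response(payload, wait_seconds=60):
    """Inspect a MediaWiki API JSON payload for error/warning status messages."""
    if 'error' in payload:
        # Rate-limit style errors usually just mean "slow down and retry later".
        print('API error:', payload['error'].get('code'), '-', payload['error'].get('info'))
        time.sleep(wait_seconds)
        return False
    if 'warnings' in payload:
        print('API warnings:', payload['warnings'])
    return True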
Posted on 2020-10-29 18:17:51
The HTTP 429 error is misleading. It does not come from the Wikipedia servers but from Google: Google puts a quota on the daily translation requests that textblob makes, whereas requests through googletrans are free. So I had to install googletrans and use it instead.
!pip install googletrans
from googletrans import Translator
translator = Translator()
I had to change
TextBlob(t).detect_language() == 'it'
to
translator.detect(t).lang == 'it'
Now I no longer get 429 errors, even with a much larger search spanning 10 years instead of 2.
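For context, a minimal sketch of how that change slots into the inner loop of get_wiki_list_italian_movies, assuming googletrans exposes Translator.detect() returning an object with a .lang attribute (as stated above); the helper name is just for illustration:

from googletrans import Translator

translator = Translator()

def looks_italian(title):
    """Return True if googletrans detects the text as Italian."""
    try:
        return translator.detect(title).lang == 'it'
    except Exception:
        # Detection can still fail on very short or unusual strings; skip those titles.
        return False

# Replaces: if len(t) > 4 and TextBlob(t).detect_language() == 'it':
# with:     if len(t) > 4 and looks_italian(t):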
发布于 2021-06-09 10:16:42
You can use:
import time
import urllib.error
import urllib.request

try:
    urlpage = urllib.request.urlopen(url)  # url is the page you want to fetch
except urllib.error.HTTPError:
    # Wait a moment, then retry the same request once.
    time.sleep(1)
    urlpage = urllib.request.urlopen(url)
This works well for me: after getting any error, just wait for a while and then retry the same request.
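If a single retry is not enough, a slightly more robust variant of the same idea is a small retry loop with exponential backoff; this is a sketch using requests that also honours a numeric Retry-After header when the server sends one, and max_retries is just an example value:

import time
import requests

def get_with_backoff(url, headers=None, max_retries=5):
    """GET a URL, backing off exponentially on 429 and 5xx responses."""
    delay = 1
    resp = None
    for attempt in range(max_retries):
        resp = requests.get(url, headers=headers)
        if resp.status_code == 429 or resp.status_code >= 500:
            # Prefer the server's own hint if it sends a numeric Retry-After header.
            retry_after = resp.headers.get('Retry-After')
            wait = int(retry_after) if retry_after and retry_after.isdigit() else delay
            time.sleep(wait)
            delay *= 2
            continue
        resp.raise_for_status()
        return resp
    resp.raise_for_status()  # all retries exhausted; raise the last error
    return resp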
https://stackoverflow.com/questions/64578913