When scanning a large number of websites with the function below, I got an error (see below). Should I add an except step to the function to handle errors like this, or is there a problem with the try / except section of my function?
Function:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import io
import requests.exceptions
import time
import asyncio
from concurrent.futures import ProcessPoolExecutor, as_completed

df = pd.read_csv('myScan.csv')
urls = df.T.values.tolist()[2]  # URLs are in the third column of the CSV

results = {}  # url -> matched text snippets
status = {}   # url -> "OK" or an error label

async def scrape(url):
    try:
        r = requests.get(url, timeout=(3, 6))
        r.raise_for_status()
        soup = BeautifulSoup(r.content, 'html.parser')
        if soup.body:
            data = {
                "coming soon": soup.body.findAll(text=re.compile("coming soon", re.I)),
                "Opening Soon": soup.body.findAll(text=re.compile("Opening Soon", re.I)),
                "Under Construction": soup.body.findAll(text=re.compile("Under Construction", re.I)),
                "Currently Unavailable": soup.body.findAll(text=re.compile("Currently Unavailable", re.I)),
                "button_2": soup.findAll(text=re.compile('button_2.js'))}
            results[url] = data
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout, requests.exceptions.MissingSchema):
        status[url] = "Connection Error"
    except (requests.exceptions.HTTPError):
        status[url] = "Http Error"
    except (requests.exceptions.TooManyRedirects):
        status[url] = "Redirects"
    except (requests.exceptions.RequestException) as err:
        status[url] = "Fatal Error: " + err + url  # NOTE: err is an exception object; this line raises the TypeError below
    else:
        status[url] = "OK"

async def main():
    await asyncio.wait([scrape(url) for url in urls])

loop = asyncio.get_event_loop()
loop.run_until_complete(main())
loop.close()

# Build one column per search phrase: "x" if found, "-" otherwise
comingList = []
openingList = []
underList = []
button_2 = []
statusList = []
for url in urls:
    if not results.get(url):
        statusList.append(status.get(url))
        comingList.append("-")
        openingList.append("-")
        underList.append("-")
        button_2.append("-")
    else:
        statusList.append(status.get(url))
        comingList.append("x" if len(results[url].get("coming soon")) > 0 else "-")
        openingList.append("x" if len(results[url].get("Opening Soon")) > 0 else "-")
        underList.append("x" if len(results[url].get("Under Construction")) > 0 else "-")
        button_2.append("x" if len(results[url].get("button_2")) > 0 else "-")

df["comingSoon"] = pd.DataFrame(comingList, columns=['comingSoon'])
df["openingSoon"] = pd.DataFrame(openingList, columns=['openingSoon'])
df["underConstruction"] = pd.DataFrame(underList, columns=['underConstruction'])
df["button_2"] = pd.DataFrame(button_2, columns=['button_2'])
df['status'] = pd.DataFrame(statusList, columns=['Status'])
df.to_csv('myScanCompleted.csv', index=False)
Error:
Task exception was never retrieved
future: <Task finished name='Task-43943' coro=<scrape() done, defined at crawler.py:69> exception=TypeError('can only concatenate str (not "ChunkedEncodingError") to str')>
Traceback (most recent call last):
File "/usr/local/lib/python3.9/site-packages/urllib3/response.py", line 697, in _update_chunk_length
self.chunk_left = int(line, 16)
ValueError: invalid literal for int() with base 16: b''
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.9/site-packages/urllib3/response.py", line 438, in _error_catcher
yield
File "/usr/local/lib/python3.9/site-packages/urllib3/response.py", line 764, in read_chunked
self._update_chunk_length()
File "/usr/local/lib/python3.9/site-packages/urllib3/response.py", line 701, in _update_chunk_length
raise InvalidChunkLength(self, line)
urllib3.exceptions.InvalidChunkLength: InvalidChunkLength(got length b'', 0 bytes read)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.9/site-packages/requests/models.py", line 753, in generate
for chunk in self.raw.stream(chunk_size, decode_content=True):
File "/usr/local/lib/python3.9/site-packages/urllib3/response.py", line 572, in stream
for line in self.read_chunked(amt, decode_content=decode_content):
File "/usr/local/lib/python3.9/site-packages/urllib3/response.py", line 793, in read_chunked
self._original_response.close()
File "/usr/local/Cellar/python@3.9/3.9.0_5/Frameworks/Python.framework/Versions/3.9/lib/python3.9/contextlib.py", line 135, in __exit__
self.gen.throw(type, value, traceback)
File "/usr/local/lib/python3.9/site-packages/urllib3/response.py", line 455, in _error_catcher
raise ProtocolError("Connection broken: %r" % e, e)
urllib3.exceptions.ProtocolError: ("Connection broken: InvalidChunkLength(got length b'', 0 bytes read)", InvalidChunkLength(got length b'', 0 bytes read))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "crawler.py", line 71, in scrape
r = requests.get(url, timeout=(3, 6))
File "/usr/local/lib/python3.9/site-packages/requests/api.py", line 76, in get
return request('get', url, params=params, **kwargs)
File "/usr/local/lib/python3.9/site-packages/requests/api.py", line 61, in request
return session.request(method=method, url=url, **kwargs)
File "/usr/local/lib/python3.9/site-packages/requests/sessions.py", line 542, in request
resp = self.send(prep, **send_kwargs)
File "/usr/local/lib/python3.9/site-packages/requests/sessions.py", line 697, in send
r.content
File "/usr/local/lib/python3.9/site-packages/requests/models.py", line 831, in content
self._content = b''.join(self.iter_content(CONTENT_CHUNK_SIZE)) or b''
File "/usr/local/lib/python3.9/site-packages/requests/models.py", line 756, in generate
raise ChunkedEncodingError(e)
requests.exceptions.ChunkedEncodingError: ("Connection broken: InvalidChunkLength(got length b'', 0 bytes read)", InvalidChunkLength(got length b'', 0 bytes read))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "crawler.py", line 89, in scrape
status[url] = "Fatal Error: " + err + url
TypeError: can only concatenate str (not "ChunkedEncodingError") to str
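Note what the last frame of the traceback says: the ChunkedEncodingError was in fact caught by the final except clause (requests.exceptions.ChunkedEncodingError is a RequestException subclass), but building the status string then failed, because an exception object cannot be concatenated to a str. A minimal sketch of that fix, wrapped in a hypothetical fetch_status helper so it is self-contained:

import requests

def fetch_status(url):
    """Hypothetical helper: return a status string for one URL."""
    try:
        r = requests.get(url, timeout=(3, 6))
        r.raise_for_status()
        return "OK"
    except requests.exceptions.RequestException as err:
        # str(err) is the key change: "str" + exception-object is exactly
        # what raises the TypeError shown in the traceback above
        return "Fatal Error: " + str(err) + " " + url

print(fetch_status("https://example.com"))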
Posted on 2022-10-11 14:12:04
I ran into the same error. I can't say why, but switching from the html.parser parser to lxml fixed it for me.
Possibly useful: differences between the parsers
https://stackoverflow.com/questions/65263445
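For reference, the switch this answer describes is a one-line change to the BeautifulSoup call. This sketch assumes the lxml package is installed (pip install lxml) and uses example.com as a placeholder URL:

import requests
from bs4 import BeautifulSoup

r = requests.get("https://example.com", timeout=(3, 6))
# was: soup = BeautifulSoup(r.content, 'html.parser')
soup = BeautifulSoup(r.content, 'lxml')  # external lxml parser instead of the stdlib one
print(soup.title)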