我有这个完成的python web报废脚本。不幸的是,在执行了几个小时的其他时间之后,我得到了以下错误。我真的不知道是什么导致了错误,而且是如此令人沮丧,因为每次我想尝试修复时,都要花一天的时间才能发现它不起作用。
该脚本是通过100个网络共享代理随机提出请求。
如果你们谁能告诉我怎么解决这个问题,我会非常感激的。
ERROR 1 ->
Traceback (most recent call last):
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 699, in urlopen
httplib_response = self._make_request(
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 445, in _make_request
six.raise_from(e, None)
File "<string>", line 3, in raise_from
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 440, in _make_request
httplib_response = conn.getresponse()
File "/usr/lib/python3.9/http/client.py", line 1347, in getresponse
response.begin()
File "/usr/lib/python3.9/http/client.py", line 307, in begin
version, status, reason = self._read_status()
File "/usr/lib/python3.9/http/client.py", line 276, in _read_status
raise RemoteDisconnected("Remote end closed connection without"
http.client.RemoteDisconnected: Remote end closed connection without response
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/lib/python3/dist-packages/requests/adapters.py", line 439, in send
resp = conn.urlopen(
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 755, in urlopen
retries = retries.increment(
File "/usr/lib/python3/dist-packages/urllib3/util/retry.py", line 574, in increment
raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='www.website.com', port=443): Max retries exceeded with url: /api (Caused by ProxyError('Cannot connect to proxy.', RemoteDisconnected('Remote end closed connection without response')))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Scrapper/./run_15days.py", line 112, in <module>
response = reqs.get(url,proxies={
File "/usr/lib/python3/dist-packages/requests/sessions.py", line 555, in get
return self.request('GET', url, **kwargs)
File "/usr/lib/python3/dist-packages/requests/sessions.py", line 542, in request
resp = self.send(prep, **send_kwargs)
File "/usr/lib/python3/dist-packages/requests/sessions.py", line 655, in send
r = adapter.send(request, **kwargs)
File "/usr/lib/python3/dist-packages/requests/adapters.py", line 510, in send
raise ProxyError(e, request=request)
requests.exceptions.ProxyError: HTTPSConnectionPool(host='www.website.com', port=443): Max retries exceeded with url: /api (Caused by ProxyError('Cannot connect to proxy.', RemoteDisconnected('Remote end closed connection without response')))
ERROR 2 ->
Traceback (most recent call last):
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 696, in urlopen
self._prepare_proxy(conn)
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 966, in _prepare_proxy
conn.connect()
File "/usr/lib/python3/dist-packages/urllib3/connection.py", line 411, in connect
self.sock = ssl_wrap_socket(
File "/usr/lib/python3/dist-packages/urllib3/util/ssl_.py", line 449, in ssl_wrap_socket
ssl_sock = _ssl_wrap_socket_impl(
File "/usr/lib/python3/dist-packages/urllib3/util/ssl_.py", line 493, in _ssl_wrap_socket_impl
return ssl_context.wrap_socket(sock, server_hostname=server_hostname)
File "/usr/lib/python3.9/ssl.py", line 500, in wrap_socket
return self.sslsocket_class._create(
File "/usr/lib/python3.9/ssl.py", line 1040, in _create
self.do_handshake()
File "/usr/lib/python3.9/ssl.py", line 1309, in do_handshake
self._sslobj.do_handshake()
ssl.SSLEOFError: EOF occurred in violation of protocol (_ssl.c:1123)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/lib/python3/dist-packages/requests/adapters.py", line 439, in send
resp = conn.urlopen(
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 755, in urlopen
retries = retries.increment(
File "/usr/lib/python3/dist-packages/urllib3/util/retry.py", line 574, in increment
raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='www.website.com', port=443): Max retries exceeded with url: /api (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:1123)')))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Scrapper/./run_365days.py", line 111, in <module>
response = reqs.get(url,proxies={
File "/usr/lib/python3/dist-packages/requests/sessions.py", line 555, in get
return self.request('GET', url, **kwargs)
File "/usr/lib/python3/dist-packages/requests/sessions.py", line 542, in request
resp = self.send(prep, **send_kwargs)
File "/usr/lib/python3/dist-packages/requests/sessions.py", line 655, in send
r = adapter.send(request, **kwargs)
File "/usr/lib/python3/dist-packages/requests/adapters.py", line 514, in send
raise SSLError(e, request=request)
requests.exceptions.SSLError: HTTPSConnectionPool(host='www.website.com', port=443): Max retries exceeded with url: /api (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:1123)')))脚本的代码如下:
import requests
import json
import time
from datetime import date, timedelta
import mariadb
import logging
logging.basicConfig(filename="Logs/log_14days_flights.txt", level=logging.DEBUG, filemode = "w+")
conn = mariadb.connect(
user="",
password="",
host="localhost",
database="")
z = 0
header = {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36" ,
'referer':'https://www.google.com/'
}
status_code = 403
while status_code == 403 or status_code == 404:
reqs = requests.Session()
response = reqs.get("https://website.com",proxies={
"http": "http://p.webshare.io:9999",
"https": "http://p.webshare.io:9999"
}, headers=header)
z = z + 1
print('Response HTTP Status Code: ', response.status_code)
status_code = response.status_code
airprot_response = response.content
# print(response.content)
my_json = airprot_response.decode('utf8')
airport_data = json.loads(my_json)
print(type(airport_data))
# airport_data = json.dumps(airport_data, indent=4, sort_keys=True)
# print(type(airport_data))
airport_lst = airport_data.get('airports')
airport_codes_lst = []
for ai in airport_lst:
airport_code = ai.get('iataCode')
airport_codes_lst.append(airport_code)
print(len(airport_codes_lst))
# print(airport_codes_lst)
airport_number = 1
for i in airport_codes_lst:
cur = conn.cursor()
query = "create table if not exists {airport}(id INT primary key auto_increment,dep_airport VARCHAR(100),arrival_airport VARCHAR(100),flight_number VARCHAR(100),flight_key VARCHAR(100), dep_date VARCHAR(100),arrival_date VARCHAR(100),flight_duration VARCHAR(100), date_dep_ VARCHAR(100), updated_on timestamp);".format(airport=i)
cur.execute(query)
conn.commit()
print('Airport No. : ', airport_number)
logging.info("Program is working as expected")
url = "https://www.website.com".format(airport = i)
print(url)
status_code = 403
while status_code == 403 or status_code == 404:
reqs = requests.Session()
response = reqs.get(url,proxies={
"http": "http://p.webshare.io:9999",
"https": "http://p.webshare.io:9999"
}, headers=header)
z = z + 1
print('Response HTTP Status Code: ', response.status_code)
status_code = response.status_code
dest_airprot_response = response.content
my_json = dest_airprot_response.decode('utf8')
dest_airport_data = json.loads(my_json)
# print(dest_airport_data)
fares_data_lst = dest_airport_data.get('fares')
dest_airport_lst = []
for fares in fares_data_lst:
outbound_data = fares.get('outbound')
dest_airport = outbound_data.get('arrivalAirport').get('iataCode')
dest_airport_lst.append(dest_airport)
print(dest_airport_lst)
for j in dest_airport_lst:
url = "https://www.website.com".format(org = i, dst = j)
print(url)
status_code = 403
while status_code == 403 or status_code == 404:
reqs = requests.Session()
response = reqs.get(url,proxies={
"http": "http://p.webshare.io:9999",
"https": "http://p.webshare.io:9999"
}, headers=header)
z = z + 1
print('Response HTTP Status Code: ', response.status_code)
status_code = response.status_code
date_response = response.content
my_json = date_response.decode('utf8')
dates_data = json.loads(my_json)
print(type(dates_data))
days15_date = date.today() + timedelta(days=15)
days15_date = days15_date.strftime('%Y-%m-%d')
# print(type(days15_date))
for dts in dates_data:
# print(type(date))
if dts >= days15_date:
continue
url = "https://www.website.com".format(dst = j, org = i, date = dts)
print(url)
status_code = 403
while status_code == 403 or status_code == 404:
reqs = requests.Session()
response = reqs.get(url,proxies={
"http": "http://p.webshare.io:9999",
"https": "http://p.webshare.io:9999"
}, headers=header)
z = z + 1
print('Response HTTP Status Code: ', response.status_code)
status_code = response.status_code
flight_response = response.content
my_json = flight_response.decode('utf8')
flight_data = json.loads(my_json)
# print(flight_data)
trip_lst = flight_data.get('trips')
for trip in trip_lst:
dates = trip.get('dates')
for dt in dates:
flights = dt.get('flights')
# print(flights)
if len(flights)>= 1:
for fl in flights:
flight_key = str(fl.get('flightKey'))
print(flight_key)
arrival_airport = str(fl.get('segments')[0].get("destination"))
print(arrival_airport)
dep_airport = str(fl.get('segments')[0].get("origin"))
print(dep_airport)
flight_number = str(fl.get('segments')[0].get("flightNumber"))
print(flight_number)
dep_date = str(fl.get('segments')[0].get("time")[0])
print(dep_date)
arrival_date = str(fl.get('segments')[0].get("time")[1])
print(arrival_date)
flight_duration = str(fl.get('segments')[0].get("duration"))
print(flight_duration)
date_dep_ = dep_date.split('T')[0]
print(date_dep_)
try:
cur = conn.cursor()
query = f"DELETE FROM " + i
query = query + " WHERE flight_number =" + '"' +flight_number+ '"' + "and date_dep_ =" + '"' +date_dep_+ '"'
print(query)
cur.execute(query)
print(f"{cur.rowcount} details deleted")
conn.commit()
except:
print("Nothing to Delete")
cur = conn.cursor()
query = f"INSERT INTO " + i
query = query + f" (flight_key, arrival_airport,dep_airport, flight_number, dep_date, arrival_date, flight_duration, date_dep_) VALUES ('{flight_key}', '{arrival_airport}', '{dep_airport}', '{flight_number}', '{dep_date}', '{arrival_date}', '{flight_duration}', '{date_dep_}')"
# cur.execute(
# "INSERT INTO flight_data (flight_key,arrival_airport,dep_airport,flight_number,dep_date,arrival_date,flight_duration) VALUES (?, ?, ?, ?, ?, ?, ?)",
# (flight_key,arrival_airport,dep_airport,flight_number,dep_date,arrival_date,flight_duration))
cur.execute(query)
print(f"{cur.rowcount} details inserted")
logging.info(f"{cur.rowcount} details inserted")
conn.commit()
time.sleep(6)
time.sleep(2)
time.sleep(2)
logging_msg = i +" Airport flights Completed"
logging.info(logging_msg)
airport_number = airport_number + 1
print(z)非常感谢你在这方面给予我的帮助和时间。几个星期来,我一直在努力解决这个问题,实在找不到解决办法。
发布于 2022-09-10 15:15:48
这似乎是从服务器端Remote end closed connection without response关闭的连接,这会导致所有这些错误。
建议实现重试策略(有点像urllib3,参见下面的示例)、(如果情况确实不是垃圾邮件过多)或一些刮伤检测。
retries = Retry(connect=5, read=2, redirect=5)
http = PoolManager(retries=retries)
response = http.request("GET", "https://example.com/")或每个请求(这来自于urllib3文档):
response = http.request("GET", "https://example.com/", retries=Retry(10))如果这真的是因为他们知道你在刮擦,你需要更好的缓解技术,减缓刮擦,更多的IP等等。
https://stackoverflow.com/questions/73672884
复制相似问题