import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
url = "https://papers.gceguide.com/A%20Levels/Physics%20(9702)/2015/"
folder_location = r'C:\Users\'
if not os.path.exists(folder_location):
    os.mkdir(folder_location)
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
for link in soup.select("a[href$='.pdf']"):
    filename = os.path.join(folder_location, link['href'].split('/')[-1])
    with open(filename, 'wb') as f:
        f.write(requests.get(urljoin(url, link['href'])).content)

How can I filter out the unnecessary files so that it downloads only the PDFs whose names contain 'qp_2'?
Posted on 2022-04-03 19:46:24
To download every PDF whose filename contains qp_2, you can use the following example:
import requests
from bs4 import BeautifulSoup
url = "https://papers.gceguide.com/A%20Levels/Physics%20(9702)/2015/"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
for n in soup.select('a.name[href*="qp_2"]'):
print("Downloading", n.text)
with open(n.text, "wb") as f_out:
r = requests.get(url + n.text)
f_out.write(r.content)打印和下载文件:
Downloading 9702_s15_qp_21.pdf
Downloading 9702_s15_qp_22.pdf
Downloading 9702_s15_qp_23.pdf
Downloading 9702_w15_qp_21.pdf
Downloading 9702_w15_qp_22.pdf
Downloading 9702_w15_qp_23.pdf

Posted on 2022-04-03 19:53:33
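Note that the loop above saves each file into the current working directory. Below is a minimal sketch combining the same qp_2 filter with the folder handling from the question; the "downloads" folder name is only a placeholder and should point to a real, writable directory.

import os
import requests
from bs4 import BeautifulSoup

url = "https://papers.gceguide.com/A%20Levels/Physics%20(9702)/2015/"
folder_location = "downloads"  # placeholder folder, adjust as needed
os.makedirs(folder_location, exist_ok=True)

soup = BeautifulSoup(requests.get(url).content, "html.parser")

# keep only anchors whose href contains 'qp_2' and ends with '.pdf'
for n in soup.select('a.name[href*="qp_2"][href$=".pdf"]'):
    print("Downloading", n.text)
    # build the target path inside folder_location instead of the cwd
    with open(os.path.join(folder_location, n.text), "wb") as f_out:
        f_out.write(requests.get(url + n.text).content)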
Select the links more specifically by checking for both qp_2 and .pdf in the CSS selector:
soup.select("a[href*='qp_2'][href$='.pdf']")Alternativ是在迭代时重复检查:
for a in soup.select("a[href*='qp_2']"):
    if a['href'].endswith('.pdf'):
        with open(a['href'], "wb") as f_out:
            r = requests.get(url + a['href'])
            f_out.write(r.content)

Example
import requests
from bs4 import BeautifulSoup
url = "https://papers.gceguide.com/A%20Levels/Physics%20(9702)/2015/"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
for a in soup.select("a[href*='qp_2'][href$='.pdf']"):
    with open(a['href'], "wb") as f_out:
        r = requests.get(url + a['href'])
        f_out.write(r.content)

Source: https://stackoverflow.com/questions/71729216
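A small robustness note: both answers build the download URL with plain string concatenation, which works here because url ends with a slash and each href is a bare filename. If the hrefs were relative paths or the base URL lacked the trailing slash, urllib.parse.urljoin, which the question already imports, would resolve them correctly. A minimal illustration:

from urllib.parse import urljoin

base = "https://papers.gceguide.com/A%20Levels/Physics%20(9702)/2015/"
# urljoin resolves a bare filename against the listing URL; with the trailing
# slash present this gives the same result as simple string concatenation
print(urljoin(base, "9702_s15_qp_21.pdf"))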