
以下是一个完整的 Python 爬虫实现,可爬取世俱杯比分数据。注意:实际爬取前请确认目标网站的 robots.txt 政策和相关法律法规。

假设世俱杯数据路径为 www.hr107.com/fifa-club-world-cup,比分数据通常位于页面的 <table> 或 <div> 容器中。

安装依赖:pip install requests beautifulsoup4 pandas

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
# Request configuration: browser-like headers so the target site serves
# the normal page (User-Agent spoof, Referer, Chinese locale preference).
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36',
    'Referer': 'https://www.hr107.com/',
    'Accept-Language': 'zh-CN,zh;q=0.9'
}
# Proxy settings (fill in if needed; empty dict means direct connection)
PROXIES = {
    # 'http': 'http://your_proxy:port',
    # 'https': 'https://your_proxy:port'
}
def scrape_match_data(url):
    """Fetch one page of match scores and parse it into a list of dicts.

    Parameters
    ----------
    url : str
        Fully-qualified URL of the score-listing page.

    Returns
    -------
    list[dict] | None
        One dict per match with keys ``date``, ``home_team``, ``score``,
        ``away_team``, ``tournament``; ``None`` when the HTTP request fails
        or the expected container is not found.
    """
    try:
        response = requests.get(
            url,
            headers=HEADERS,
            proxies=PROXIES,
            timeout=15,
        )
        # Force UTF-8 in case the server omits or mislabels the charset.
        response.encoding = 'utf-8'
        if response.status_code != 200:
            print(f"请求失败: HTTP {response.status_code}")
            return None
    except requests.RequestException as e:
        # Narrowed from bare `except Exception`: only network-level failures
        # (timeout, DNS, connection reset) should be swallowed here —
        # programming errors must propagate, not silently return None.
        print(f"爬取过程中出错: {str(e)}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    # Assumed container; adjust the selector to the real page structure.
    match_list = soup.find('div', class_='match-list')
    if not match_list:
        print("未找到比赛数据容器")
        return None

    matches = []
    # Assumed per-match element; selectors below are placeholders too.
    for item in match_list.find_all('div', class_='match-item'):
        try:
            matches.append({
                'date': item.find('span', class_='date').text.strip(),
                'home_team': item.find('div', class_='home-team').text.strip(),
                'score': item.find('div', class_='score').text.strip(),
                'away_team': item.find('div', class_='away-team').text.strip(),
                'tournament': item.find('span', class_='tournament').text.strip(),
            })
        except AttributeError as e:
            # A missing sub-element makes .find() return None, so .text
            # raises AttributeError — skip that entry and keep going.
            print(f"解析比赛条目失败: {str(e)}")
            continue
    return matches
def main():
    """Crawl pages 1–5 of the score listing and dump all rows to a CSV file."""
    # Example URL template — adjust to the site's real pagination scheme.
    base_url = "https://www.hr107.com/fifa-club-world-cup/scores?page={}"

    collected = []
    for page in range(1, 6):
        url = base_url.format(page)
        print(f"正在爬取第 {page} 页: {url}")
        page_rows = scrape_match_data(url)
        if page_rows:
            collected.extend(page_rows)
            print(f"已获取 {len(page_rows)} 条比赛数据")
        else:
            print(f"第 {page} 页无数据")
        # Randomized delay between requests to avoid triggering rate limits.
        time.sleep(random.uniform(1.5, 3.5))

    if not collected:
        print("未获取到有效数据")
        return

    frame = pd.DataFrame(collected)
    # utf-8-sig writes a BOM so Excel opens the Chinese text correctly.
    frame.to_csv('fifa_club_worldcup_scores.csv', index=False, encoding='utf-8-sig')
    print(f"共爬取 {len(frame)} 条数据,已保存到CSV文件")
if __name__ == "__main__":
    main()

提示:若目标元素在页面中唯一,可用 .find() 替代 .find_all() 定位该元素。

# 若数据通过AJAX加载,可直接请求API
API_URL = "https://www.hr107.com/api/matches?tournament=fifa&season=2023"
response = requests.get(API_URL, headers=HEADERS)
data = response.json()  # 直接解析JSON数据

原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。