
Scraping movie information with Python

小王不头秃
Published 2024-06-19 15:06:55

I have only just started learning web scraping. Modeled on code I wrote earlier, this script scrapes movie information from the 80s site; the page crawled is 80s (https://www.80s.tw/hot in the code below).

Libraries used

Code language: python
import re  # regular expressions
import urllib.request, urllib.error  # build the request and fetch page data
from bs4 import BeautifulSoup  # HTML parsing
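
The script pairs BeautifulSoup (to locate the relevant blocks in the page) with regular expressions (to pull individual fields out of those blocks). One behavior the scraper relies on: when a compiled pattern has several capture groups, re.findall returns a list of tuples, one tuple per match, which is why the code below indexes t[0], t[1], and i[2]. A minimal sketch with a made-up snippet of HTML:

Code language: python
import re

# same multi-group shape as the titlelink pattern used in the scraper below
titlelink = re.compile(r'<a href="(.*?)" title="(.*?)">')
sample = '<a href="/movie/123" title="Example Movie">'
for href, title in re.findall(titlelink, sample):
    print(href, title)  # -> /movie/123 Example Movie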

Scraper code

Code language: python
# third-party modules
import re  # regular expressions
import urllib.request, urllib.error  # build the request and fetch page data
from bs4 import BeautifulSoup  # HTML parsing

baseurl = 'https://www.80s.tw/hot'  # hot-movies listing page

# poster image: groups = (alt text, element id, image src)
imglink = re.compile(r'<img alt="(.*?)" class="p" id="(.*?)" src="(.*?)"/>')

# movie link: groups = (detail-page href, title)
titlelink = re.compile(r'<a href="(.*?)" title="(.*?)">')
findlink = re.compile(r'<a href="(.*?)">')  # generic link pattern (defined but never used below)


# 1. Crawl the listing page
def getData():
    # 2. Parse the data into parallel lists
    img = []       # poster image URLs
    src = []       # detail-page URLs
    title = []     # movie titles
    fens = []      # ratings
    contents = []  # plot summaries
    html = askURL(baseurl)
    bs = BeautifulSoup(html, "html.parser")
    for item in bs.find_all('div', class_="lpelmt2 me2li"):
        item = str(item)
        titlel = re.findall(titlelink, item)
        for t in titlel:
            title.append(t[1])
            print(t[1])
            tsrc = "https://www.80s.tw" + t[0]  # make the detail-page URL absolute
            fen, content = getContentAndFen(tsrc)
            # fen, content = "6", "2"  # stub values for testing without extra requests
            fens.append(fen)
            contents.append(content)
            src.append(tsrc)
            print(fen, content)
        imgl = re.findall(imglink, item)
        for i in imgl:
            img.append("https:" + i[2])  # src is protocol-relative
    return title, img, src, fens, contents


# Fetch the raw HTML of a URL
def askURL(url):
    head = {
        # pose as a regular browser; the Cookie was captured from the author's
        # own session and is probably stale, but is kept as in the original
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36",
        "Cookie": "BAIDU_SSP_lcr=https://www.baidu.com/link?url=HMnQR6d-rPO0YlyHtrIM7E4dn4YUvW6Vm1bNsMLt4WO&wd=&eqid=e3e4166c0000b93600000003603caae8; Hm_lvt_caa88905362b7957005130b28e579d36=1614588653; _ga=GA1.2.617854400.1614588655; _gid=GA1.2.1808945187.1614588655; beitouviews_3758=OUHKI5ksCimBxsKCLklg%252BlwvUZh1FuJ6Vyi9m6XmS6eaAV9W6jgPS14FvCyFS4GHUf3YfgIhBBj5A%252FQLXbdsgSrgYpyHGtzo%252BLBHH0vHJdqh8jvMZEDRH%252FSbbFZITKsr5ErvsUY2Ao%252B5ID8ZFZIeOtAU%252F%252F6wFTelIC3oCspNs%252BbSHJcV2GtqrjikD4mrMGEkdsd3tL0z9v6mHtZh8cPS48AvWQtlpbvQi%252F6jyNUEP1ziCm9fHUmufiDHQEPZNMx0LXzlQATlHuRiScjiXziIgn9w%252BXqCyODFwuwkhDsdEmE1W%252FpFNiIfS9FE1Om0jr22Ig5Ybaavihtfb4NPt89qtQ%253D%253D; 3758_2470_111.36.138.122=1; richviews_3760=tNiZFpEMqXWe%252BFIoHRJd6y6X7RfaTIM3payNSGO2qHjxpAF9DWNOhKKdRJppp4O4V5EHhtbdcrsdgMHtJ04HLqx%252B94djknSuo1i%252B4mFZgv1hOId%252FB49VuDfByAxn5GkjahAWEq3XZww2iosVDdJQtudDjU5V%252BZH17hqG%252FQQB0XHUTOpmaLSMwQB8uoBynw%252F3xAd0ZnPNenng5MOlP2jZBh4%252Fnyan4yKv1zE33NWayTbIyXKnk1NVN1xaiKlRWO6r2Xo9b71Uk97wu9TAG9qJ54szIm90ke%252BDsPoBO1M3ZjeLBgPwN%252F9djQV6daKpCeJjPJqkY2tzbrxnKvddMmFJ1Q%253D%253D; 3760_2444_111.36.138.122=1; Hm_lpvt_caa88905362b7957005130b28e579d36=1614588658"
    }
    req = urllib.request.Request(url=url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(req)
        html = response.read()  # bytes; BeautifulSoup accepts bytes directly
    except Exception as result:
        print(result)  # on failure, fall through and return the empty string
    return html


# Fetch a movie's detail page and pull out its rating and plot summary
def getContentAndFen(url):
    # "剧情介绍" is the site's "plot introduction" label
    contentlink = re.compile(r'<span class="font_888">剧情介绍:</span>(.*?)<', re.S)
    fenlink = re.compile(r'<span class="score .*?"></span>(.*?)</span>', re.S)
    html = askURL(str(url))
    f = ""  # rating
    c = ""  # plot summary
    bs = BeautifulSoup(html, "html.parser")
    for item in bs.find_all('div', class_="info"):
        item = str(item)
        content = re.findall(contentlink, item)
        fen = re.findall(fenlink, item)
        if len(fen) > 0:
            f = fen[0]
        if len(content) > 0:
            c = content[0]
    return f, c
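
To actually run the scraper, a minimal driver is needed; a hypothetical sketch (assuming the site still serves the same markup) could look like this:

Code language: python
if __name__ == "__main__":
    # getData returns five parallel lists in this order
    titles, imgs, srcs, fens, contents = getData()
    for title, src, fen in zip(titles, srcs, fens):
        print(f"{title}  rating={fen}  {src}")

Note that askURL returns bytes on success and "" on failure; BeautifulSoup accepts both, so no explicit decode is needed here.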