Python json数据爬取处理，红点官网大奖设计作品爬取

二爷

发布于 2020-07-22 11:28:47

1.2K1

发布于 2020-07-22 11:28:47

文章被收录于专栏：二爷记二爷记

红点奖，源自德国。是与IF设计奖齐名的一个工业设计大奖，是世界上知名设计竞赛中最大最有影响的竞赛。红点奖与德国“IF奖”、美国“IDEA奖”一起并称为世界三大设计奖，想要知晓好设计，红点大奖设计不容错过！

官网：

https://www.red-dot.org/zh/search?q&p=0&a=&c=&y=2019&sort=relevance%20desc&f=product-design&tab=1

通过抓包可以发现，数据是通过json来获取！

get params参数：

我们用 Python 来实现：

最大页数设置为100，当接口不再返回作品数据（docs 为空）时跳出循环！

#获取json数据
def get_datas(category):
    """Crawl the 2019 Product Design entries of one award category.

    Creates ``Red Dot Design Award/<category>`` and then requests the
    red-dot.org search endpoint page by page (at most 100 pages of 2 entries),
    handing each non-empty page of documents to ``get_content``. Crawling
    stops at the first page whose ``docs`` list is empty.

    Args:
        category: Award category name. It is inserted into the
            ``awardCategory_stringM`` filter of the query and used as the
            name of the output sub-directory.

    Raises:
        Any exception raised by ``requests.get`` (including a timeout),
        ``json.loads``, the ``['response']['docs']`` lookup, or
        ``get_content`` propagates to the caller.
    """
    path = f'Red Dot Design Award/{category}'
    os.makedirs(path, exist_ok=True)  # create the output directory
    url = "https://www.red-dot.org/index.php?"
    rows = 2  # number of entries requested per page
    for i in range(100):
        print(f'正在采集第{i+1}页...')
        params = {
            'rows': str(rows),
            # Solr's ``start`` is a document offset, not a page index, so it
            # has to advance by ``rows`` per page. Sending the bare page
            # index made consecutive pages overlap and re-download entries.
            'start': str(i * rows),
            'eID': 'tx_solr_proxy',
            'L': '2',
            'id': '1',
            'grouping': '0',
            'fq': f'(altType_stringS:"Product Design") AND year_intS: "2019" AND awardCategory_stringM:"{category}"',
            'sort': 'relevance desc',
        }
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, params=params, timeout=30)
        time.sleep(2)  # throttle between requests
        req = json.loads(response.content)
        print(req)
        docs = req['response']['docs']
        if not docs:
            # An empty page means there are no more entries in this category.
            break
        print(len(docs))
        print(docs)
        get_content(path, docs)

解析json数据，获取内容：

#获取内容
def get_content(path,docs):
    """Save the image and the text summary of every document in ``docs``.

    For each document the title is sanitised into a file-system-safe name,
    the large overview image is downloaded via ``get_img`` and a text file
    holding title, description and jury statement is written via
    ``get_texts``.

    Args:
        path: Directory that receives the ``.jpg`` and ``.txt`` files.
        docs: Iterable of document mappings taken from the JSON response.
    """
    for entry in docs:
        # Characters that are not allowed in file names become underscores.
        title = re.sub(r'[\|\/\<\>\:\*\?\\\"]', "_", entry['title'])
        picture_name = f'{title}.jpg'
        summary = entry['description']
        jury_statement = entry['juryStatement_stringS']
        jury_block = f'评审团评语\n{jury_statement}'
        picture_url = entry['overviewImage_stringS']['large']
        get_img(picture_name, picture_url, path)
        body = f'{title}\n{summary}\n{jury_block}'
        print(body)
        get_texts(title, body, path)

图片下载：

#下载图片
def get_img(img_name,img_url,path):
    """Download one image and store it as ``<path>/<img_name>``.

    The file is written only when the server answers with a successful
    status; otherwise a failure message is printed and nothing is saved, so
    an HTTP error page never ends up on disk as a ``.jpg``.

    Args:
        img_name: File name (including extension) for the saved image.
        img_url: URL the image is fetched from.
        path: Existing directory the image is written into.

    Raises:
        Any exception raised by ``requests.get`` (including a timeout) or by
        opening/writing the file propagates to the caller.
    """
    # Announce the download before it starts rather than after it finished.
    print(f"开始下载 {img_name}图片 ...")
    # Without a timeout a stalled connection would block the crawl forever.
    r = requests.get(img_url, timeout=30)
    if not r.ok:
        print(f">>>下载 {img_name}图片 失败，状态码：{r.status_code}")
        return
    with open(f'{path}/{img_name}', 'wb') as f:
        f.write(r.content)
    print(f">>>下载 {img_name}图片 成功！")

保存文本内容：

#保存文本内容
def get_texts(h1,texts,path):
    """Write ``texts`` to ``<path>/<h1>.txt`` as UTF-8, logging progress.

    Args:
        h1: Title used as the base name of the text file.
        texts: Full text content to store.
        path: Existing directory the file is written into.
    """
    print(f"开始保存 {h1}.txt ...")
    destination = f'{path}/{h1}.txt'
    with open(destination, mode='w', encoding='utf-8') as out_file:
        out_file.write(texts)
    print(f">>>保存 {h1}.txt 成功！")

主函数：

由于分类比较多，而且数据不好抓取，我们手动构建列表！

同时加上了异常处理：某个分类抓取失败时打印错误信息，然后继续抓取下一个分类！

def mian():
    """Crawl every hand-listed award category, one after another.

    The category names are hard-coded. A failure while crawling one category
    is reported on stdout and does not stop the remaining categories.
    """
    award_categories = [
        "Audio",
        "Automobile und Motorräder",
        "Baby und Kind",
        "Bad und Sanitär",
        "Badarmaturen und Duschköpfe",
        "Boote",
        "Brillen",
        "Bürobedarf und Schreibwaren",
        "Büromöbel und Bürostühle",
        "Computer- und Informationstechnik",
        "Fahrräder und Fahrradzubehör",
        "Fahrzeugzubehör",
        "Gartengeräte und Gartenzubehör",
        "Gartenmöbel",
        "Gartenwerkzeuge",
        "Haushaltsgeräte und Haushaltszubehör",
        "Healthcare",
        "Heiz- und Klimatechnik",
        "Hobby und Freizeit",
        "Industriegeräte",
        "Innenarchitektur und Interior Design",
        "Interior-Design-Elemente",
        "Kameras und Kamerazubehör",
        "Kochgeschirr und Kochutensilien",
        "Koffer und Taschen",
        "Kommunikationstechnik",
        "Küchen und Küchenmöbel",
        "Küchenarmaturen und Spülen",
        "Küchengeräte und Küchenzubehör",
        "Licht und Leuchten",
        "Maschinen und Automation",
        "Materialien und Oberflächen",
        "Medizinische Geräte und Medizintechnik",
        "Mobiltelefone",
        "Mode und Lifestyle-Accessoires",
        "Nutzfahrzeuge",
        "Outdoor- und Camping-Equipment",
        "Personal Care",
    ]
    for name in award_categories:
        print(name)
        try:
            get_datas(name)
        except Exception as err:
            # Top-level boundary: report the problem and move on.
            print(f"获取数据失败了，错误代码：{err}")

运行效果：

爬取效果：

附完整代码：

#红点奖作品采集
#20191212 by 微信：huguo00289
# -*- coding: utf-8 -*-

import requests
import re,time,os
import json

#获取json数据
def get_datas(category):
    """Crawl the 2019 Product Design entries of one award category.

    Creates ``Red Dot Design Award/<category>`` and then requests the
    red-dot.org search endpoint page by page (at most 100 pages of 2 entries),
    handing each non-empty page of documents to ``get_content``. Crawling
    stops at the first page whose ``docs`` list is empty.

    Args:
        category: Award category name. It is inserted into the
            ``awardCategory_stringM`` filter of the query and used as the
            name of the output sub-directory.

    Raises:
        Any exception raised by ``requests.get`` (including a timeout),
        ``json.loads``, the ``['response']['docs']`` lookup, or
        ``get_content`` propagates to the caller.
    """
    path = f'Red Dot Design Award/{category}'
    os.makedirs(path, exist_ok=True)  # create the output directory
    url = "https://www.red-dot.org/index.php?"
    rows = 2  # number of entries requested per page
    for i in range(100):
        print(f'正在采集第{i+1}页...')
        params = {
            'rows': str(rows),
            # Solr's ``start`` is a document offset, not a page index, so it
            # has to advance by ``rows`` per page. Sending the bare page
            # index made consecutive pages overlap and re-download entries.
            'start': str(i * rows),
            'eID': 'tx_solr_proxy',
            'L': '2',
            'id': '1',
            'grouping': '0',
            'fq': f'(altType_stringS:"Product Design") AND year_intS: "2019" AND awardCategory_stringM:"{category}"',
            'sort': 'relevance desc',
        }
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, params=params, timeout=30)
        time.sleep(2)  # throttle between requests
        req = json.loads(response.content)
        print(req)
        docs = req['response']['docs']
        if not docs:
            # An empty page means there are no more entries in this category.
            break
        print(len(docs))
        print(docs)
        get_content(path, docs)


#获取内容
def get_content(path,docs):
    """Save the image and the text summary of every document in ``docs``.

    For each document the title is sanitised into a file-system-safe name,
    the large overview image is downloaded via ``get_img`` and a text file
    holding title, description and jury statement is written via
    ``get_texts``.

    Args:
        path: Directory that receives the ``.jpg`` and ``.txt`` files.
        docs: Iterable of document mappings taken from the JSON response.
    """
    for entry in docs:
        # Characters that are not allowed in file names become underscores.
        title = re.sub(r'[\|\/\<\>\:\*\?\\\"]', "_", entry['title'])
        picture_name = f'{title}.jpg'
        summary = entry['description']
        jury_statement = entry['juryStatement_stringS']
        jury_block = f'评审团评语\n{jury_statement}'
        picture_url = entry['overviewImage_stringS']['large']
        get_img(picture_name, picture_url, path)
        body = f'{title}\n{summary}\n{jury_block}'
        print(body)
        get_texts(title, body, path)


#下载图片
def get_img(img_name,img_url,path):
    """Download one image and store it as ``<path>/<img_name>``.

    The file is written only when the server answers with a successful
    status; otherwise a failure message is printed and nothing is saved, so
    an HTTP error page never ends up on disk as a ``.jpg``.

    Args:
        img_name: File name (including extension) for the saved image.
        img_url: URL the image is fetched from.
        path: Existing directory the image is written into.

    Raises:
        Any exception raised by ``requests.get`` (including a timeout) or by
        opening/writing the file propagates to the caller.
    """
    # Announce the download before it starts rather than after it finished.
    print(f"开始下载 {img_name}图片 ...")
    # Without a timeout a stalled connection would block the crawl forever.
    r = requests.get(img_url, timeout=30)
    if not r.ok:
        print(f">>>下载 {img_name}图片 失败，状态码：{r.status_code}")
        return
    with open(f'{path}/{img_name}', 'wb') as f:
        f.write(r.content)
    print(f">>>下载 {img_name}图片 成功！")

#保存文本内容
def get_texts(h1,texts,path):
    """Write ``texts`` to ``<path>/<h1>.txt`` as UTF-8, logging progress.

    Args:
        h1: Title used as the base name of the text file.
        texts: Full text content to store.
        path: Existing directory the file is written into.
    """
    print(f"开始保存 {h1}.txt ...")
    destination = f'{path}/{h1}.txt'
    with open(destination, mode='w', encoding='utf-8') as out_file:
        out_file.write(texts)
    print(f">>>保存 {h1}.txt 成功！")


def mian():
    """Crawl every hand-listed award category, one after another.

    The category names are hard-coded. A failure while crawling one category
    is reported on stdout and does not stop the remaining categories.
    """
    award_categories = [
        "Audio",
        "Automobile und Motorräder",
        "Baby und Kind",
        "Bad und Sanitär",
        "Badarmaturen und Duschköpfe",
        "Boote",
        "Brillen",
        "Bürobedarf und Schreibwaren",
        "Büromöbel und Bürostühle",
        "Computer- und Informationstechnik",
        "Fahrräder und Fahrradzubehör",
        "Fahrzeugzubehör",
        "Gartengeräte und Gartenzubehör",
        "Gartenmöbel",
        "Gartenwerkzeuge",
        "Haushaltsgeräte und Haushaltszubehör",
        "Healthcare",
        "Heiz- und Klimatechnik",
        "Hobby und Freizeit",
        "Industriegeräte",
        "Innenarchitektur und Interior Design",
        "Interior-Design-Elemente",
        "Kameras und Kamerazubehör",
        "Kochgeschirr und Kochutensilien",
        "Koffer und Taschen",
        "Kommunikationstechnik",
        "Küchen und Küchenmöbel",
        "Küchenarmaturen und Spülen",
        "Küchengeräte und Küchenzubehör",
        "Licht und Leuchten",
        "Maschinen und Automation",
        "Materialien und Oberflächen",
        "Medizinische Geräte und Medizintechnik",
        "Mobiltelefone",
        "Mode und Lifestyle-Accessoires",
        "Nutzfahrzeuge",
        "Outdoor- und Camping-Equipment",
        "Personal Care",
    ]
    for name in award_categories:
        print(name)
        try:
            get_datas(name)
        except Exception as err:
            # Top-level boundary: report the problem and move on.
            print(f"获取数据失败了，错误代码：{err}")


# Start the crawl only when this file is executed as a script, not when it
# is imported as a module.
if __name__=='__main__':
    mian()

本文参与腾讯云自媒体分享计划，分享自微信公众号。

原始发表：2019-12-12，如有侵权请联系 cloudcommunity@tencent.com 删除

json

本文分享自 Python与SEO学习微信公众号，前往查看

如有侵权，请联系 cloudcommunity@tencent.com 删除。

本文参与腾讯云自媒体分享计划，欢迎热爱写作的你一起参与！

json

登录后参与评论

0 条评论

热度

Python json数据爬取处理，红点官网大奖设计作品爬取

Python json数据爬取处理，红点官网大奖设计作品爬取

社区

活动

资源

关于

腾讯云开发者

热门产品

热门推荐

更多推荐