Python file download

Author: 热心的社会主义接班人 · Published 2018-05-16 · Column: cs

The source code and a few thousand "mm" pictures are shared via Baidu Cloud.

Download the pictures from the Jianshu friend-making (交友) column. Site: https://www.jianshu.com/c/bd38bd199ec6
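Before the full script for the Jianshu column below, here is a minimal sketch of the core download step on its own, using only the standard library. The helper name `save_file` and the browser-style User-Agent header are illustrative choices, not part of the original code; `urllib.request.urlretrieve(url, filename)` does the same job in one call and is what the script itself relies on.

```python
# Minimal sketch of downloading one file to disk (Python 3 standard library only).
import urllib.request

def save_file(url, filename):
    # Send a browser-like User-Agent, then write the whole response body to disk.
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with urllib.request.urlopen(req) as resp, open(filename, 'wb') as f:
        f.write(resp.read())
```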

```python
import urllib.request
import urllib.parse
import re
import os
import random

def get_road(url0):
    # Fetch a column list page and extract the links to the individual articles.
    req=urllib.request.Request(url0)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                                 '(KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36')
    response=urllib.request.urlopen(req)
    html=response.read().decode("utf-8")
    pattern=re.compile(r'<a class="title" target="_blank" href="(.*?)"')
    result=re.findall(pattern,html)
    return result

def get_jiaoyou_url(result,s0):
    # Turn the relative article paths into absolute jianshu.com URLs.
    s=s0
    return geturl(result,s)
    

def gethtml(ur):
    url=ur
    req=urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                                 '(KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36')
    response=urllib.request.urlopen(req)
    html=response.read().decode("utf-8")
    return html

def getpath(html):
    # Pull the protocol-relative .png image addresses out of the article HTML.
    #reg=r'.*?\.png'
    reg=r'<img data-original-src="(.*?\.png)"'
    imgre=re.compile(reg)
    urls=imgre.findall(html)
    return urls

def geturl(url,s):
    # Prefix every path in the list with s to build complete URLs.
    urls=[s+str(i) for i in url]
    for i in range(len(urls)):
        print(urls[i])
    print("url_length=",len(urls))
    return urls
    
    
def download(urls):
    # Save every image; the global counter x is used to number the files.
    global x
    print("++++++++++++++++")
    print(urls)
    print("length=",len(urls))
    if len(urls)==0:
        print("not download")
        return 0
    for url in urls:
        filename='/home/dflx/下载/jiaoyou_photo/'+str(x)+'.png'
        urllib.request.urlretrieve(url,filename)
        x+=1
    print(x)
    
def download_one(url):
    #ur='https://www.jianshu.com/p/407dac18983c'
    ur=url
    html=gethtml(ur)
    path=getpath(html)
    urls=geturl(path,'https:')
    download(urls)
    
    
def download_all(urls):
    print(len(urls))
    print('---------------')
    index=0
    while index<len(urls):
        print(urls[index])
        download_one(urls[index])
        index+=1
        print("********")
        

#urpath="https://www.jianshu.com/c/bd38bd199ec6?order_by=added_at&page="        
def page(url,start,end):
    # Build the list of paginated column URLs for pages start..end-1.
    print("$$$$$$$$$")
    lturl=[]
    for i in range(start,end):
        lturl.append(url+str(i))
    print(lturl)
    return lturl
        
        
        
       
        
x=0   # global counter used to number the downloaded image files


def main():
    # Single-article test kept from the original:
    # ur='https://www.jianshu.com/p/189d1b8101e6'
    # download_one(ur)

    urpath="https://www.jianshu.com/c/bd38bd199ec6?order_by=added_at&page="
    urall=page(urpath,0,999)
    for url in urall:
        print("the end url")
        print(url)
        result=get_road(url)
        allurls=get_jiaoyou_url(result,'https://www.jianshu.com')
        download_all(allurls)


# Earlier manual tests, kept as comments:
# ur='https://www.jianshu.com/p/407dac18983c'
# ur='https://www.jianshu.com/p/189d1b8101e6'
# html=gethtml(ur)
# path=getpath(html)
# urls=geturl(path,'https:')
# download(urls)
#
# url0="https://www.jianshu.com/c/bd38bd199ec6"
# result=get_road(url0)
# allurls=get_jiaoyou_url(result,'https://www.jianshu.com')
# download_all(allurls)


if __name__ == '__main__':
    main()
```

(Image: meizi.png — a sample of the downloaded pictures)

That came to about 500 MB, so the script should have gone through roughly all of the articles.
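Since `page()` hard-codes 999 list pages, another way to be sure the whole column is covered is to keep requesting pages until one returns no article links. A rough sketch that reuses `get_road()`, `get_jiaoyou_url()` and `download_all()` from the script above; the assumption that an empty page marks the end of the column is mine, not something stated in the original.

```python
def crawl_until_empty(urpath, s0='https://www.jianshu.com', max_pages=999):
    # Walk the paginated column and stop at the first page with no article links.
    page_no=0
    while page_no<max_pages:
        result=get_road(urpath+str(page_no))
        if not result:          # empty page: assume the column is exhausted
            break
        download_all(get_jiaoyou_url(result,s0))
        page_no+=1
    return page_no
```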

Crawl http://www.mm29.com/ and download its pictures.

```python
import urllib.request
import urllib.parse
import re
import os
import random

def get_road(url0):
    # Extract the tag slugs linked from the site's front page.
    req=urllib.request.Request(url0)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                                 '(KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36')
    response=urllib.request.urlopen(req)
    html=response.read().decode("utf-8")
    pattern=re.compile(r'<a href="http://www.mm29.com/tag/(.*?)"')
    result=re.findall(pattern,html)
    print(result)
    return result

def get_jiaoyou_url(result,s0):
    s=s0
    return geturl(result,s)
    

def gethtml(ur):
    url=ur
    req=urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                                 '(KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36')
    response=urllib.request.urlopen(req)
    html=response.read().decode("utf-8")
    return html

def getpath(html):
    # Capture the image path fragment between .../images/ and /800.jpg.
    #reg=r'.*?\.png'
    reg=r'<img class="scrollLoading" data-original="http://img.mm29.com/images/(.*?.jpg)/800.jpg"'
    imgre=re.compile(reg)
    urls=imgre.findall(html)
    print("////////////////////////")
    print(urls)
    return urls

def geturl(url,s0):
    # Prefix every item in the list with s0 to build complete URLs.
    urls=[s0+str(i) for i in url]
    for i in range(len(urls)):
        print(urls[i])
    print("url_length=",len(urls))
    return urls

def get_imag_url(url,s1,s2):
    # Rebuild the full image URLs by wrapping each fragment in prefix s1 and suffix s2.
    urls=[s1+str(i) for i in url]
    urls=[str(i)+s2 for i in urls]
    for i in range(len(urls)):
        print(urls[i])
    print("url_length=",len(urls))
    return urls
    
    
def download(urls):
    global x
    print("++++++++++++++++")
    print(urls)
    print("length=",len(urls))
    if len(urls)==0:
        print("not download")
        return 0
    for url in urls:
        filename='/home/dflx/下载/mm_picture/'+str(x)+'.jpg'
        urllib.request.urlretrieve(url,filename)
        x+=1
    print(x)
    
def download_one(mm_url):
    print("*****++++++")
    print(mm_url)
    html=gethtml(mm_url)
    url=getpath(html)
    imag_url=get_imag_url(url,"http://img.mm29.com/images/","/800.jpg")
    download(imag_url)
    
    
def download_all(urls):
    print("for count=",len(urls))
    print('---------------')
    index=0
    while index<len(urls):
        print(urls[index])
        download_one(urls[index])
        index+=1
        print("********")
        

#urpath="https://www.jianshu.com/c/bd38bd199ec6?order_by=added_at&page="        
def page(url,start,end):
    # Build the per-tag page URLs: url/0, url/1, ... url/(end-1).
    print("$$$$$$$$$")
    lturl=[]
    for i in range(start,end):
        lturl.append(url+'/'+str(i))
    print(lturl)
    return lturl
        
        
        
       
        
x=0   # global counter used to number the downloaded image files


def main():
    url0="http://www.mm29.com/"
    s0="http://www.mm29.com/tag/"
    result=get_road(url0)                 # tag slugs scraped from the front page
    mm_url=get_jiaoyou_url(result,s0)     # full tag URLs
    print("88888",mm_url[22])
    for i in range(23):
        url=page(mm_url[i],0,16)          # 16 list pages per tag
        download_all(url)


if __name__ == '__main__':
    main()

    
```
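As a small usage illustration of how `get_imag_url()` above reassembles the image URLs, with a made-up path fragment standing in for what `getpath()` would capture:

```python
# The fragment below is invented purely for illustration.
fragment=['2018/05/07/abc123.jpg']        # what getpath() would capture
full=get_imag_url(fragment,"http://img.mm29.com/images/","/800.jpg")
# full == ['http://img.mm29.com/images/2018/05/07/abc123.jpg/800.jpg']
```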

Over 5,000 pictures were downloaded, again roughly 500 MB. The site probably has anti-scraping measures, because after I left the crawler running and stepped away, my access was blocked.
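One way to make the crawler less likely to trip such measures is to pause a random interval between requests and skip failed downloads instead of crashing. A sketch under those assumptions; the name `polite_download` and the 1–3 second delay range are arbitrary choices, not from the original post.

```python
import random
import time
import urllib.request

def polite_download(urls, folder, start=0, low=1.0, high=3.0):
    # Download each URL with a random pause in between; skip failures.
    count=start
    for url in urls:
        try:
            urllib.request.urlretrieve(url, '%s/%d.jpg' % (folder, count))
            count+=1
        except Exception as err:     # e.g. HTTP 403 once the site starts blocking
            print('skip', url, err)
        time.sleep(random.uniform(low, high))   # be gentle between requests
    return count
```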