I've recently been learning about threads and processes. Comments online mostly claim processes are faster, so in the spirit of exploration I wrote a thread version and a process version of the same task and compared their download speeds. (Sorry, the demo video I uploaded is still under review.)
Comparing the two, the thread version is like a Lamborghini: underpowered when the download starts, but pulling hard later on. The early cost comes from acquiring the GIL, lock contention, and thread switching, all of which eat resources; since the GIL lets only one thread run Python bytecode at a time, CPU-bound work runs faster under multiprocessing. The process version is like a Ferrari: thanks to the process pool it shows good horsepower right out of the gate, but the horsepower stops climbing later, because spawning processes and shuttling data between them is expensive. For I/O-bound work such as downloading, a thread releases the GIL while it waits on the network, so multithreading ends up the more efficient choice there.
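To make the CPU-bound half of that claim concrete, here is a minimal timing sketch (my own illustration, not part of the original experiment; the 4-worker pools and the countdown size are arbitrary choices) that runs the same pure-Python loop under a thread pool and a process pool:

import time
from multiprocessing import Pool
from multiprocessing.dummy import Pool as ThreadPool  # thread-backed, same API

def count_down(n):
    # A pure-Python loop holds the GIL the whole time, so threads cannot
    # overlap this work even on a multi-core machine
    while n > 0:
        n -= 1

if __name__ == '__main__':
    jobs = [5000000] * 4
    for name, PoolCls in (('threads', ThreadPool), ('processes', Pool)):
        start = time.perf_counter()
        with PoolCls(4) as pool:
            pool.map(count_down, jobs)
        print(name, round(time.perf_counter() - start, 2), 's')

On a multi-core machine the process pool usually finishes this several times faster, because the countdown never releases the GIL; swap the countdown for a network request and the gap narrows or reverses, which is exactly the I/O-bound case the downloaders below exercise.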
The code is below; feel free to copy it and run it locally:
-------------------------------Process-----------------------------------
import os
import multiprocessing
from multiprocessing import Pool
from urllib.request import urlretrieve
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

BASE_PAGE_URL = 'http://www.doutula.com/photo/list/?page='
# List holding the paginated list-page urls
PAGE_URLS_LIST = []
# List holding the meme image urls
FACE_URLS_LIST = []
# Build the url for every list page and store it
def get_urls():
    for i in range(1, 101):
        urls = BASE_PAGE_URL + str(i)
        PAGE_URLS_LIST.append(urls)
    return PAGE_URLS_LIST
# Fetch the page source
def get_html(url):
    try:
        response = requests.get(url)
        # A 200 status code means the request succeeded; anything else returns None
        if response.status_code == 200:
            response.encoding = 'utf-8'
            html = response.content
            return html
        return None
    except RequestException:
        return None
# Pick the image urls out of the page source and store them
def get_img_url(html):
    soup = BeautifulSoup(html, 'lxml')
    img_infos = soup.find_all('img', attrs={'class': 'img-responsive lazy image_dta'})
    for img_info in img_infos:
        img_url = img_info['data-original']
        FACE_URLS_LIST.append(img_url)
    return FACE_URLS_LIST
# Save one image into the local images/ directory
def download_img(img_url):
    img_name = img_url.split('/')[-1]
    path = os.path.join('images', img_name)
    urlretrieve(img_url, path)
# Collect every image url first, so the pool can fan out the downloads
def main():
    for url in get_urls():
        html = get_html(url)
        if html:
            get_img_url(html)
    return FACE_URLS_LIST
if __name__ == '__main__':
    os.makedirs('images', exist_ok=True)
    img_urls = main()
    pool = Pool(multiprocessing.cpu_count())
    # map() takes a function plus an iterable; the original pool.map(main())
    # ran main() once and handed its return value to map as the "function"
    pool.map(download_img, img_urls)
    pool.close()
    pool.join()
-------------------------------Thread-----------------------------------
import os
import threading
import time
from urllib.request import urlretrieve
import requests
from bs4 import BeautifulSoup

BASE_PAGE_URL = 'http://www.doutula.com/photo/list/?page='
# Queue of list-page urls
PAGE_URL_LIST = []
# Queue of meme image urls
FACE_URL_LIST = []
# Global lock guarding both lists
gLock = threading.Lock()
for i in range(1, 101):
    url = BASE_PAGE_URL + str(i)
    PAGE_URL_LIST.append(url)
# Producer: grab a list page and scrape the image urls off it
def get_img_url():
    while True:
        gLock.acquire()
        if len(PAGE_URL_LIST) == 0:
            gLock.release()
            break
        else:
            page_url = PAGE_URL_LIST.pop()
            gLock.release()
        response = requests.get(page_url)
        content = response.content
        soup = BeautifulSoup(content, 'lxml')
        img_list = soup.find_all('img', attrs={'class': 'img-responsive lazy image_dta'})
        gLock.acquire()
        for img in img_list:
            url = img['data-original']
            FACE_URL_LIST.append(url)
        gLock.release()
# Consumer: pop an image url and save the file locally
def download_img():
    while True:
        gLock.acquire()
        if len(FACE_URL_LIST) == 0:
            gLock.release()
            # Nothing queued yet: sleep briefly instead of busy-spinning on
            # the lock. The consumers never exit on their own; stop the
            # script manually once the downloads finish.
            time.sleep(0.5)
            continue
        else:
            face_url = FACE_URL_LIST.pop()
            gLock.release()
        filename = face_url.split('/')[-1]
        path = os.path.join('images', filename)
        urlretrieve(face_url, filename=path)
def main():
    os.makedirs('images', exist_ok=True)
    # Start 3 producer threads to scrape the image urls
    for i in range(3):
        th = threading.Thread(target=get_img_url)
        th.start()
    # Start 4 consumer threads to download the images
    for i in range(4):
        th = threading.Thread(target=download_img)
        th.start()
if __name__ == '__main__':
    main()