
Asynchronous Crawler with asyncio + Python 3.7+ (async + await)

Author: 用户2337871
Published: 2020-04-24 10:13:55
The full script is below. It fetches the paginated book index from the demo API, then each book's detail page, and upserts every record into MongoDB via motor, with an asyncio.Semaphore capping concurrency at five requests.

Code language: Python

import asyncio
import logging

import aiohttp
from aiohttp import ContentTypeError
from motor.motor_asyncio import AsyncIOMotorClient

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s: %(message)s')

INDEX_URL = 'https://dynamic5.scrape.cuiqingcai.com/api/book/?limit=18&offset={offset}'
DETAIL_URL = 'https://dynamic5.scrape.cuiqingcai.com/api/book/{id}'
PAGE_SIZE = 18
PAGE_NUMBER = 1
CONCURRENCY = 5

MONGO_CONNECTION_STRING = 'mongodb://localhost:27017'
MONGO_DB_NAME = 'books'
MONGO_COLLECTION_NAME = 'books'

client = AsyncIOMotorClient(MONGO_CONNECTION_STRING)
db = client[MONGO_DB_NAME]
collection = db[MONGO_COLLECTION_NAME]

loop = asyncio.get_event_loop()


class Spider(object):

    def __init__(self):
        # Cap the number of requests that may be in flight at the same time
        self.semaphore = asyncio.Semaphore(CONCURRENCY)

    async def scrape_api(self, url):
        async with self.semaphore:
            try:
                logging.info('scraping %s', url)
                async with self.session.get(url) as response:
                    await asyncio.sleep(1)
                    return await response.json()
            except ContentTypeError:
                # exc_info=True attaches the full traceback to the log record
                logging.error('error occurred while scraping %s', url, exc_info=True)

    async def scrape_index(self, page):
        # Index pages are paginated by offset: PAGE_SIZE items per page
        url = INDEX_URL.format(offset=PAGE_SIZE * (page - 1))
        return await self.scrape_api(url)

    async def scrape_detail(self, id):
        url = DETAIL_URL.format(id=id)
        data = await self.scrape_api(url)
        await self.save_data(data)

    async def save_data(self, data):
        logging.info('saving data %s', data)
        if data:
            # Upsert keyed on the book id so repeated runs do not create duplicates
            return await collection.update_one({
                'id': data.get('id')
            }, {
                '$set': data
            }, upsert=True)

    async def main(self):
        self.session = aiohttp.ClientSession()
        # index tasks
        scrape_index_tasks = [asyncio.ensure_future(self.scrape_index(page))
                              for page in range(1, PAGE_NUMBER + 1)]
        results = await asyncio.gather(*scrape_index_tasks)
        # detail tasks
        print('results', results)
        ids = []
        for index_data in results:
            if not index_data:
                continue
            for item in index_data.get('results'):
                ids.append(item.get('id'))
        scrape_detail_tasks = [asyncio.ensure_future(self.scrape_detail(id)) for id in ids]
        await asyncio.wait(scrape_detail_tasks)
        await self.session.close()


if __name__ == '__main__':
    spider = Spider()
    loop.run_until_complete(spider.main())
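
Since the post targets Python 3.7+, the event loop can also be driven with asyncio.run() instead of get_event_loop()/run_until_complete(). A minimal sketch of that alternative entry point, assuming the Spider class above is unchanged:

Code language: Python

import asyncio

if __name__ == '__main__':
    # asyncio.run() (Python 3.7+) creates the event loop, runs the coroutine,
    # and closes the loop, so the module-level `loop` above is no longer needed.
    asyncio.run(Spider().main())

As a further option, aiohttp's ClientSession can be opened with `async with` inside main() so the session is always closed even if a request fails.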