前往小程序,Get更优阅读体验!
立即前往
首页
学习
活动
专区
工具
TVP
发布
社区首页 >专栏 >Python3--爬取海词信息

Python3--爬取海词信息

作者头像
K同学啊
发布2019-01-22 14:59:40
2980
发布2019-01-22 14:59:40
举报

上代码:

代码语言:python
复制
#!/usr/bin/python3

import queue
import threading
import requests,csv,time,random  
from bs4 import BeautifulSoup  
from fake_useragent import UserAgent 
import pandas as pd 

# Global stop flag polled by the worker threads in process_data();
# the main thread sets it to 1 once the work queue has been drained.
exitFlag = 0

#利用pandas读取csv文件
def getNames(csvfile):
    data = pd.read_csv(csvfile,delimiter='|')                   # 1--读取的文件编码问题有待考虑
    names = data['EnName']
    return names

#获取ip列表  
def get_ip_list():
    """Read ip.txt and return its lines (one proxy IP per line, trailing
    '\\n' kept — get_random_ip() strips it).

    Uses a 'with' block so the file handle is closed even if readlines()
    raises, unlike the original open()/close() pair.
    """
    with open('ip.txt', 'r') as f:
        return f.readlines()
      
#从IP列表中获取随机IP      
# Pick one proxy at random from the IP list
def get_random_ip(ip_list):
    """Return a requests-style proxies dict for a random entry of *ip_list*.

    Entries come straight from ip.txt, so the trailing newline is stripped.
    """
    chosen = random.choice(ip_list).strip('\n')
    return {'https': chosen}
  
#功能:将信息写入文件      
def write_file(filePath,row):        
    with open(filePath,'a+',encoding='utf-8',newline='') as csvfile:        
        spanreader = csv.writer(csvfile,delimiter='|',quoting=csv.QUOTE_MINIMAL)        
        spanreader.writerow(row)  

def get_content(url, ip_list):
    """Fetch *url* through a random proxy and parse out the 'mbox' divs.

    Makes up to three attempts with the same schedule as before:
    sleep 1s / timeout 20s, then sleep 10s / timeout 40s, then sleep 15s
    with no timeout at all. The final attempt's exception propagates to
    the caller, exactly as in the original nested try/except version.

    Returns (status_code, list_of_mbox_div_tags).
    """
    # (message printed before the attempt, pre-attempt delay, timeout)
    attempts = (
        (None, 1, 20),
        ("重新运行", 10, 40),
        ("第二次重新运行", 15, None),   # last try: no timeout, errors propagate
    )
    req = None
    for i, (msg, delay, timeout) in enumerate(attempts):
        if msg is not None:
            print(msg)
        time.sleep(delay)
        proxies = get_random_ip(ip_list)
        headers = {'User-Agent': str(UserAgent().random)}
        kwargs = {} if timeout is None else {'timeout': timeout}
        try:
            req = requests.get(url=url, proxies=proxies, headers=headers, **kwargs)
            break
        # Narrowed from the original bare 'except:' so Ctrl-C / SystemExit
        # still work; the last attempt re-raises whatever went wrong.
        except Exception:
            if i == len(attempts) - 1:
                raise

    req.encoding = 'utf-8'
    soup = BeautifulSoup(req.text, 'lxml')

    content = soup.find_all('div', class_='mbox')
    return req.status_code, content


#获取准确的英文名、中文名、名字含义、来源、性别等信息
def get_infor_header(content):
    content = content.find_all('span')
    
    EnName = []
    CnName = []
    Gender = []
    Source = []
    Meaning = []

    EnName.append(content[0].get_text())
    if len(content) != 1:
        CnName.append(content[1].get_text())
        Meaning.append(content[2].get_text()) 
        Source.append(content[3].get_text())
        Gender.append(content[4].em.get('title'))
    else:
        CnName.append('')
        Meaning.append('') 
        Source.append('')
        Gender.append('')

    #信息的链接方式EnName|CnName|Gender|Source|Meaning
    list_header = EnName + CnName + Gender + Source + Meaning

    return list_header

#获取英文名对应的名人
def get_infor_celebrity(content):
    content = content.find_all('li')
    list_celebrity = []
    str_celebrity=''
    for each in content:
        if not str_celebrity:
            str_celebrity +=each.get_text()
        else:
            str_celebrity +='@' + each.get_text()
    list_celebrity.append(str_celebrity)
    return list_celebrity

class myThread (threading.Thread):
    """Worker thread that drains the shared work queue via process_data().

    Parameters: threadID (int label), name (thread name used in logs),
    q (the shared queue.Queue), ip_list (proxy IPs passed to get_content).
    """
    def __init__(self, threadID, name, q, ip_list):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.q = q
        self.ip_list = ip_list
    def run(self):
        print ("开启线程:" + self.name)
        # BUG FIX: the original passed the module-level global ip_list here,
        # silently ignoring the ip_list given to the constructor.
        process_data(self.name, self.q, self.ip_list)
        print ("退出线程:" + self.name)

def process_data(threadName, q, ip_list):
    """Worker loop: pull English names off the queue, scrape each one from
    ename.dict.cn, and append the parsed row to haici_infor.csv.

    Relies on module-level globals: exitFlag (stop signal set by the main
    thread), queueLock (guards both the queue-emptiness check and the CSV
    append), and workQueue. queueLock is released before the slow network
    fetch so other workers can proceed, then re-acquired for the file write.

    NOTE(review): emptiness is tested on the global workQueue while items
    are taken from the parameter q — these are the same object at the call
    site, but the mismatch is worth confirming.
    """
    while not exitFlag:
        queueLock.acquire()
        if not workQueue.empty():
            data = q.get()
            queueLock.release()
            print ("%s processing %s" % (threadName, data))
            url = 'http://ename.dict.cn/{}'.format(data)
            status_code, content = get_content(url,ip_list)
            # Non-200 responses are silently skipped (the name is dropped).
            if status_code==200:
                # Exact English/Chinese name, meaning, source and gender info
                list_header = get_infor_header(content[0])
                # Celebrity info for this English name
                list_celebrity = get_infor_celebrity(content[1])
                row = list_header + list_celebrity
                queueLock.acquire()
                write_file('haici_infor.csv',row)
                queueLock.release()
        else:
            queueLock.release()
        time.sleep(1)

# ---- main script: spin up workers, fill the queue, wait, shut down ----
threadList = ["Thread-1", "Thread-2", "Thread-3", "Thread-4", "Thread-5", "Thread-6", "Thread-7", "Thread-8", "Thread-9", "Thread-10"]
nameList = getNames('A-Z.csv')
queueLock = threading.Lock()
workQueue = queue.Queue(100000)
threads = []
threadID = 1

# Create the worker threads (each gets the shared queue and proxy list)
ip_list = get_ip_list()
for tName in threadList:
    thread = myThread(threadID, tName, workQueue, ip_list)
    thread.start()
    threads.append(thread)
    threadID += 1

# Fill the queue under the lock so no worker sees a half-filled queue
queueLock.acquire()
for word in nameList:
    workQueue.put(word)
queueLock.release()

# Wait for the queue to drain. The original spun in a busy-wait
# ('while ...: pass'), pinning a CPU core; sleeping briefly between
# checks is behaviorally equivalent and far cheaper.
while not workQueue.empty():
    time.sleep(0.5)

# Tell the workers it is time to exit
exitFlag = 1

# Wait for all worker threads to finish
for t in threads:
    t.join()
print ("退出主线程")
本文参与 腾讯云自媒体分享计划,分享自作者个人站点/博客。
原始发表:2018年04月19日,如有侵权请联系 cloudcommunity@tencent.com 删除

本文分享自 作者个人站点/博客 前往查看

如有侵权,请联系 cloudcommunity@tencent.com 删除。

本文参与 腾讯云自媒体分享计划  ,欢迎热爱写作的你一起参与!

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档