Here's the code:
'''
This script scrapes English names from
https://www.babyment.com/yingwenming/kaitou.php?start_letter=A&page=1
'''
import csv
import random
import time

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
# Read the proxy IP list from a file
def get_ip_list():
    with open('IP.txt', 'r') as f:
        ip_list = f.readlines()
    return ip_list
# Pick a random proxy from the IP list
def get_random_ip(ip_list):
    proxy_ip = random.choice(ip_list).strip('\n')
    proxies = {'http': proxy_ip}
    return proxies
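# Assumed format of IP.txt (one proxy per line); requests expects the
# scheme in the proxy URL, so bare host:port entries may need an
# "http://" prefix prepended:
#   http://123.45.67.89:8080
#   http://98.76.54.32:3128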
# Append one row to a CSV file
def write_file(filePath, row):
    with open(filePath, 'a+', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter='|', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(row)
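# To read the output back later, use the same '|' delimiter, e.g.:
#   with open('A-Z.csv', encoding='utf-8', newline='') as f:
#       for row in csv.reader(f, delimiter='|'):
#           print(row[0])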
# Parse one page of the babyment site
def get_EnNames_list(url, ip_list):
    print('Fetching url: {}'.format(url))
    # Use a random User-Agent and proxy per request to avoid getting the IP banned
    headers = {'User-Agent': str(UserAgent().random)}
    proxies = get_random_ip(ip_list)
    try:
        req = requests.get(url=url, headers=headers, proxies=proxies, timeout=10)
    except requests.RequestException:
        print('Request failed, retrying in 10 seconds')
        time.sleep(10)
        headers = {'User-Agent': str(UserAgent().random)}
        proxies = get_random_ip(ip_list)
        req = requests.get(url=url, headers=headers, proxies=proxies, timeout=10)
    # When using find_all(), take care to locate elements precisely
    soup = BeautifulSoup(req.text, 'lxml')
    content = soup.find('table', class_='table')
    if content is None:
        # No results table on this page; treat it as the end of this letter
        return []
    content = content.find('tbody')
    content = content.find_all('tr')
    name = []
    # A list has no find_all() method, so loop over the rows
    for each in content:
        name.append(each.find_all('b')[0].get_text())
    return name
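# The parsing above assumes the page markup looks roughly like this
# (my reading of the selectors, not verified against the live site):
#   <table class="table">
#     <tbody>
#       <tr><td><b>Aaron</b> ...</td></tr>
#     </tbody>
#   </table>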
# Scrape all of the site's English names for one starting letter
def get_EnNames(letter, ip_list):
    for number in range(1, 100):
        url = 'https://www.babyment.com/yingwenming/kaitou.php?start_letter={}&page={}'.format(letter, number)
        # Fetch the names we need, page by page
        name = get_EnNames_list(url, ip_list)
        # Past the last page, name comes back empty; use that as the
        # signal to move on to the next letter
        if not name:
            print('Names starting with {} filled {} pages'.format(letter, number - 1))
            break
        for each in name:
            # Write each name as its own one-element row, so every name lands on its own line
            write_file('A-Z.csv', [each])
if __name__ == "__main__":
    ip_list = get_ip_list()
    for letter in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ':
        get_EnNames(letter, ip_list)
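One weak point above: if the retry request inside get_EnNames_list fails a second time, the whole run crashes. A minimal sketch of a bounded retry helper that could replace that try/except block (the name fetch_with_retry and the retry counts are my own, not part of the original script):

def fetch_with_retry(url, ip_list, retries=3, wait=10):
    # Rotate the User-Agent and proxy on every attempt; give up after `retries` tries
    for attempt in range(retries):
        headers = {'User-Agent': str(UserAgent().random)}
        proxies = get_random_ip(ip_list)
        try:
            return requests.get(url=url, headers=headers, proxies=proxies, timeout=10)
        except requests.RequestException:
            print('Attempt {} failed, retrying in {} seconds'.format(attempt + 1, wait))
            time.sleep(wait)
    raise RuntimeError('All {} attempts failed for {}'.format(retries, url))

Inside get_EnNames_list the whole try/except would then shrink to a single line: req = fetch_with_retry(url, ip_list).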