今天主要给大家介绍一下Selenium测试工具。它是一款专门用于浏览器测试的工具,能够模拟用户在浏览器中的鼠标点击、页面滑动等操作。它一方面可以用于网页测试,另一方面可以辅助网页内容爬取(例如需要将页面滑动到底部才会通过Ajax自动加载的内容,或需要点击Button才能下载的内容等)。
本文主要介绍Selenium辅助海量基金数据的获取。以腾讯证券为例进行说明(网址:http://stockhtm.finance.qq.com/fund/jzzx/index.htm),对应页面如下所示:
通过上图可以看到,我们需要在页面中输入历史数据对应的日期,并点击“输出到EXCEL”按钮,才能导出对应的历史数据。因此需要使用Selenium完成两步操作:(1)找到历史数据的日期输入框(text控件),输入对应的日期;(2)找到“输出到EXCEL”的Button控件名,并点击该Button控件。
from selenium.webdriver.common.by import By  # locator strategies for find_element

# Locate the date input box by its "name" attribute. find_element(By.NAME, ...)
# is used instead of find_element_by_name because the latter was removed in
# Selenium 4.3, while the generic form works on old and new releases alike.
elem = driver.find_element(By.NAME, "textfield")
elem.clear()  # wipe whatever is currently typed in the box
elem.send_keys(date)  # `date` is the date string to query
# Move to the button named "Submit01" and click it to confirm the entered date.
elem1 = driver.find_element(By.NAME, "Submit01")
ActionChains(driver).move_to_element(elem1).click(elem1).perform()
# Move to the button named "Submit02" and click it to export the data to Excel.
elem2 = driver.find_element(By.NAME, "Submit02")
ActionChains(driver).move_to_element(elem2).click(elem2).perform()
3. 数据可视化:使用百度的ECharts工具(代码中通过pyecharts库调用),对获取的数据进行可视化展示。
(1)所有基金增长每日总量的变化趋势图:可以看出八月份(最后三十天)基金大幅度增长
(2)每天增长的基金数量占所有基金比例的变化趋势:可以看出在第80~144天期间,超过半数的基金在上涨。
(3)每天所有基金价格增长的总和如下:
代码附录:
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import calendar

# Build the list of query dates as "YYYYMMDD" strings, one per calendar day
# from 2017-03-01 through 2017-08-31 (inclusive).
# calendar.monthrange supplies each month's length and "%02d" zero-pads the
# month and day, so the loop stays correct if the month range is widened.
# (A hand-written 30/31-day table combined with a fixed "20170" prefix is only
# right for single-digit months other than February.)
YEAR = 2017
FIRST_MONTH, LAST_MONTH = 3, 8

dates = []
for month in range(FIRST_MONTH, LAST_MONTH + 1):
    days_in_month = calendar.monthrange(YEAR, month)[1]
    for day in range(1, days_in_month + 1):
        dates.append("%d%02d%02d" % (YEAR, month, day))
from selenium.webdriver.common.by import By  # locator strategies for find_element

# Open the fund net-value page in Chrome and, for every date in `dates`,
# type the date, confirm it, and click the export button.
driver = webdriver.Chrome()
driver.get("http://stockhtm.finance.qq.com/fund/jzzx/index.htm")
for dat in dates:
    # find_element(By.NAME, ...) replaces find_element_by_name, which was
    # removed in Selenium 4.3; the generic form also works on older releases.
    # The date box is looked up on every pass rather than once before the
    # loop, so a page refresh after a submit cannot leave a stale reference.
    elem = driver.find_element(By.NAME, "textfield")
    elem.clear()
    elem.send_keys(dat)
    # Move to the button named "Submit01" and click it.
    elem1 = driver.find_element(By.NAME, "Submit01")
    ActionChains(driver).move_to_element(elem1).click(elem1).perform()
    # Move to the button named "Submit02" and click it.
    elem2 = driver.find_element(By.NAME, "Submit02")
    ActionChains(driver).move_to_element(elem2).click(elem2).perform()
# -*- coding: utf-8 -*-
import os
import numpy as np
import pandas as pd
from pyecharts import Line
def Getfile(dirName):
    '''
    Return the names of the entries in ``dirName``, sorted ascending.

    ``os.listdir`` yields entries in arbitrary order. Sorting makes the
    result deterministic, which matters when the position of a file in the
    returned list is later used as its index on a chart axis.

    Args:
        dirName: Path of the directory to list.

    Returns:
        A list of entry names (not full paths) in sorted order.

    Raises:
        OSError: If ``dirName`` does not exist or cannot be read.
    '''
    return sorted(os.listdir(dirName))
def ReName(path, files, prefix_len=9):
    '''
    Rename files inside ``path`` by dropping a fixed-length name prefix.

    Each name in ``files`` is renamed to the same name without its first
    ``prefix_len`` characters. The directory used is the ``path`` argument
    itself, so the function works for any directory, with or without a
    trailing separator.

    Args:
        path: Directory that contains the files.
        files: Iterable of file names (relative to ``path``).
        prefix_len: Number of leading characters to strip; defaults to 9.

    Returns:
        None. Files are renamed on disk.

    Raises:
        OSError: If a rename fails (for example, the source is missing).
    '''
    for f in files:
        new_name = f[prefix_len:]
        if not new_name or new_name == f:
            # Nothing would remain (the target would be the directory itself)
            # or nothing would change, so leave this file untouched.
            continue
        os.rename(os.path.join(path, f), os.path.join(path, new_name))
def PlotLine(line, datx, daty, Marker):
    '''
    Add one data series to an existing line chart.

    The x and y data are converted to plain lists before being handed to
    ``line.add``, so any iterable (a ``range`` object on Python 3, a
    generator, an array) can be passed by the caller.

    Args:
        line: Chart object exposing ``add(name, x, y, mark_point=...)``;
            the callers in this file pass a pyecharts ``Line``.
        datx: Iterable of x-axis values.
        daty: Iterable of y-axis values.
        Marker: Series name, passed as the first argument of ``line.add``.

    Returns:
        None. The series is registered through ``line.add``.
    '''
    line.add(Marker, list(datx), list(daty), mark_point=["average", "max"])
def CalIncRate(NewFiles, dir_name=None):
    '''
    Compute, per file, the share of rising funds and the summed increase.

    Every file is parsed with ``pandas.read_html`` and only its first table
    is used. The first row of that table is skipped. Column 5 is read as a
    per-fund rate (counted as "rising" when > 0) and column 4 as a per-fund
    value that is summed.
    NOTE(review): columns 4/5 presumably hold the daily increase value and
    increase rate of the exported sheet -- confirm against a real export.

    Args:
        NewFiles: Iterable of file names, one file per day.
        dir_name: Directory that contains the files. When omitted, the
            module-level ``dirName`` is used.

    Returns:
        A tuple ``(dayInc, dayIncVal, cumulative)`` where ``dayInc`` is the
        list of per-file fractions of rows with a positive rate,
        ``dayIncVal`` is the list of per-file sums of column 4, and
        ``cumulative`` is ``numpy.cumsum(dayIncVal)``.

    Raises:
        ValueError: If a cell in the used columns cannot be converted to
            ``float``.
    '''
    if dir_name is None:
        # Keep working for callers that rely on the module-level directory.
        dir_name = dirName
    dayInc = []
    dayIncVal = []
    for mk, f in enumerate(NewFiles, 1):
        # Progress indicator; print(...) with one argument behaves the same
        # on Python 2 and Python 3.
        print(mk)
        table = pd.read_html(os.path.join(dir_name, f))[0]
        rates = table[5].iloc[1:]   # skip the first row
        values = table[4].iloc[1:]  # skip the first row
        dayIncVal.append(sum(float(v) for v in values))
        rising = sum(1 for rate in rates if float(rate) > 0)
        total = len(rates)
        # A table with no data rows counts as "no fund rose" instead of
        # dividing by zero.
        dayInc.append(rising / float(total) if total else 0.0)
    return dayInc, dayIncVal, np.cumsum(dayIncVal)
def CalIncNum(NewFiles):
    '''
    Placeholder for computing each day's total fund increase points.

    The argument is currently ignored; a new empty list is returned on
    every call.
    '''
    return []
# Directory that holds the per-day fund files to analyse.
dirName = u'./股票型基金/'
NewFiles = Getfile(dirName)
dayInc, dayIncVal, dayIncCum = CalIncRate(NewFiles)

# One entry per chart: (title, y-values, series name, output file).
# Every chart uses the position of the file in the list as its x value.
charts = [
    ("每日增长基金占比", dayInc, "比例", 'h1.html'),
    ("每日基金增长总量", dayIncVal, "增长量", 'h2.html'),
    ("基金累计增长总量", list(dayIncCum), "增长累积量", 'h3.html'),
]
for title, series, label, out_file in charts:
    line = Line(title, "3月-8月")
    PlotLine(line, range(len(dayInc)), series, label)
    line.show_config()
    line.render(out_file)