内容爬取完毕,校验完毕,缺失信息暂未统计。总数据720万条,地区3200个,年份从2011到2019,大小950MB。原始数据已丢失,需要的朋友可以自己运行脚本挂一晚上。中间遇到了很多坑,有机会我再写一篇博客,专门讲讲大量数据爬取时遇到的坑。
本人在使用基于Java的脚本语言Groovy做爬虫时,拿获取城市历史天气的任务做练习。数据源已隐藏,有需要的朋友我可以直接发数据。使用过程中虽然遇到了一些绊脚石,但总体体验还是很好的,脚本语言Groovy相比Java的确省事儿很多。分享代码,供大家参考。(城市对应的编码存放在一个js文件里,这里不贴出)
1package com.fan
2
3import com.fission.source.httpclient.ApiLibrary
4import com.fission.source.httpclient.FanRequest
5import com.fission.source.mysql.MySqlTest
6import com.fission.source.source.WriteRead
7import com.fission.source.utils.Log
8import net.sf.json.JSONException
9import net.sf.json.JSONObject
10
/**
 * Downloads historical daily weather records for a city, month by month, and
 * writes one row per day into the {@code weather} table.
 *
 * Helpers such as {@code output}, {@code getRandomInt}, {@code changeStringToInt}
 * and the constants {@code EMPTY} / {@code TEST_ERROR_CODE} are not defined in
 * this class; presumably they are inherited from {@code ApiLibrary}.
 */
class Weather extends ApiLibrary {

    /**
     * Fetches the data of one city for the years 2011 through 2018.
     *
     * The year 2019 is not part of this loop; call {@link #getCityYear} with
     * 2019 directly if that year is needed.
     *
     * @param cityId numeric city code used in the data-source URL
     */
    static getCityAll(int cityId) {
        for (int j in 2011..2018) {
            getCityYear(cityId, j)
            // Pause 1000 ms plus a random extra between years.
            sleep(1000 + getRandomInt(1000))
        }
    }

    /**
     * Fetches the data of one city for every month of the given year.
     *
     * For 2019 only months 1 through 9 are requested.
     *
     * @param cityId numeric city code used in the data-source URL
     * @param year   four-digit year
     */
    static getCityYear(int cityId, int year) {
        for (int i in 1..12) {
            // Months after September 2019 are skipped.
            if (year == 2019 && i > 9) continue
            getMonth(cityId, year, i)
            // Pause 1000 ms plus a random extra between months.
            sleep(1000 + getRandomInt(1000))
        }
    }

    /**
     * Fetches the data of one city for a single month and inserts every daily
     * record into the database.
     *
     * @param cityId numeric city code used in the data-source URL
     * @param year   four-digit year
     * @param month  month number, 1 through 12
     */
    static getMonth(int cityId, int year, int month) {
        def yyyymm;
        def uri;
        if (year > 2016) {
            // Numeric form, so the month is effectively zero-padded (e.g. 201701),
            // and the file lives in a per-month directory.
            yyyymm = year * 100 + month
            uri = "http://tianqi.***.com/t/wea_history/js/" + yyyymm + "/" + cityId + "_" + yyyymm + ".js"
        } else {
            // Year and month are joined as text, so the month is NOT zero-padded
            // (assuming EMPTY is the empty string, January 2011 becomes 20111).
            // NOTE(review): presumably this matches the site's older file naming - confirm before changing.
            yyyymm = year + EMPTY + month
            uri = "http://tianqi.***.com/t/wea_history/js/" + cityId + "_" + yyyymm + ".js"
        }
        output(uri)
        // The first 16 characters and every ';' are removed before JSON parsing;
        // presumably the payload is a JavaScript assignment wrapping the JSON - TODO confirm.
        def response = FanRequest.isGet()
                .setUri(uri)
                .getResponse()
                .getString("content")
                .substring(16)
                .replace(";", EMPTY)
        def weather = JSONObject.fromObject(response)
        def city = weather.getString("city")
        def array = weather.getJSONArray("tqInfo")
        output(array.size())
        // A plain index loop: the range 0..size-1 turns into the reverse range 0..-1
        // for an empty array and would then try to read element 0.
        for (int i = 0; i < array.size(); i++) {
            JSONObject info = array.get(i)
            // Entries without a date are skipped.
            if (!info.containsKey("ymd")) continue
            def date = info.getString("ymd")
            // Note: "bWendu" is stored in the low column and "yWendu" in the high column.
            def low = info.getString("bWendu").replace("℃", EMPTY)
            def high = info.getString("yWendu").replace("℃", EMPTY)
            def wea = info.getString("tianqi")
            def wind = info.getString("fengxiang")
            def fengli = info.getString("fengli")
            // Air-quality fields are optional; placeholders are stored when "aqi" is absent.
            def aqi = TEST_ERROR_CODE, aqiInfo = EMPTY, aqiLevel = TEST_ERROR_CODE;
            if (info.containsKey("aqi")) {
                aqi = info.getInt("aqi")
                aqiInfo = info.getString("aqiInfo")
                aqiLevel = info.getInt("aqiLevel")
            }
            String sql = "INSERT INTO weather (city,low,high,date,wind,windsize,weather,aqi,aqilevel,aqiinfo) VALUES (\"%s\",%d,%d,\"%s\",\"%s\",\"%s\",\"%s\",%d,%d,\"%s\");"
            // Text values come from a remote server, so they are escaped before being
            // placed inside the double-quoted SQL literals.
            sql = String.format(sql, escapeSqlText(city), changeStringToInt(low), changeStringToInt(high),
                    escapeSqlText(date), escapeSqlText(wind), escapeSqlText(fengli), escapeSqlText(wea),
                    aqi, aqiLevel, escapeSqlText(aqiInfo))
            output(sql)
            MySqlTest.sendWork(sql)
        }
    }

    /**
     * Escapes backslashes and double quotes so the text can sit inside a
     * double-quoted SQL string literal without terminating it.
     *
     * NOTE(review): assumes the server uses backslash escapes (MySQL default SQL
     * mode); a parameterized statement would be preferable if MySqlTest offers one.
     *
     * @param text raw text value
     * @return the text with {@code \} and {@code "} backslash-escaped
     */
    private static String escapeSqlText(CharSequence text) {
        return text.toString().replace("\\", "\\\\").replace("\"", "\\\"")
    }
}
这里是数据库的数据截图: