最近做了一个“历史上的今天”爬虫程序,数据源与历史天气的数据源一致,数据量比较小,几十秒就爬完了。中间遇到了一些问题,一起分享出来供大家参考。本项目的源码和相关数据已经上传到 GitHub,有兴趣的朋友可以去看看,会不定期更新。
git传送门:https://github.com/Fhaohaizi/fan
/**
 * Entry point: sets DEFAULT_CHARSET to GBK, then calls getInfo once for every
 * calendar day of the year (February is counted with 29 days) using an "MM-dd"
 * date string, and finally calls testOver().
 */
static void main(String[] args) {
    DEFAULT_CHARSET = GBK

    // Last day of each month, January first. February is 29 so the leap day is crawled too.
    def lastDayOfMonth = [31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]

    for (int monthNo = 1; monthNo <= 12; monthNo++) {
        // Single-digit values are left-padded with "0"; EMPTY is presumably the empty
        // string and is only used to turn the number into a String — confirm.
        def monthText = monthNo < 10 ? "0" + monthNo : monthNo + EMPTY
        for (int dayNo = 1; dayNo <= lastDayOfMonth[monthNo - 1]; dayNo++) {
            def dayText = dayNo < 10 ? "0" + dayNo : dayNo + EMPTY
            getInfo(monthText + "-" + dayText)
        }
    }
    testOver()
}
/**
 * Downloads the "history of today" data for one day, splits it into individual
 * entries and sends one INSERT statement per entry to MySqlTest.sendWork.
 *
 * @param date the day to fetch, in "MM-dd" form; the dash is removed when building the URL
 * @return the result of iterating the top-level JSON key set (unchanged from the original)
 */
static getInfo(String date) {
    def url = "http://tools.***.com/his/" + date.replace("-", EMPTY) + "_c.js"
    // The payload is not directly parseable as JSON: the first 8 characters are dropped
    // (presumably a JS assignment prefix — confirm) and separators/whitespace/markers are stripped.
    def all = FanRequest.isGet()
            .setUri(url)
            .getResponse()
            .getString("content")
            .substring(8)
            .replace(";", EMPTY)
            .replaceAll("( )+", EMPTY)
            .replaceAll("\\t", EMPTY)
            .replace("##", EMPTY)
            .replaceAll(SPACE_1, EMPTY)
    def json = JSONObject.fromObject(all)

    // Values are spliced into double-quoted SQL string literals, so backslashes and double
    // quotes must be escaped or a scraped title/content containing them breaks the statement.
    // NOTE(review): assumes MySQL's default backslash-escape mode (no NO_BACKSLASH_ESCAPES) — confirm.
    def escapeSql = { String value -> value.replace("\\", "\\\\").replace("\"", "\\\"") }

    json.keySet().each { key ->
        def s = json.get(key).toString()
        def entries = Regex.regexAll(s, "\\{\"title.+?\\}")
        // Iterate the list itself: the former `0..size() - 1` index range became the reverse
        // range 0..-1 for an empty result and then failed on get(0).
        entries.each { info ->
            def inf = JSONObject.fromObject(info.toString())
            def title = inf.getString("title")
            def keyword = inf.getString("keyword")
            // NOTE(review): space runs were already removed from the whole payload above, so this
            // replace looks redundant unless a different whitespace character was intended — confirm.
            def content = inf.getString("content").replace(" ", EMPTY)
            def alt = inf.getString("alt")
            String sql = "INSERT INTO today_histroy (date,title,keyword,content,alt) VALUES (\"%s\",\"%s\",\"%s\",\"%s\",\"%s\");"
            // The stored date is the top-level JSON key joined to the "MM-dd" date with a dash.
            sql = String.format(sql,
                    escapeSql(key + "-" + date),
                    escapeSql(title),
                    escapeSql(keyword),
                    escapeSql(content),
                    escapeSql(alt))
            MySqlTest.sendWork(sql)
        }
    }
}