本人在使用 httpclient 的过程中，突然想到可以用它爬取一些数据，比如全国的中学名称。这个想法并非凭空而来：之前我也做过这方面的爬虫，不过当时是基于 selenium 写的 UI 脚本，效率非常低，而且很不稳定，所以这次改用直接请求接口的方式，效率果然提升了几个档次。一共 6 万多条数据，用了 16 分钟左右，其中包括写入数据库的时间。现在分享代码供大家参考。关键信息已隐去，大家看一下思路就好。
1package practise;
2
3import java.util.ArrayList;
4import java.util.HashMap;
5import java.util.List;
6import java.util.Map;
7import java.util.Set;
8import java.util.regex.Matcher;
9import java.util.regex.Pattern;
10import org.apache.http.client.methods.HttpGet;
11import net.sf.json.JSONObject;
12import source.ApiLibrary;
13import source.Concurrent;
14
15public class Crawler extends ApiLibrary {
16 public static String host = "";
17 public static Map<String, Integer> countrys = new HashMap<>();
18 public static Map<String, Integer> citys = new HashMap<>();
19 public static Map<String, Integer> address = new HashMap<>();
20 public static Map<String, Integer> school = new HashMap<>();
21 public static List<String> total = new ArrayList<>();
22
23 public static void main(String[] args) {
24 Crawler crawler = new Crawler();
25 crawler.getCountry1();// 省份
26 Set<String> countryId = countrys.keySet();
27 for (String name : countryId) {
28 int id = countrys.get(name);
29 crawler.getCountry2(id);// 市
30 Set<String> cityId = citys.keySet();
31 for (String city : cityId) {
32 int cid = citys.get(city);
33 crawler.getCountry3(cid);// 县
34 Set<String> adresss = address.keySet();
35 for (String adres : adresss) {
36 int aid = address.get(adres);
37 crawler.getCountry4(aid);// 名
38 Set<String> schol = school.keySet();
39 for (String sch : schol) {
40 String line = name + PART + city + PART + adres + PART + sch;
41 total.add(line);
42 }
43 }
44 }
45 }
46 Concurrent.saveRequestTimes(total);
47 testOver();
48 }
49
50 /**
51 * 查询省份
52 */
53 public void getCountry1() {
54 String url = host + "/user/editinfo/getSchollCountryList";
55 HttpGet httpGet = getHttpGet(url);
56 // httpGet.addHeader("Cookie", cookies);
57 // httpGet.addHeader("User-Agent", userangent);
58 JSONObject response = getHttpResponseEntityByJson(httpGet);
59 String[] country = response.getString("content").split("</a>");
60 int size = country.length;
61 for (int i = 0; i < size; i++) {
62 String msg = country[i];
63 int code = getCode(msg);
64 String name = getName(msg);
65 countrys.put(name, code);
66 }
67 }
68
69 /**
70 * 查询市
71 *
72 * @param id
73 */
74 public void getCountry2(int id) {
75 String url = host + "/user/editinfo/getSchollCityList?region_id=" + id;
76 HttpGet httpGet = getHttpGet(url);
77 JSONObject response = getHttpResponseEntityByJson(httpGet);
78 String[] ssString = response.getString("content").split("</a>");
79 int size = ssString.length;
80 citys.clear();
81 for (int i = 0; i < size; i++) {
82 String msg = ssString[i];
83 int code = getCode(msg);
84 String name = getName(msg);
85 citys.put(name, code);
86 }
87
88 }
89
90 /**
91 * 查询县
92 *
93 * @param id
94 */
95 public void getCountry3(int id) {
96 String url = host + "/user/editinfo/getSchollAddressList?region_id=" + id;
97 HttpGet httpGet = getHttpGet(url);
98 JSONObject response = getHttpResponseEntityByJson(httpGet);
99 String[] ssString = response.getString("content").split("</a>");
100 int size = ssString.length;
101 address.clear();
102 for (int i = 0; i < size; i++) {
103 String msg = ssString[i];
104 int code = getCode(msg);
105 String name = getName(msg);
106 address.put(name, code);
107 }
108 }
109
110 /**
111 * 查询学校
112 *
113 * @param id
114 */
115 public void getCountry4(int id) {
116 String url = host + "/user/editinfo/getSchoolNameList?region_id=" + id;
117 HttpGet httpGet = getHttpGet(url);
118 JSONObject response = getHttpResponseEntityByJson(httpGet);
119 String[] ssString = response.getString("content").split("</a>");
120 int size = ssString.length;
121 school.clear();
122 for (int i = 0; i < size; i++) {
123 String msg = ssString[i];
124 int code = getCode(msg);
125 String name = getName(msg);
126 school.put(name, code);
127 }
128 }
129
130 /**
131 * 获取 code
132 *
133 * @param text
134 * @return
135 */
136 public int getCode(String text) {
137 int code = 0;
138 Pattern pattern = Pattern.compile("\"\\d+\"");
139 Matcher matcher = pattern.matcher(text);
140 if (matcher.find()) {
141 code = changeStringToInt(matcher.group(0).replace("\"", ""));
142 }
143 return code;
144 }
145
146 /**
147 * 获取名称
148 *
149 * @param text
150 * @return
151 */
152 public String getName(String text) {
153 String name = text.substring(text.lastIndexOf(">") + 1, text.length());
154 return name;
155 }
156
157}
下面是爬取到的数据截图。