The problem
Shh... don't let too many people in on this!
Proxy IPs are a necessary expense in crawler work, so how do we make good use of the free proxy IPs that the various providers offer?
The approach
The idea: before crawling, first go to each provider and scrape their latest usable proxy IPs, drop them into an IP pool, and only then crawl the target site. Keep that pool maintained and a steady stream of fresh, usable proxies keeps flowing in; the remaining work is simply hunting around the web for more proxy sites.
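Before the demo, here is a minimal sketch of the pool-maintenance half of that idea: a thread-safe list that a background task refreshes from the provider at a fixed interval, and from which the crawler picks a proxy per request. The ProxyPool class, the five-minute interval, and the plain "ip:port" strings are illustrative assumptions, not part of the demo that follows.

import java.util.List;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.TimeUnit;
import java.util.function.Supplier;

public class ProxyPool {
    private final List<String> proxies = new CopyOnWriteArrayList<>();
    private final ScheduledExecutorService scheduler = Executors.newSingleThreadScheduledExecutor();

    /** Refresh the pool from the provider on a fixed schedule so dead proxies get replaced. */
    public void start(Supplier<List<String>> fetchLatestProxies) {
        scheduler.scheduleAtFixedRate(() -> {
            List<String> latest = fetchLatestProxies.get();
            if (latest != null && !latest.isEmpty()) {
                proxies.clear();
                proxies.addAll(latest);
            }
        }, 0, 5, TimeUnit.MINUTES);
    }

    /** Pick a random proxy ("ip:port") for the next request, or null if the pool is empty. */
    public String pick() {
        if (proxies.isEmpty()) {
            return null;
        }
        return proxies.get(ThreadLocalRandom.current().nextInt(proxies.size()));
    }
}

A CopyOnWriteArrayList is enough here because refreshes are infrequent and reads dominate.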
Here's a demo
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.util.ArrayList;
import java.util.List;
/**
* Demo: using free proxy IPs
*
* Created by zc on 2017/8/11.
*/
public class HttpClientUtilTest {
public static void main(String[] args) throws Exception {
//Step 1: scrape the proxy IP pool
List<ProxyModel> proxyModelList = spiderProxyIp();
//pick one proxy from the pool (index 2, so at least three high-anonymity entries are assumed)
ProxyModel chosen = proxyModelList.get(2);
String ip = chosen.getIp();
int port = chosen.getPort();
System.out.println(ip + " " + port + " " + chosen.getAnony());
//Step 2: add our own public IP to the provider's whitelist
whiteList();
//Step 3: request the target URL through the proxy
reqWeb(ip, port);
}
/**
* Request the target URL through the proxy
*
* @param ip   proxy IP
* @param port proxy port
* @throws Exception on error
*/
private static void reqWeb(String ip, int port) throws Exception {
HttpClientBuilder builder = HttpClients.custom();
HttpHost proxy = new HttpHost(ip, port);
CloseableHttpClient client = builder.setProxy(proxy).build();
String url = "http://write.blog.csdn.net/postedit/77099632";
HttpGet request = new HttpGet(url);
CloseableHttpResponse response = client.execute(request);
HttpEntity entity = response.getEntity();
System.out.println(EntityUtils.toString(entity));
}
/**
* Add our own public IP to the Xdaili (讯代理) whitelist
*
* @throws Exception on error
*/
private static void whiteList() throws Exception {
String url = "http://www.xdaili.cn/ipagent/whilteList/addIp?spiderId=dce0442efaac42618205f177c2xxxx&ip=xx.xx.xx.xx";
HttpGet request = new HttpGet(url);
CloseableHttpClient client = HttpClients.custom().build();
CloseableHttpResponse response = client.execute(request);
System.out.println(response.getStatusLine());
}
/**
* Scrape the Xdaili free proxy IP list
*
* @return list of proxies
* @throws Exception on error
*/
private static List<ProxyModel> spiderProxyIp() throws Exception {
List<ProxyModel> proxyModelList = new ArrayList<>();
String url = "http://www.xdaili.cn/ipagent//freeip/getFreeIps?page=1&rows=10";
HttpGet request = new HttpGet(url);
CloseableHttpClient client = HttpClients.custom().build();
CloseableHttpResponse response = client.execute(request);
HttpEntity entity = response.getEntity();
String resTxt = EntityUtils.toString(entity);
JSONObject jsonObject = JSON.parseObject(resTxt);
JSONArray rows = jsonObject.getJSONArray("rows");
//keep only the high-anonymity ("高匿") entries and map them into ProxyModel objects
rows.stream().map(v -> (JSONObject) v).filter(v -> "高匿".equals(v.getString("anony"))).forEach(v -> {
ProxyModel model = new ProxyModel();
model.setIp(v.getString("ip"));
model.setPort(Integer.parseInt(v.getString("port")));
model.setResponsetime(v.getString("responsetime"));
model.setAnony(v.getString("anony"));
proxyModelList.add(model);
});
return proxyModelList;
}
public static class ProxyModel {
private String ip;
private int port;
private String responsetime;
private String anony;
public String getIp() {
return ip;
}
public void setIp(String ip) {
this.ip = ip;
}
public int getPort() {
return port;
}
public void setPort(int port) {
this.port = port;
}
public String getResponsetime() {
return responsetime;
}
public void setResponsetime(String responsetime) {
this.responsetime = responsetime;
}
public String getAnony() {
return anony;
}
public void setAnony(String anony) {
this.anony = anony;
}
}
}
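One more practical point: free proxies die quickly, so it is worth probing a proxy before routing real traffic through it (and before keeping it in the pool). The sketch below is one way to do that with the same HttpClient API the demo uses; the ProxyChecker class, the http://httpbin.org/ip test URL, and the 3-second timeouts are illustrative assumptions, not part of the demo above. To build the demo itself you only need Apache HttpClient 4.x and Alibaba fastjson on the classpath.

import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

public class ProxyChecker {
    /** Returns true if a simple GET through the proxy succeeds within the timeout. */
    public static boolean isAlive(String ip, int port) {
        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(3000)
                .setSocketTimeout(3000)
                .build();
        HttpGet request = new HttpGet("http://httpbin.org/ip");
        request.setConfig(config);
        try (CloseableHttpClient client = HttpClients.custom()
                .setProxy(new HttpHost(ip, port))
                .build();
             CloseableHttpResponse response = client.execute(request)) {
            return response.getStatusLine().getStatusCode() == 200;
        } catch (Exception e) {
            return false;
        }
    }
}

Dropping pool entries for which isAlive returns false keeps the pool from handing out dead proxies.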