前段时间有一件事——“格力举报奥克斯空调质量”。我看了一下京东上这两家的店铺,感觉很有意思:看着就觉得奥克斯空调的选购指数高很多。所以就尝试爬一下看看,当作练手的小demo。
jd页面的数据绝大多数是通过Ajax请求获取的。我用浏览器调试工具(F12)发现这些Ajax很复杂,多层调用,并且关键数据做了混淆——直接请求Ajax链接返回的数据,还需要通过特定的JS处理才能得到原有数据。我在这里被卡了很久,最后通过HtmlUnit(带JS解析器,可以爬取动态页面)解决了。
主要是想爬格力和奥克斯空调各型号的选购指数,顺带把商品标题、价格、评论人数、店铺等也都爬了一遍。
由于一个系列有多个型号(不同匹数),而同一系列的选购指数差别不大,所以不用每个型号都爬。
jdk1.8
maven
mysql
SpringBoot
根据分析和结合实际,我们创建如下表
-- Table that stores the crawled JD product rows, one row per saved item.
-- `sku` carries a secondary index; the crawler queries by sku before saving
-- so that items already present are not inserted again.
-- All data columns are nullable; only the auto-increment `id` is required.
CREATE TABLE `jd_item` (
`id` bigint(10) NOT NULL AUTO_INCREMENT COMMENT '主键id',
`spu` bigint(15) DEFAULT NULL COMMENT '商品集合id',
`sku` bigint(15) DEFAULT NULL COMMENT '商品最小品类单元id',
`title` varchar(100) DEFAULT NULL COMMENT '商品标题',
`price` double DEFAULT NULL COMMENT '商品价格',
`url` varchar(200) DEFAULT NULL COMMENT '商品详情地址',
`created` datetime DEFAULT NULL COMMENT '创建时间',
`updated` datetime DEFAULT NULL COMMENT '更新时间',
`comment` double DEFAULT NULL COMMENT '评价人数',
`score` double DEFAULT NULL COMMENT '选购指数',
`shop` varchar(100) DEFAULT NULL COMMENT '选购店铺',
PRIMARY KEY (`id`),
KEY `sku` (`sku`) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 COMMENT='京东商品表';
使用Spring Boot+Spring Data JPA和定时任务进行开发,HtmlUnit获取动态网页,Jsoup解析页面。
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build for the JD crawler demo. Dependencies without an explicit <version>
     rely on the version management inherited from spring-boot-starter-parent. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.0.2.RELEASE</version>
</parent>
<groupId>cn.itcast</groupId>
<artifactId>itcast-crawler-jd</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<!-- Spring MVC / web starter -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<!-- Spring Data JPA starter (repository layer) -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-jpa</artifactId>
</dependency>
<!-- MySQL JDBC driver -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
</dependency>
<!-- Apache HttpClient -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
</dependency>
<!-- HtmlUnit: headless browser used to load the JavaScript-rendered pages -->
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.32</version>
</dependency>
<!-- Jsoup: HTML parsing and CSS-selector queries -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.3</version>
</dependency>
<!-- Utility library (Apache Commons Lang 3) -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
</dependency>
<!-- JUnit. NOTE(review): no <scope> is declared here, so unless the parent POM
     supplies one it lands on the compile classpath; confirm whether test scope is intended. -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
</dependency>
</dependencies>
</project>
#DB Configuration:
# Local MySQL instance, schema "crawler" (the jd_item table is expected to live there).
# NOTE(review): the root/root credentials are hard-coded; acceptable for a local demo,
# but they should be externalized before this is used anywhere shared.
spring.datasource.driverClassName=com.mysql.jdbc.Driver
spring.datasource.url=jdbc:mysql://127.0.0.1:3306/crawler
spring.datasource.username=root
spring.datasource.password=root
#JPA Configuration:
# show-sql presumably echoes the generated SQL statements to the console; verify against the Spring Boot reference.
spring.jpa.database=MySQL
spring.jpa.show-sql=true
package cn.itcast.jd.pojo;
import javax.persistence.*;
import java.util.Date;
/**
 * JPA entity mapped to the {@code jd_item} table: one crawled JD product.
 * <p>
 * The CSS selectors quoted in the field comments show where each value is located
 * on the JD product list page.
 */
@Entity
@Table(name = "jd_item")
public class Itempojo {
// Primary key, generated by the database (identity strategy)
@Id
@GeneratedValue(strategy = GenerationType.IDENTITY)
private Long id;
// SPU - standard product unit (the product group); list page: #plist > ul > li:nth-child(26) > div
private Long spu;
// SKU - stock keeping unit (the smallest product variant); list page: #plist > ul > li:nth-child(26) > div
private Long sku;
// Product title; list page: #plist > ul > li:nth-child(26) > div > div.p-name > a > em
private String title;
// Product price; list page: #plist > ul > li:nth-child(26) > div > div.p-price > strong:nth-child(1) > i
private Double price;
// Product detail page URL; list page: #plist > ul > li:nth-child(26) > div > div.p-name > a
private String url;
// Creation time
private Date created;
// Last update time
private Date updated;
// Number of reviewers; list page: #plist > ul > li:nth-child(26) > div > div.p-commit > strong > a
private Double comment;
// Purchase index ("选购指数"); list page: li > div > div.p-commit > span > em
private Double score;
// Shop selling the product; list page: #plist > ul > li:nth-child(26) > div > div.p-shop > span > a
private String shop;
public Long getId() {
return id;
}
public void setId(Long id) {
this.id = id;
}
public Long getSpu() {
return spu;
}
public void setSpu(Long spu) {
this.spu = spu;
}
public Long getSku() {
return sku;
}
public void setSku(Long sku) {
this.sku = sku;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public Double getPrice() {
return price;
}
public void setPrice(Double price) {
this.price = price;
}
public String getUrl() {
return url;
}
public void setUrl(String url) {
this.url = url;
}
public Date getCreated() {
return created;
}
public void setCreated(Date created) {
this.created = created;
}
public Date getUpdated() {
return updated;
}
public void setUpdated(Date updated) {
this.updated = updated;
}
public Double getComment() {
return comment;
}
public void setComment(Double comment) {
this.comment = comment;
}
public Double getScore() {
return score;
}
public void setScore(Double score) {
this.score = score;
}
public String getShop() {
return shop;
}
public void setShop(String shop) {
this.shop = shop;
}
}
/**
 * Spring Data JPA repository for {@link Itempojo} entities with a {@code Long} primary key.
 * It declares no methods of its own; the operations used by the service layer
 * are the ones inherited from {@code JpaRepository}.
 */
public interface ItemDao extends JpaRepository<Itempojo,Long> {
}
/**
 * Persistence operations the crawler needs for {@link Itempojo} items.
 */
public interface ItemService {

    /**
     * Stores the given item.
     *
     * @param itempojo the item to persist
     */
    void save(Itempojo itempojo);

    /**
     * Looks up stored items using the given item as the query condition.
     *
     * @param itempojo the item whose populated values act as the search criteria
     * @return the matching items
     */
    List<Itempojo> findAll(Itempojo itempojo);
}
/**
 * {@link ItemService} implementation that delegates straight to {@link ItemDao}.
 */
@Service
public class ItemServiceImpl implements ItemService {

    @Autowired
    private ItemDao itemDao;

    /**
     * Persists the item inside a transaction.
     *
     * @param itempojo the item to persist
     */
    @Transactional
    @Override
    public void save(Itempojo itempojo) {
        itemDao.save(itempojo);
    }

    /**
     * Runs a query-by-example: the given item is wrapped in an {@code Example}
     * and handed to the repository as the search condition.
     *
     * @param itempojo the probe item
     * @return the items the repository returns for that example
     */
    @Override
    public List<Itempojo> findAll(Itempojo itempojo) {
        return itemDao.findAll(Example.of(itempojo));
    }
}
/**
 * Spring Boot entry point of the crawler.
 * <p>
 * Scheduled tasks only run once scheduling has been switched on, which is what
 * the {@code @EnableScheduling} annotation on this class is for.
 */
@EnableScheduling
@SpringBootApplication
public class Application {

    /**
     * Boots the Spring application context.
     *
     * @param args command-line arguments, passed through to Spring Boot
     */
    public static void main(String[] args) {
        SpringApplication.run(Application.class, args);
    }
}
/**
 * Loads web pages through HtmlUnit so that JavaScript/AJAX-rendered content is
 * present in the returned markup.
 */
@Component
public class Httputils {
    public Httputils() {
    }

    /**
     * Loads the page at the given URL with a simulated Chrome browser, lets its
     * background JavaScript run, and returns the resulting DOM serialized as XML.
     *
     * @param str the URL of the page to load
     * @return the rendered page as an XML string, or an empty string when the page
     *         could not be loaded (the failure is printed to stderr)
     */
    public String doGetHtml(String str) {
        // try-with-resources closes the client (and its windows) on every path,
        // but only after the page has been serialized.
        try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
            // Do not fail on JavaScript errors raised by the page's own scripts.
            webClient.getOptions().setThrowExceptionOnScriptError(false);
            // Do not fail on non-200 HTTP status codes.
            webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
            webClient.getOptions().setActiveXNative(false);
            // The page is never displayed, so CSS processing is not needed.
            webClient.getOptions().setCssEnabled(false);
            // Essential: the data of interest is produced by JavaScript.
            webClient.getOptions().setJavaScriptEnabled(true);
            // Essential: enables AJAX support.
            webClient.setAjaxController(new NicelyResynchronizingAjaxController());
            // (cookie setup was removed from the published source)

            HtmlPage page = webClient.getPage(str);
            // Must run AFTER the page is loaded: before that there are no background
            // jobs to wait for and the call returns immediately. Blocks for at most 15 s.
            webClient.waitForBackgroundJavaScript(15000);
            return page.asXml();
        } catch (Exception e) {
            // Best effort: report the failure and hand back an empty document instead
            // of dereferencing a null page.
            e.printStackTrace();
            return "";
        }
    }
}
以下爬取逻辑都写在一个类里了。本来应该按模块化开发分开写的,但这个小Demo只是练手,就合在一起了。
/**
 * Scheduled crawler for the JD air-conditioner list pages of one brand.
 * <p>
 * For every product found on a list page it extracts SPU, SKU, title, price,
 * reviewer count, purchase index and shop, and stores the product unless a row
 * with the same SKU already exists. Values missing from the list page are looked
 * up on the product detail page.
 */
@Component
public class ItemTask {
    @Autowired
    private Httputils httputils;
    @Autowired
    private ItemService itemService;
    // Not referenced anywhere in this class.
    private static final ObjectMapper MAPPER = new ObjectMapper();

    /** Console marker printed whenever a value cannot be read from a page. */
    private static final String DATA_ERROR = "数据异常";

    /**
     * Crawls list pages 1 to 5 of the configured brand, pausing 5 seconds before
     * each page. The next run starts 27.5 s (11 * 2500 ms) after this one finishes.
     *
     * @throws Exception if the thread is interrupted while sleeping or a page cannot be processed
     */
    @Scheduled(fixedDelay = 11 * 2500)
    public void itemTask() throws Exception {
        // Base list URL; the page number is appended.
        // Gree: https://list.jd.com/list.html?cat=737,794,870&ev=exbrand_7420&page=
        String url = "https://list.jd.com/list.html?cat=737,794,870&ev=exbrand_3659&page=";// AUX
        for (int i = 1; i < 6; i++) {
            Thread.sleep(5000);
            String html = httputils.doGetHtml(url + i);
            System.out.println("测试" + i);
            // Parse the page and store the products found on it.
            this.parse(html);
        }
        System.out.println("手机数据抓取完成!");
    }

    /**
     * Parses one list page and saves every product that is not stored yet.
     * Products whose SPU/SKU attributes are not numeric are skipped; a price,
     * reviewer count or purchase index that cannot be read is stored as 0.0.
     *
     * @param html the rendered list page markup
     * @throws Exception if storing a product fails
     */
    private void parse(String html) throws Exception {
        Document doc = Jsoup.parse(html);
        Elements spuEles = doc.select("li.gl-item > div");
        for (Element spuEle : spuEles) {
            long spu;
            long sku;
            try {
                spu = Long.parseLong(spuEle.attr("data-sku_temp"));
                sku = Long.parseLong(spuEle.attr("data-active-sku"));
            } catch (NumberFormatException e) {
                // One malformed entry must not abort the whole crawl.
                System.out.println(DATA_ERROR);
                continue;
            }

            Itempojo item = new Itempojo();
            item.setSku(sku);
            // Only the SKU is set at this point, so this asks "is this SKU stored already?".
            List<Itempojo> list = this.itemService.findAll(item);
            if (list.size() > 0) {
                // Already stored: leave the existing row alone.
                continue;
            }

            item.setSpu(spu);
            item.setUrl("https://item.jd.com/" + sku + ".html");
            item.setTitle(spuEle.select("div.p-name > a > em").text());

            // Price; the page may show "暂无报价" (no quote available) instead of a number,
            // or lack the price element altogether.
            Element priceEle = spuEle.select("div.p-price > strong > i").first();
            Double price = null;
            if (priceEle != null) {
                price = parseNumber(priceEle.text());
            }
            if (price == null) {
                System.out.println(DATA_ERROR);
                item.setPrice(0.0);
            } else {
                item.setPrice(price);
            }

            // Crawl time.
            item.setCreated(new Date());
            item.setUpdated(item.getCreated());

            // Reviewer count, displayed as e.g. "5600+" or "1.2万+".
            Double comment = parseCount(spuEle.select("div.p-commit > strong > a").text());
            if (comment == null) {
                System.out.println(DATA_ERROR);
                item.setComment(0.0);
            } else {
                item.setComment(comment);
            }

            // Purchase index.
            Double score = parseNumber(spuEle.select("div.p-commit > span > em").text());
            if (score == null) {
                System.out.println(DATA_ERROR);
                item.setScore(0.0);
            } else {
                item.setScore(score);
            }

            item.setShop(spuEle.select("div.p-shop > span > a").text());

            // Fill in values the list page did not provide, then store the product.
            Itempojo itempojo = this.check(item);
            this.itemService.save(itempojo);
        }
    }

    /**
     * Fills in a missing reviewer count or purchase index from the product detail page.
     * A value counts as missing when it is {@code null} or 0.0. Each of the two values
     * is replaced only if the detail page yields a parseable number for it.
     *
     * @param item the item to complete; its URL must point to the detail page
     * @return the same item instance, possibly with updated comment/score
     */
    public Itempojo check(Itempojo item) {
        if (isMissing(item.getComment()) || isMissing(item.getScore())) {
            String page = httputils.doGetHtml(item.getUrl());
            Document document = Jsoup.parse(page);

            Double comment = parseCount(document.select("#comment-count > a").text());
            if (comment != null) {
                item.setComment(comment);
            }

            // Guarded by the score text itself (previously the comment text was tested here).
            Double score = parseNumber(document.select("#buy-rate > a").text());
            if (score != null) {
                item.setScore(score);
            }
        }
        return item;
    }

    /** Tells whether a crawled value is absent, i.e. {@code null} or the 0.0 placeholder. */
    private static boolean isMissing(Double value) {
        return value == null || value == 0.0;
    }

    /** Parses a plain decimal number; returns {@code null} for null, blank or non-numeric text. */
    private static Double parseNumber(String text) {
        if (text == null) {
            return null;
        }
        String trimmed = text.trim();
        if (trimmed.isEmpty()) {
            return null;
        }
        try {
            return Double.parseDouble(trimmed);
        } catch (NumberFormatException e) {
            return null;
        }
    }

    /**
     * Parses a displayed count such as "5600+", "1.2万+" or "200": an optional trailing
     * "+" is dropped and a trailing "万" multiplies the number by 10000.
     * Returns {@code null} when no number can be read.
     */
    private static Double parseCount(String text) {
        if (text == null) {
            return null;
        }
        String digits = text.trim();
        if (digits.endsWith("+")) {
            digits = digits.substring(0, digits.length() - 1);
        }
        double multiplier = 1;
        if (digits.endsWith("万")) {
            multiplier = 10000;
            digits = digits.substring(0, digits.length() - 1);
        }
        Double value = parseNumber(digits);
        if (value == null) {
            return null;
        }
        return value * multiplier;
    }
}
(此部分代码注释很多删除了,如需下载源码,请访问!!)
这篇文章太长了,就单单讲代码吧,关于数据、及数据分析的部分,就下一篇文章吧。