学习了HttpClient和Jsoup,就掌握了如何抓取数据和如何解析数据。但是HttpClient不会执行JavaScript,对动态加载的数据支持不是很友好,所以又学习了HtmlUnit,用于抓取动态数据。
主要目的是学习HtmlUnit和Jsoup。
爬取凤凰网、网易、搜狐、今日头条。
除了今日头条,其他页面的数据都是静态的,很好爬取。
由于技术有限,对今日头条的详情页面爬取还是有点技术上的问题,待解决。
根据需求分析,我们创建的表如下
-- auto-generated definition
-- Table that stores the crawled news articles.
CREATE TABLE news
(
-- Surrogate key generated by the database.
id INT AUTO_INCREMENT
PRIMARY KEY,
title VARCHAR(128) NULL,
-- NOTE(review): no UNIQUE index is declared on url; confirm whether duplicates must also be prevented at the database level.
url VARCHAR(256) NULL,
image VARCHAR(256) NULL,
-- NOTE(review): presumably create_date is when the row was stored and news_date is when the article was published; confirm.
create_date DATETIME NULL,
news_date DATETIME NULL,
-- Article body; the MyBatis mapper reads and writes it as LONGVARCHAR.
content TEXT NULL,
source VARCHAR(32) NULL
);
使用 `Spring Boot` 进行开发
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.3.2.RELEASE</version>
<relativePath/> <!-- lookup parent from repository -->
</parent>
<groupId>com.ray</groupId>
<artifactId>newscrawler</artifactId>
<version>0.0.1-SNAPSHOT</version>
<name>news-crawler</name>
<description>新闻爬虫</description>
<properties>
<java.version>1.8</java.version>
</properties>
<dependencies>
<!-- To run on Jetty, exclude spring-boot-starter-tomcat from spring-boot-starter-web, because Spring Boot uses Tomcat by default -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
<exclusions>
<exclusion>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-tomcat</artifactId>
</exclusion>
</exclusions>
</dependency>
<!-- Jetty is well suited to applications with long-lived connections, such as chat -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-jetty</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-devtools</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-redis</artifactId>
</dependency>
<dependency>
<groupId>org.mybatis.spring.boot</groupId>
<artifactId>mybatis-spring-boot-starter</artifactId>
<version>1.3.2</version>
</dependency>
<dependency>
<groupId>com.github.pagehelper</groupId>
<artifactId>pagehelper-spring-boot-starter</artifactId>
<version>1.2.5</version>
</dependency>
<!--<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.38</version>
</dependency>-->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>8.0.11</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>druid</artifactId>
<version>1.0.29</version>
</dependency>
<!-- Swagger 2 (springfox) core library -->
<dependency>
<groupId>io.springfox</groupId>
<artifactId>springfox-swagger2</artifactId>
<exclusions>
<exclusion>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
</exclusion>
</exclusions>
<version>2.9.2</version>
</dependency>
<!-- Swagger UI: browsable web page for the generated API documentation -->
<dependency>
<groupId>io.springfox</groupId>
<artifactId>springfox-swagger-ui</artifactId>
<version>2.9.2</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.13.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/net.sourceforge.htmlunit/htmlunit -->
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.42.0</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
</plugin>
</plugins>
</build>
</project>
# JDBC connection to the local MySQL schema "ray0804"; the URL pins the server time zone to UTC.
spring.datasource.url=jdbc:mysql://localhost:3306/ray0804?useUnicode=true&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=UTC
# NOTE(review): credentials are stored in plain text here; acceptable only for a local demo database.
spring.datasource.username=root
spring.datasource.password=root
spring.datasource.driver-class-name=com.mysql.cj.jdbc.Driver
# MyBatis configuration: mapper XML files are loaded from classpath:mapper/
mybatis.mapper-locations=classpath:mapper/*.xml
# PageHelper pagination configuration
pagehelper.auto-dialect=true
pagehelper.reasonable=true
pagehelper.support-methods-arguments=true
# Redis configuration (host and port below are commented out)
#spring.redis.host=192.168.80.222
#spring.redis.port=6379
# Entry-page URLs of the news sites to crawl
news.neteasy.url=http://news.163.com/rank/
news.toutiao.url=https://www.toutiao.com/ch/news_hot/
news.sohu.url=http://news.sohu.com/
news.ifeng.url=http://www.ifeng.com/
# logging
logging.level.root=info
/**
 * A crawled news article, corresponding to one row of the {@code news} table.
 *
 * <p>Besides the plain column properties, the class exposes two derived, read-only
 * properties computed from {@link #getContent() content}: {@link #getSummary()} and
 * {@link #getLargeImage()}. Both tolerate a {@code null} content, which is what rows
 * loaded without the content column carry.
 */
public class News {

    /** Placeholder returned by {@link #getImage()} when the article has no thumbnail. */
    private static final String DEFAULT_IMAGE = "/img/news.png";

    /** Maximum number of plain-text characters kept by {@link #getSummary()}. */
    private static final int SUMMARY_LENGTH = 140;

    private Integer id;
    private String title;
    private String url;
    private String image;
    private Date createDate;
    private Date newsDate;
    private String source;
    private String content;

    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    /**
     * Returns the thumbnail URL, or the placeholder {@code /img/news.png} when none is set.
     *
     * <p>NOTE(review): any framework that reads this property through the getter sees the
     * placeholder instead of {@code null}; confirm that is intended for persistence too.
     *
     * @return the stored image URL if it is not blank, otherwise the placeholder path
     */
    public String getImage() {
        return StringUtils.isNotBlank(image) ? image : DEFAULT_IMAGE;
    }

    public void setImage(String image) {
        this.image = image;
    }

    public Date getCreateDate() {
        return createDate;
    }

    public void setCreateDate(Date createDate) {
        this.createDate = createDate;
    }

    public Date getNewsDate() {
        return newsDate;
    }

    public void setNewsDate(Date newsDate) {
        this.newsDate = newsDate;
    }

    public String getSource() {
        return source;
    }

    public void setSource(String source) {
        this.source = source;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    /**
     * Builds a short plain-text teaser from the article body.
     *
     * <p>All markup is stripped from the content, the first 140 characters are kept
     * (or the whole text when it is shorter) and {@code "..."} is appended.
     *
     * @return the teaser, or an empty string when the article has no content
     */
    public String getSummary() {
        // Rows loaded without the content column have no body; avoid a NullPointerException.
        if (content == null) {
            return "";
        }
        String text = NewsUtils.getTextFromContent(content);
        // Short articles are kept whole: never cut past the end of the text.
        int end = Math.min(text.length(), SUMMARY_LENGTH);
        return text.substring(0, end) + "...";
    }

    /**
     * Returns the first image referenced inside the article body, falling back to the
     * raw thumbnail value (which may be {@code null}) when the body has no image.
     *
     * @return the image URL found in the content, otherwise the stored thumbnail
     */
    public String getLargeImage() {
        // No body means no embedded image; fall back instead of failing on null.
        if (content == null) {
            return image;
        }
        String largeImage = NewsUtils.getImageFromContent(content);
        return StringUtils.isNotBlank(largeImage) ? largeImage : image;
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append(getClass().getSimpleName());
        sb.append(" [");
        sb.append("Hash = ").append(hashCode());
        sb.append(", id=").append(id);
        sb.append(", title=").append(title);
        sb.append(", url=").append(url);
        sb.append(", image=").append(image);
        sb.append(", createDate=").append(createDate);
        sb.append(", newsDate=").append(newsDate);
        sb.append(", source=").append(source);
        sb.append(", content=").append(content);
        sb.append("]");
        return sb.toString();
    }
}
/**
 * Query-by-example object for the {@code news} table.
 *
 * <p>It collects groups of conditions ({@link Criteria}). The mapper XML joins the
 * conditions inside one group with "and" and joins the groups with "or". It can also
 * carry an order-by clause and a distinct flag.
 */
public class NewsExample {
// Spliced verbatim after "order by" by the mapper XML, so it must never hold untrusted input.
protected String orderByClause;
// When true the mapper adds "distinct" to the select.
protected boolean distinct;
// The condition groups; groups are combined with "or".
protected List<Criteria> oredCriteria;
public NewsExample() {
oredCriteria = new ArrayList<Criteria>();
}
public void setOrderByClause(String orderByClause) {
this.orderByClause = orderByClause;
}
public String getOrderByClause() {
return orderByClause;
}
public void setDistinct(boolean distinct) {
this.distinct = distinct;
}
public boolean isDistinct() {
return distinct;
}
public List<Criteria> getOredCriteria() {
return oredCriteria;
}
/** Registers an existing condition group as a further "or" alternative. */
public void or(Criteria criteria) {
oredCriteria.add(criteria);
}
/** Creates a new condition group, registers it as a further "or" alternative and returns it. */
public Criteria or() {
Criteria criteria = createCriteriaInternal();
oredCriteria.add(criteria);
return criteria;
}
/**
 * Creates a new condition group. It is registered automatically only when no group
 * exists yet; otherwise the caller has to add it through {@link #or(Criteria)}.
 */
public Criteria createCriteria() {
Criteria criteria = createCriteriaInternal();
if (oredCriteria.size() == 0) {
oredCriteria.add(criteria);
}
return criteria;
}
protected Criteria createCriteriaInternal() {
Criteria criteria = new Criteria();
return criteria;
}
/** Resets the example: removes all condition groups, the order-by clause and the distinct flag. */
public void clear() {
oredCriteria.clear();
orderByClause = null;
distinct = false;
}
/**
 * Base class holding the list of conditions of one group plus one fluent
 * {@code andXxx...} method per column and operator. Every such method appends one
 * {@link Criterion} and returns this object cast to {@link Criteria} for chaining.
 */
protected abstract static class GeneratedCriteria {
protected List<Criterion> criteria;
protected GeneratedCriteria() {
super();
criteria = new ArrayList<Criterion>();
}
/** A group is valid once it holds at least one condition; the mapper XML skips invalid groups. */
public boolean isValid() {
return criteria.size() > 0;
}
public List<Criterion> getAllCriteria() {
return criteria;
}
public List<Criterion> getCriteria() {
return criteria;
}
/** Adds a condition without a value (for example "id is null"); a null condition is rejected. */
protected void addCriterion(String condition) {
if (condition == null) {
throw new RuntimeException("Value for condition cannot be null");
}
criteria.add(new Criterion(condition));
}
/** Adds a condition with one value (a single value or a list); a null value is rejected. */
protected void addCriterion(String condition, Object value, String property) {
if (value == null) {
throw new RuntimeException("Value for " + property + " cannot be null");
}
criteria.add(new Criterion(condition, value));
}
/** Adds a "between" style condition with two bounds; null bounds are rejected. */
protected void addCriterion(String condition, Object value1, Object value2, String property) {
if (value1 == null || value2 == null) {
throw new RuntimeException("Between values for " + property + " cannot be null");
}
criteria.add(new Criterion(condition, value1, value2));
}
// Conditions on column "id".
public Criteria andIdIsNull() {
addCriterion("id is null");
return (Criteria) this;
}
public Criteria andIdIsNotNull() {
addCriterion("id is not null");
return (Criteria) this;
}
public Criteria andIdEqualTo(Integer value) {
addCriterion("id =", value, "id");
return (Criteria) this;
}
public Criteria andIdNotEqualTo(Integer value) {
addCriterion("id <>", value, "id");
return (Criteria) this;
}
public Criteria andIdGreaterThan(Integer value) {
addCriterion("id >", value, "id");
return (Criteria) this;
}
public Criteria andIdGreaterThanOrEqualTo(Integer value) {
addCriterion("id >=", value, "id");
return (Criteria) this;
}
public Criteria andIdLessThan(Integer value) {
addCriterion("id <", value, "id");
return (Criteria) this;
}
public Criteria andIdLessThanOrEqualTo(Integer value) {
addCriterion("id <=", value, "id");
return (Criteria) this;
}
public Criteria andIdIn(List<Integer> values) {
addCriterion("id in", values, "id");
return (Criteria) this;
}
public Criteria andIdNotIn(List<Integer> values) {
addCriterion("id not in", values, "id");
return (Criteria) this;
}
public Criteria andIdBetween(Integer value1, Integer value2) {
addCriterion("id between", value1, value2, "id");
return (Criteria) this;
}
public Criteria andIdNotBetween(Integer value1, Integer value2) {
addCriterion("id not between", value1, value2, "id");
return (Criteria) this;
}
// Conditions on column "title".
public Criteria andTitleIsNull() {
addCriterion("title is null");
return (Criteria) this;
}
public Criteria andTitleIsNotNull() {
addCriterion("title is not null");
return (Criteria) this;
}
public Criteria andTitleEqualTo(String value) {
addCriterion("title =", value, "title");
return (Criteria) this;
}
public Criteria andTitleNotEqualTo(String value) {
addCriterion("title <>", value, "title");
return (Criteria) this;
}
public Criteria andTitleGreaterThan(String value) {
addCriterion("title >", value, "title");
return (Criteria) this;
}
public Criteria andTitleGreaterThanOrEqualTo(String value) {
addCriterion("title >=", value, "title");
return (Criteria) this;
}
public Criteria andTitleLessThan(String value) {
addCriterion("title <", value, "title");
return (Criteria) this;
}
public Criteria andTitleLessThanOrEqualTo(String value) {
addCriterion("title <=", value, "title");
return (Criteria) this;
}
public Criteria andTitleLike(String value) {
addCriterion("title like", value, "title");
return (Criteria) this;
}
public Criteria andTitleNotLike(String value) {
addCriterion("title not like", value, "title");
return (Criteria) this;
}
public Criteria andTitleIn(List<String> values) {
addCriterion("title in", values, "title");
return (Criteria) this;
}
public Criteria andTitleNotIn(List<String> values) {
addCriterion("title not in", values, "title");
return (Criteria) this;
}
public Criteria andTitleBetween(String value1, String value2) {
addCriterion("title between", value1, value2, "title");
return (Criteria) this;
}
public Criteria andTitleNotBetween(String value1, String value2) {
addCriterion("title not between", value1, value2, "title");
return (Criteria) this;
}
// Conditions on column "url".
public Criteria andUrlIsNull() {
addCriterion("url is null");
return (Criteria) this;
}
public Criteria andUrlIsNotNull() {
addCriterion("url is not null");
return (Criteria) this;
}
public Criteria andUrlEqualTo(String value) {
addCriterion("url =", value, "url");
return (Criteria) this;
}
public Criteria andUrlNotEqualTo(String value) {
addCriterion("url <>", value, "url");
return (Criteria) this;
}
public Criteria andUrlGreaterThan(String value) {
addCriterion("url >", value, "url");
return (Criteria) this;
}
public Criteria andUrlGreaterThanOrEqualTo(String value) {
addCriterion("url >=", value, "url");
return (Criteria) this;
}
public Criteria andUrlLessThan(String value) {
addCriterion("url <", value, "url");
return (Criteria) this;
}
public Criteria andUrlLessThanOrEqualTo(String value) {
addCriterion("url <=", value, "url");
return (Criteria) this;
}
public Criteria andUrlLike(String value) {
addCriterion("url like", value, "url");
return (Criteria) this;
}
public Criteria andUrlNotLike(String value) {
addCriterion("url not like", value, "url");
return (Criteria) this;
}
public Criteria andUrlIn(List<String> values) {
addCriterion("url in", values, "url");
return (Criteria) this;
}
public Criteria andUrlNotIn(List<String> values) {
addCriterion("url not in", values, "url");
return (Criteria) this;
}
public Criteria andUrlBetween(String value1, String value2) {
addCriterion("url between", value1, value2, "url");
return (Criteria) this;
}
public Criteria andUrlNotBetween(String value1, String value2) {
addCriterion("url not between", value1, value2, "url");
return (Criteria) this;
}
// Conditions on column "image".
public Criteria andImageIsNull() {
addCriterion("image is null");
return (Criteria) this;
}
public Criteria andImageIsNotNull() {
addCriterion("image is not null");
return (Criteria) this;
}
public Criteria andImageEqualTo(String value) {
addCriterion("image =", value, "image");
return (Criteria) this;
}
public Criteria andImageNotEqualTo(String value) {
addCriterion("image <>", value, "image");
return (Criteria) this;
}
public Criteria andImageGreaterThan(String value) {
addCriterion("image >", value, "image");
return (Criteria) this;
}
public Criteria andImageGreaterThanOrEqualTo(String value) {
addCriterion("image >=", value, "image");
return (Criteria) this;
}
public Criteria andImageLessThan(String value) {
addCriterion("image <", value, "image");
return (Criteria) this;
}
public Criteria andImageLessThanOrEqualTo(String value) {
addCriterion("image <=", value, "image");
return (Criteria) this;
}
public Criteria andImageLike(String value) {
addCriterion("image like", value, "image");
return (Criteria) this;
}
public Criteria andImageNotLike(String value) {
addCriterion("image not like", value, "image");
return (Criteria) this;
}
public Criteria andImageIn(List<String> values) {
addCriterion("image in", values, "image");
return (Criteria) this;
}
public Criteria andImageNotIn(List<String> values) {
addCriterion("image not in", values, "image");
return (Criteria) this;
}
public Criteria andImageBetween(String value1, String value2) {
addCriterion("image between", value1, value2, "image");
return (Criteria) this;
}
public Criteria andImageNotBetween(String value1, String value2) {
addCriterion("image not between", value1, value2, "image");
return (Criteria) this;
}
// Conditions on column "create_date".
public Criteria andCreateDateIsNull() {
addCriterion("create_date is null");
return (Criteria) this;
}
public Criteria andCreateDateIsNotNull() {
addCriterion("create_date is not null");
return (Criteria) this;
}
public Criteria andCreateDateEqualTo(Date value) {
addCriterion("create_date =", value, "createDate");
return (Criteria) this;
}
public Criteria andCreateDateNotEqualTo(Date value) {
addCriterion("create_date <>", value, "createDate");
return (Criteria) this;
}
public Criteria andCreateDateGreaterThan(Date value) {
addCriterion("create_date >", value, "createDate");
return (Criteria) this;
}
public Criteria andCreateDateGreaterThanOrEqualTo(Date value) {
addCriterion("create_date >=", value, "createDate");
return (Criteria) this;
}
public Criteria andCreateDateLessThan(Date value) {
addCriterion("create_date <", value, "createDate");
return (Criteria) this;
}
public Criteria andCreateDateLessThanOrEqualTo(Date value) {
addCriterion("create_date <=", value, "createDate");
return (Criteria) this;
}
public Criteria andCreateDateIn(List<Date> values) {
addCriterion("create_date in", values, "createDate");
return (Criteria) this;
}
public Criteria andCreateDateNotIn(List<Date> values) {
addCriterion("create_date not in", values, "createDate");
return (Criteria) this;
}
public Criteria andCreateDateBetween(Date value1, Date value2) {
addCriterion("create_date between", value1, value2, "createDate");
return (Criteria) this;
}
public Criteria andCreateDateNotBetween(Date value1, Date value2) {
addCriterion("create_date not between", value1, value2, "createDate");
return (Criteria) this;
}
// Conditions on column "news_date".
public Criteria andNewsDateIsNull() {
addCriterion("news_date is null");
return (Criteria) this;
}
public Criteria andNewsDateIsNotNull() {
addCriterion("news_date is not null");
return (Criteria) this;
}
public Criteria andNewsDateEqualTo(Date value) {
addCriterion("news_date =", value, "newsDate");
return (Criteria) this;
}
public Criteria andNewsDateNotEqualTo(Date value) {
addCriterion("news_date <>", value, "newsDate");
return (Criteria) this;
}
public Criteria andNewsDateGreaterThan(Date value) {
addCriterion("news_date >", value, "newsDate");
return (Criteria) this;
}
public Criteria andNewsDateGreaterThanOrEqualTo(Date value) {
addCriterion("news_date >=", value, "newsDate");
return (Criteria) this;
}
public Criteria andNewsDateLessThan(Date value) {
addCriterion("news_date <", value, "newsDate");
return (Criteria) this;
}
public Criteria andNewsDateLessThanOrEqualTo(Date value) {
addCriterion("news_date <=", value, "newsDate");
return (Criteria) this;
}
public Criteria andNewsDateIn(List<Date> values) {
addCriterion("news_date in", values, "newsDate");
return (Criteria) this;
}
public Criteria andNewsDateNotIn(List<Date> values) {
addCriterion("news_date not in", values, "newsDate");
return (Criteria) this;
}
public Criteria andNewsDateBetween(Date value1, Date value2) {
addCriterion("news_date between", value1, value2, "newsDate");
return (Criteria) this;
}
public Criteria andNewsDateNotBetween(Date value1, Date value2) {
addCriterion("news_date not between", value1, value2, "newsDate");
return (Criteria) this;
}
// Conditions on column "source".
public Criteria andSourceIsNull() {
addCriterion("source is null");
return (Criteria) this;
}
public Criteria andSourceIsNotNull() {
addCriterion("source is not null");
return (Criteria) this;
}
public Criteria andSourceEqualTo(String value) {
addCriterion("source =", value, "source");
return (Criteria) this;
}
public Criteria andSourceNotEqualTo(String value) {
addCriterion("source <>", value, "source");
return (Criteria) this;
}
public Criteria andSourceGreaterThan(String value) {
addCriterion("source >", value, "source");
return (Criteria) this;
}
public Criteria andSourceGreaterThanOrEqualTo(String value) {
addCriterion("source >=", value, "source");
return (Criteria) this;
}
public Criteria andSourceLessThan(String value) {
addCriterion("source <", value, "source");
return (Criteria) this;
}
public Criteria andSourceLessThanOrEqualTo(String value) {
addCriterion("source <=", value, "source");
return (Criteria) this;
}
public Criteria andSourceLike(String value) {
addCriterion("source like", value, "source");
return (Criteria) this;
}
public Criteria andSourceNotLike(String value) {
addCriterion("source not like", value, "source");
return (Criteria) this;
}
public Criteria andSourceIn(List<String> values) {
addCriterion("source in", values, "source");
return (Criteria) this;
}
public Criteria andSourceNotIn(List<String> values) {
addCriterion("source not in", values, "source");
return (Criteria) this;
}
public Criteria andSourceBetween(String value1, String value2) {
addCriterion("source between", value1, value2, "source");
return (Criteria) this;
}
public Criteria andSourceNotBetween(String value1, String value2) {
addCriterion("source not between", value1, value2, "source");
return (Criteria) this;
}
}
/** Concrete condition group; it adds nothing to {@link GeneratedCriteria}. */
public static class Criteria extends GeneratedCriteria {
protected Criteria() {
super();
}
}
/**
 * One condition of a group: the SQL fragment plus up to two bound values. Exactly one of
 * the flags noValue, singleValue, betweenValue and listValue is set by the constructors;
 * the mapper XML uses the flags to decide how to render the condition.
 */
public static class Criterion {
private String condition;
private Object value;
private Object secondValue;
private boolean noValue;
private boolean singleValue;
private boolean betweenValue;
private boolean listValue;
private String typeHandler;
public String getCondition() {
return condition;
}
public Object getValue() {
return value;
}
public Object getSecondValue() {
return secondValue;
}
public boolean isNoValue() {
return noValue;
}
public boolean isSingleValue() {
return singleValue;
}
public boolean isBetweenValue() {
return betweenValue;
}
public boolean isListValue() {
return listValue;
}
public String getTypeHandler() {
return typeHandler;
}
// Condition without any value, for example "id is null".
protected Criterion(String condition) {
super();
this.condition = condition;
this.typeHandler = null;
this.noValue = true;
}
// Condition with one value; a List is flagged as listValue, anything else as singleValue.
protected Criterion(String condition, Object value, String typeHandler) {
super();
this.condition = condition;
this.value = value;
this.typeHandler = typeHandler;
if (value instanceof List<?>) {
this.listValue = true;
} else {
this.singleValue = true;
}
}
protected Criterion(String condition, Object value) {
this(condition, value, null);
}
// Condition with a lower and an upper bound ("between" / "not between").
protected Criterion(String condition, Object value, Object secondValue, String typeHandler) {
super();
this.condition = condition;
this.value = value;
this.secondValue = secondValue;
this.typeHandler = typeHandler;
this.betweenValue = true;
}
protected Criterion(String condition, Object value, Object secondValue) {
this(condition, value, secondValue, null);
}
}
}
/**
 * MyBatis mapper interface for the {@code news} table. The SQL of every method is defined
 * by the statement with the same id in the mapper XML for this namespace.
 */
@Mapper
public interface NewsDao {
/** Counts the rows matching {@code example}. */
long countByExample(NewsExample example);
/** Deletes the rows matching {@code example}. */
int deleteByExample(NewsExample example);
/** Deletes the row with the given primary key. */
int deleteByPrimaryKey(Integer id);
/** Inserts {@code record} writing every column, including the id and null values. */
int insert(News record);
/** Inserts {@code record} writing only the columns whose property is not null. */
int insertSelective(News record);
/** Selects the rows matching {@code example}, including the content column. */
List<News> selectByExampleWithBLOBs(NewsExample example);
/** Selects the rows matching {@code example} without the content column. */
List<News> selectByExample(NewsExample example);
/** Selects one row, including the content column, by primary key. */
News selectByPrimaryKey(Integer id);
/** Updates the rows matching {@code example}, setting only the non-null properties of {@code record}. */
int updateByExampleSelective(@Param("record") News record, @Param("example") NewsExample example);
/** Updates the rows matching {@code example}, setting every column (id and content included) from {@code record}. */
int updateByExampleWithBLOBs(@Param("record") News record, @Param("example") NewsExample example);
/** Updates the rows matching {@code example}, setting every column except content from {@code record}. */
int updateByExample(@Param("record") News record, @Param("example") NewsExample example);
/** Updates the row identified by {@code record.id}, setting only the non-null properties. */
int updateByPrimaryKeySelective(News record);
/** Updates the row identified by {@code record.id}, setting every non-key column including content. */
int updateByPrimaryKeyWithBLOBs(News record);
/** Updates the row identified by {@code record.id}, setting every non-key column except content. */
int updateByPrimaryKey(News record);
}
/**
 * Service-level operations on crawled news articles.
 */
public interface NewsService {
/**
 * Stores one article. The implementation in this file (NewsServiceImpl) skips the insert
 * and returns 0 when a row with the same url already exists.
 */
int saveNews(News news);
/**
 * Returns one page of the articles matching {@code example}.
 * NOTE(review): page is handed to PageHelper.startPage unchanged; presumably 1-based, confirm.
 */
List<News> searchNewsForPage(int page, int pageSize, NewsExample example);
/** Counts the articles matching {@code example}. */
Long countByExample(NewsExample example);
}
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="com.ray.news.crawler.dao.NewsDao">
<!-- Maps every column except content onto the News entity. -->
<resultMap id="BaseResultMap" type="com.ray.news.crawler.entity.News">
<id column="id" jdbcType="INTEGER" property="id" />
<result column="title" jdbcType="VARCHAR" property="title" />
<result column="url" jdbcType="VARCHAR" property="url" />
<result column="image" jdbcType="VARCHAR" property="image" />
<result column="create_date" jdbcType="TIMESTAMP" property="createDate" />
<result column="news_date" jdbcType="TIMESTAMP" property="newsDate" />
<result column="source" jdbcType="VARCHAR" property="source" />
</resultMap>
<!-- Same mapping plus the large content column. -->
<resultMap extends="BaseResultMap" id="ResultMapWithBLOBs" type="com.ray.news.crawler.entity.News">
<result column="content" jdbcType="LONGVARCHAR" property="content" />
</resultMap>
<!--
  Where clause built from a NewsExample passed as the statement parameter: condition groups
  are joined with "or", the conditions inside a group with "and". The condition text is
  spliced in with ${...}; only the values are bound as parameters with #{...}.
-->
<sql id="Example_Where_Clause">
<where>
<foreach collection="oredCriteria" item="criteria" separator="or">
<if test="criteria.valid">
<trim prefix="(" prefixOverrides="and" suffix=")">
<foreach collection="criteria.criteria" item="criterion">
<choose>
<when test="criterion.noValue">
and ${criterion.condition}
</when>
<when test="criterion.singleValue">
and ${criterion.condition} #{criterion.value}
</when>
<when test="criterion.betweenValue">
and ${criterion.condition} #{criterion.value} and #{criterion.secondValue}
</when>
<when test="criterion.listValue">
and ${criterion.condition}
<foreach close=")" collection="criterion.value" item="listItem" open="(" separator=",">
#{listItem}
</foreach>
</when>
</choose>
</foreach>
</trim>
</if>
</foreach>
</where>
</sql>
<!-- Same where clause for the update statements, where the example is the "example" entry of the parameter map. -->
<sql id="Update_By_Example_Where_Clause">
<where>
<foreach collection="example.oredCriteria" item="criteria" separator="or">
<if test="criteria.valid">
<trim prefix="(" prefixOverrides="and" suffix=")">
<foreach collection="criteria.criteria" item="criterion">
<choose>
<when test="criterion.noValue">
and ${criterion.condition}
</when>
<when test="criterion.singleValue">
and ${criterion.condition} #{criterion.value}
</when>
<when test="criterion.betweenValue">
and ${criterion.condition} #{criterion.value} and #{criterion.secondValue}
</when>
<when test="criterion.listValue">
and ${criterion.condition}
<foreach close=")" collection="criterion.value" item="listItem" open="(" separator=",">
#{listItem}
</foreach>
</when>
</choose>
</foreach>
</trim>
</if>
</foreach>
</where>
</sql>
<!-- Column lists shared by the select statements. -->
<sql id="Base_Column_List">
id, title, url, image, create_date, news_date, source
</sql>
<sql id="Blob_Column_List">
content
</sql>
<!-- Select by example including the content column. orderByClause is spliced in verbatim with ${...}. -->
<select id="selectByExampleWithBLOBs" parameterType="com.ray.news.crawler.entity.NewsExample" resultMap="ResultMapWithBLOBs">
select
<if test="distinct">
distinct
</if>
<include refid="Base_Column_List" />
,
<include refid="Blob_Column_List" />
from news
<if test="_parameter != null">
<include refid="Example_Where_Clause" />
</if>
<if test="orderByClause != null">
order by ${orderByClause}
</if>
</select>
<!-- Select by example without the content column. -->
<select id="selectByExample" parameterType="com.ray.news.crawler.entity.NewsExample" resultMap="BaseResultMap">
select
<if test="distinct">
distinct
</if>
<include refid="Base_Column_List" />
from news
<if test="_parameter != null">
<include refid="Example_Where_Clause" />
</if>
<if test="orderByClause != null">
order by ${orderByClause}
</if>
</select>
<!-- Select one row, content included, by primary key. -->
<select id="selectByPrimaryKey" parameterType="java.lang.Integer" resultMap="ResultMapWithBLOBs">
select
<include refid="Base_Column_List" />
,
<include refid="Blob_Column_List" />
from news
where id = #{id,jdbcType=INTEGER}
</select>
<delete id="deleteByPrimaryKey" parameterType="java.lang.Integer">
delete from news
where id = #{id,jdbcType=INTEGER}
</delete>
<!-- Delete by example; without a where clause this removes every row. -->
<delete id="deleteByExample" parameterType="com.ray.news.crawler.entity.NewsExample">
delete from news
<if test="_parameter != null">
<include refid="Example_Where_Clause" />
</if>
</delete>
<!-- Insert writing all columns, the id included. -->
<insert id="insert" parameterType="com.ray.news.crawler.entity.News">
insert into news (id, title, url,
image, create_date, news_date,
source, content)
values (#{id,jdbcType=INTEGER}, #{title,jdbcType=VARCHAR}, #{url,jdbcType=VARCHAR},
#{image,jdbcType=VARCHAR}, #{createDate,jdbcType=TIMESTAMP}, #{newsDate,jdbcType=TIMESTAMP},
#{source,jdbcType=VARCHAR}, #{content,jdbcType=LONGVARCHAR})
</insert>
<!-- Insert writing only the columns whose property is not null. -->
<insert id="insertSelective" parameterType="com.ray.news.crawler.entity.News">
insert into news
<trim prefix="(" suffix=")" suffixOverrides=",">
<if test="id != null">
id,
</if>
<if test="title != null">
title,
</if>
<if test="url != null">
url,
</if>
<if test="image != null">
image,
</if>
<if test="createDate != null">
create_date,
</if>
<if test="newsDate != null">
news_date,
</if>
<if test="source != null">
source,
</if>
<if test="content != null">
content,
</if>
</trim>
<trim prefix="values (" suffix=")" suffixOverrides=",">
<if test="id != null">
#{id,jdbcType=INTEGER},
</if>
<if test="title != null">
#{title,jdbcType=VARCHAR},
</if>
<if test="url != null">
#{url,jdbcType=VARCHAR},
</if>
<if test="image != null">
#{image,jdbcType=VARCHAR},
</if>
<if test="createDate != null">
#{createDate,jdbcType=TIMESTAMP},
</if>
<if test="newsDate != null">
#{newsDate,jdbcType=TIMESTAMP},
</if>
<if test="source != null">
#{source,jdbcType=VARCHAR},
</if>
<if test="content != null">
#{content,jdbcType=LONGVARCHAR},
</if>
</trim>
</insert>
<select id="countByExample" parameterType="com.ray.news.crawler.entity.NewsExample" resultType="java.lang.Long">
select count(*) from news
<if test="_parameter != null">
<include refid="Example_Where_Clause" />
</if>
</select>
<!-- Update by example: sets only the non-null properties of "record". -->
<update id="updateByExampleSelective" parameterType="map">
update news
<set>
<if test="record.id != null">
id = #{record.id,jdbcType=INTEGER},
</if>
<if test="record.title != null">
title = #{record.title,jdbcType=VARCHAR},
</if>
<if test="record.url != null">
url = #{record.url,jdbcType=VARCHAR},
</if>
<if test="record.image != null">
image = #{record.image,jdbcType=VARCHAR},
</if>
<if test="record.createDate != null">
create_date = #{record.createDate,jdbcType=TIMESTAMP},
</if>
<if test="record.newsDate != null">
news_date = #{record.newsDate,jdbcType=TIMESTAMP},
</if>
<if test="record.source != null">
source = #{record.source,jdbcType=VARCHAR},
</if>
<if test="record.content != null">
content = #{record.content,jdbcType=LONGVARCHAR},
</if>
</set>
<if test="_parameter != null">
<include refid="Update_By_Example_Where_Clause" />
</if>
</update>
<!-- Update by example: sets every column, id and content included. -->
<update id="updateByExampleWithBLOBs" parameterType="map">
update news
set id = #{record.id,jdbcType=INTEGER},
title = #{record.title,jdbcType=VARCHAR},
url = #{record.url,jdbcType=VARCHAR},
image = #{record.image,jdbcType=VARCHAR},
create_date = #{record.createDate,jdbcType=TIMESTAMP},
news_date = #{record.newsDate,jdbcType=TIMESTAMP},
source = #{record.source,jdbcType=VARCHAR},
content = #{record.content,jdbcType=LONGVARCHAR}
<if test="_parameter != null">
<include refid="Update_By_Example_Where_Clause" />
</if>
</update>
<!-- Update by example: sets every column except content. -->
<update id="updateByExample" parameterType="map">
update news
set id = #{record.id,jdbcType=INTEGER},
title = #{record.title,jdbcType=VARCHAR},
url = #{record.url,jdbcType=VARCHAR},
image = #{record.image,jdbcType=VARCHAR},
create_date = #{record.createDate,jdbcType=TIMESTAMP},
news_date = #{record.newsDate,jdbcType=TIMESTAMP},
source = #{record.source,jdbcType=VARCHAR}
<if test="_parameter != null">
<include refid="Update_By_Example_Where_Clause" />
</if>
</update>
<!-- Update by primary key: sets only the non-null properties. -->
<update id="updateByPrimaryKeySelective" parameterType="com.ray.news.crawler.entity.News">
update news
<set>
<if test="title != null">
title = #{title,jdbcType=VARCHAR},
</if>
<if test="url != null">
url = #{url,jdbcType=VARCHAR},
</if>
<if test="image != null">
image = #{image,jdbcType=VARCHAR},
</if>
<if test="createDate != null">
create_date = #{createDate,jdbcType=TIMESTAMP},
</if>
<if test="newsDate != null">
news_date = #{newsDate,jdbcType=TIMESTAMP},
</if>
<if test="source != null">
source = #{source,jdbcType=VARCHAR},
</if>
<if test="content != null">
content = #{content,jdbcType=LONGVARCHAR},
</if>
</set>
where id = #{id,jdbcType=INTEGER}
</update>
<!-- Update by primary key: sets every non-key column including content. -->
<update id="updateByPrimaryKeyWithBLOBs" parameterType="com.ray.news.crawler.entity.News">
update news
set title = #{title,jdbcType=VARCHAR},
url = #{url,jdbcType=VARCHAR},
image = #{image,jdbcType=VARCHAR},
create_date = #{createDate,jdbcType=TIMESTAMP},
news_date = #{newsDate,jdbcType=TIMESTAMP},
source = #{source,jdbcType=VARCHAR},
content = #{content,jdbcType=LONGVARCHAR}
where id = #{id,jdbcType=INTEGER}
</update>
<!-- Update by primary key: sets every non-key column except content. -->
<update id="updateByPrimaryKey" parameterType="com.ray.news.crawler.entity.News">
update news
set title = #{title,jdbcType=VARCHAR},
url = #{url,jdbcType=VARCHAR},
image = #{image,jdbcType=VARCHAR},
create_date = #{createDate,jdbcType=TIMESTAMP},
news_date = #{newsDate,jdbcType=TIMESTAMP},
source = #{source,jdbcType=VARCHAR}
where id = #{id,jdbcType=INTEGER}
</update>
</mapper>
/**
 * {@link NewsService} implementation that delegates to {@link NewsDao}.
 */
@Service
public class NewsServiceImpl implements NewsService {

    @Autowired
    private NewsDao newsDao;

    /**
     * Inserts the article unless a row with the same url is already stored.
     *
     * <p>NOTE(review): the existence check and the insert are two separate statements, so
     * concurrent callers could still store the same url twice; the table definition shown
     * with this code declares no unique index on url.
     *
     * @param news the article to store; its url must not be null, otherwise building the
     *             lookup criteria throws a {@link RuntimeException}
     * @return the result of the insert, or 0 when the url already exists
     */
    @Override
    @Transactional
    public int saveNews(News news) {
        // 1. Look for an already stored article with the same url.
        NewsExample sameUrl = new NewsExample();
        sameUrl.createCriteria().andUrlEqualTo(news.getUrl());
        long existing = newsDao.countByExample(sameUrl);

        // 2. Only insert when the url is new.
        if (existing == 0) {
            return newsDao.insert(news);
        }
        return 0;
    }

    /**
     * Loads one page of articles, content included, matching {@code example}.
     *
     * @param page     page number handed to PageHelper
     * @param pageSize number of rows per page
     * @param example  filter and ordering
     * @return the rows of the requested page, or an immutable empty list when there are none
     */
    @Override
    public List<News> searchNewsForPage(int page, int pageSize, NewsExample example) {
        PageHelper.startPage(page, pageSize);
        List<News> news = newsDao.selectByExampleWithBLOBs(example);
        if (CollectionUtils.isEmpty(news)) {
            // Typed factory instead of the raw Collections.EMPTY_LIST (no unchecked conversion).
            return Collections.emptyList();
        }
        return news;
    }

    /**
     * Counts the articles matching {@code example}.
     *
     * @param example filter
     * @return number of matching rows
     */
    @Override
    public Long countByExample(NewsExample example) {
        return newsDao.countByExample(example);
    }
}
/**
 * Static helpers for post-processing crawled news HTML.
 */
public final class NewsUtils {

    /** Matches a whole {@code <script>...</script>} element. */
    private static final Pattern SCRIPT_BLOCK =
            Pattern.compile("<script[^>]*?>[\\s\\S]*?<\\/script>", Pattern.CASE_INSENSITIVE);

    /** Matches a whole {@code <style>...</style>} element. */
    private static final Pattern STYLE_BLOCK =
            Pattern.compile("<style[^>]*?>[\\s\\S]*?<\\/style>", Pattern.CASE_INSENSITIVE);

    /** Matches any single HTML tag. */
    private static final Pattern HTML_TAG = Pattern.compile("<[^>]+>");

    /** Matches a run of whitespace (covers tabs, carriage returns and newlines). */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** Matches exactly one {@code <img ...>} tag; never spans into a following tag. */
    private static final Pattern IMG_TAG =
            Pattern.compile("<img\\b[^>]*>", Pattern.CASE_INSENSITIVE);

    /**
     * Matches a {@code src} attribute (not {@code data-src} and the like) whose value is
     * double-quoted (group 1), single-quoted (group 2) or unquoted (group 3).
     */
    private static final Pattern SRC_ATTRIBUTE = Pattern.compile(
            "(?<![\\w-])src\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s\"'>]+))",
            Pattern.CASE_INSENSITIVE);

    private NewsUtils() {
        // Utility class: not meant to be instantiated.
    }

    /**
     * Strips scripts, styles and tags from an HTML fragment and collapses whitespace.
     *
     * @param content HTML fragment, may be {@code null}
     * @return the plain text with every whitespace run replaced by a single space;
     *         an empty string when {@code content} is {@code null}
     */
    public static String getTextFromContent(String content) {
        if (content == null) {
            return "";
        }
        // Scripts and styles go first so their bodies do not survive as text.
        String text = SCRIPT_BLOCK.matcher(content).replaceAll("");
        text = STYLE_BLOCK.matcher(text).replaceAll("");
        text = HTML_TAG.matcher(text).replaceAll("");
        return WHITESPACE.matcher(text).replaceAll(" ");
    }

    /**
     * Returns the {@code src} value of the first {@code <img>} tag that has one.
     *
     * @param content HTML fragment, may be {@code null}
     * @return the attribute value without surrounding quotes, or {@code null} when the fragment
     *         is {@code null} or contains no {@code <img>} tag with a {@code src} attribute
     */
    public static String getImageFromContent(String content) {
        if (content == null) {
            return null;
        }
        Matcher imgMatcher = IMG_TAG.matcher(content);
        while (imgMatcher.find()) {
            // Search inside this one tag only, so a src on a later element is never picked up.
            Matcher srcMatcher = SRC_ATTRIBUTE.matcher(imgMatcher.group());
            if (srcMatcher.find()) {
                for (int group = 1; group <= 3; group++) {
                    String value = srcMatcher.group(group);
                    if (value != null) {
                        return value;
                    }
                }
            }
        }
        return null;
    }

    /**
     * Maps the {@code source} path segment of the REST API to the source name stored with news.
     *
     * @param pathVariable path segment such as {@code "sohu"}, may be {@code null}
     * @return the stored source name, or {@code null} when the segment is {@code null} or unknown
     */
    public static String getSourceFromPathVariable(String pathVariable) {
        if (pathVariable == null) {
            // switch on a null String would throw NullPointerException.
            return null;
        }
        switch (pathVariable) {
            case "toutiao":
                return "今日头条";
            case "neteasy":
                return "网易";
            case "sohu":
                return "搜狐";
            case "ifeng":
                return "凤凰";
            case "sina":
                return "新浪";
            default:
                return null;
        }
    }
}
/**
 * Contract for a site-specific news crawler, plus a shared helper for downloading pages.
 */
public interface NewsPuller {

    /**
     * Crawls the site handled by this implementation.
     */
    void pullNews();

    /**
     * Downloads {@code url} and returns it as a jsoup {@link Document}.
     *
     * @param url         address of the page to fetch
     * @param useHtmlUnit {@code false} to fetch the raw HTML with jsoup; {@code true} to load the
     *                    page in HtmlUnit with JavaScript enabled and parse the resulting markup
     * @return the parsed document
     * @throws Exception if the page cannot be downloaded or parsed
     */
    default Document getHtmlFromUrl(String url, boolean useHtmlUnit) throws Exception {
        if (!useHtmlUnit) {
            return Jsoup.connect(url)
                    // Identify as a desktop browser (this user-agent string is Internet Explorer 9).
                    .userAgent("Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)")
                    .get();
        }
        // try-with-resources closes the client even when configuring it or loading the page fails.
        try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
            webClient.getOptions().setJavaScriptEnabled(true);  // required: the content is script-rendered
            webClient.getOptions().setCssEnabled(false);        // nothing is displayed, so CSS is not needed
            webClient.getOptions().setActiveXNative(false);
            webClient.getOptions().setThrowExceptionOnScriptError(false);        // tolerate script errors
            webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);  // tolerate non-200 responses
            webClient.getOptions().setUseInsecureSSL(true);
            webClient.getOptions().setTimeout(10 * 1000);

            HtmlPage rootPage = webClient.getPage(url);
            // Asynchronous scripts need time to run: block for up to 10 seconds until they finish.
            webClient.waitForBackgroundJavaScript(10 * 1000);
            // Serialise the rendered page and hand it to jsoup for querying.
            return Jsoup.parse(rootPage.asXml());
        }
    }
}
/**
 * Crawls the ifeng (Phoenix) news list page and stores every article found on it.
 */
@Component("ifengNewsPuller")
public class IfengNewsPuller implements NewsPuller {
    private static final Logger logger = LoggerFactory.getLogger(IfengNewsPuller.class);

    /** Format of the publication time read from the article's {@code og:time} meta tag. */
    private static final String NEWS_DATE_PATTERN = "yyyy-MM-dd HH:mm:ss";

    @Value("${news.ifeng.url}")
    private String url;

    @Autowired
    private NewsService newsService;

    /**
     * Fetches the list page, collects the article links, then downloads and saves each article.
     * A failure on one article is logged and does not stop the others.
     */
    @Override
    public void pullNews() {
        logger.info("开始拉取凤凰新闻!");
        // 1. Fetch the list page.
        Document html;
        try {
            html = getHtmlFromUrl(url, false);
        } catch (Exception e) {
            logger.error("==============获取凤凰首页失败: {} =============", url, e);
            return;
        }
        // 2. Select the <a> tags of the news list.
        Elements newsATags = html.select("div#newsList")
                .select("ul.news_list-3wjAJJJM")
                .select("li")
                .select("a");
        // 3. Build a News stub (url, title, source) from every link.
        HashSet<News> newsSet = new HashSet<>();
        for (Element a : newsATags) {
            News n = new News();
            n.setSource("凤凰");
            n.setUrl(a.attr("href"));
            n.setTitle(a.text());
            n.setCreateDate(new Date());
            newsSet.add(n);
        }
        // 4. Download every article; the downloads run in parallel.
        newsSet.parallelStream().forEach(this::pullContent);
        logger.info("凤凰新闻抽取完成!");
    }

    /** Downloads one article page, fills in date, content and image, and saves the item. */
    private void pullContent(News news) {
        logger.info("开始抽取凤凰新闻《{}》内容:{}", news.getTitle(), news.getUrl());
        try {
            Document newsHtml = getHtmlFromUrl(news.getUrl(), false);
            Elements contentElement = newsHtml.select("div.text-3zQ3cZD4");
            if (contentElement.isEmpty()) {
                // Fall back to the caption heading when the text container is absent.
                contentElement = newsHtml.select("div.caption-3_nUnnKX h1");
            }
            if (contentElement.isEmpty()) {
                logger.warn("No content element found, skipping: {}", news.getUrl());
                return;
            }
            // The publication time is taken from the page head.
            String time = newsHtml.head().select("meta[name=og:time ]").attr("content");
            if (StringUtils.isNotBlank(time)) {
                // SimpleDateFormat is not thread-safe and this method runs on several threads at
                // once, so a fresh formatter is created for every parse instead of sharing one.
                news.setNewsDate(new SimpleDateFormat(NEWS_DATE_PATTERN).parse(time));
            }
            String image = NewsUtils.getImageFromContent(contentElement.toString());
            news.setContent(contentElement.text());
            news.setImage(image);
            newsService.saveNews(news);
            logger.info("抽取凤凰新闻《{}》成功!", news.getTitle());
        } catch (Exception e) {
            // Pass the exception to the logger so the stack trace lands in the application log.
            logger.error("凤凰新闻抽取失败:{}", news.getUrl(), e);
        }
    }
}
/**
 * Crawls the NetEase news ranking page and stores every article found on it.
 */
@Component("netEasyNewsPuller")
public class NetEasyNewsPuller implements NewsPuller {
    private static final Logger logger = LoggerFactory.getLogger(NetEasyNewsPuller.class);

    @Value("${news.neteasy.url}")
    private String url;

    @Autowired
    private NewsService newsService;

    /**
     * Fetches the ranking page, collects the article links, then downloads and saves each
     * article. A failure on one article is logged and does not stop the others.
     */
    @Override
    public void pullNews() {
        logger.info("开始拉取网易热门新闻!");
        // 1. Fetch the ranking page.
        Document html;
        try {
            html = getHtmlFromUrl(url, false);
        } catch (Exception e) {
            logger.error("==============获取网易新闻首页失败: {}=============", url, e);
            return;
        }
        // 2. Select the links of the active tab only, which limits the amount of data.
        Elements newA = html.select("div.tabContents.active")
                .select("table")
                .select("td")
                .select("a");
        // 3. Build a News stub (url, source) from every link; the title comes from the article page.
        HashSet<News> newsSet = new HashSet<>();
        for (Element a : newA) {
            News n = new News();
            n.setSource("网易");
            n.setUrl(a.attr("href"));
            n.setCreateDate(new Date());
            newsSet.add(n);
        }
        // 4. Download every article.
        newsSet.forEach(this::pullContent);
        logger.info("网易新闻拉取完成!");
    }

    /** Downloads one article page, fills in title, content and image, and saves the item. */
    private void pullContent(News news) {
        logger.info("开始抽取新闻内容:{}", news.getUrl());
        try {
            Document newsHtml = getHtmlFromUrl(news.getUrl(), false);
            Elements newsContentAll = newsHtml.select("div#epContentLeft");
            if (newsContentAll.isEmpty()) {
                logger.error("新闻抽取失败:{}", news.getUrl());
                return;
            }
            Element titleP = newsContentAll.select("h1").first();
            if (titleP == null) {
                // No headline on the page: report the failure explicitly instead of relying on a
                // NullPointerException being caught below.
                logger.error("新闻抽取失败:{}", news.getUrl());
                return;
            }
            Elements newsContent = newsHtml.select("div#endText");
            news.setTitle(titleP.text());
            news.setContent(newsContent.text());
            news.setImage(NewsUtils.getImageFromContent(newsContentAll.toString()));
            newsService.saveNews(news);
            logger.info("抽取网易新闻《{}》成功!", news.getTitle());
        } catch (Exception e) {
            // Pass the exception to the logger so the stack trace lands in the application log.
            logger.error("新闻抽取失败:{}", news.getUrl(), e);
        }
    }
}
/**
 * Crawls the Sohu focus-news list and stores every article found on it.
 */
@Component("sohuNewsPuller")
public class SohuNewsPuller implements NewsPuller {
    private static final Logger logger = LoggerFactory.getLogger(SohuNewsPuller.class);

    @Value("${news.sohu.url}")
    private String url;

    @Autowired
    private NewsService newsService;

    /**
     * Fetches the home page, collects the focus-news links, then downloads and saves each
     * article. A failure on one article is logged and does not stop the others.
     */
    @Override
    public void pullNews() {
        logger.info("开始拉取搜狐新闻!");
        // 1. Fetch the home page.
        Document html;
        try {
            html = getHtmlFromUrl(url, false);
        } catch (Exception e) {
            logger.error("==============获取搜狐首页失败: {}=============", url, e);
            return;
        }
        // 2. Select the <a> tags of the focus-news list.
        Elements newsATags = html.select("div.focus-news")
                .select("div.list16")
                .select("li")
                .select("a");
        // 3. Build a News stub (url, title, source) from every link.
        HashSet<News> newsSet = new HashSet<>();
        for (Element a : newsATags) {
            News n = new News();
            n.setSource("搜狐");
            n.setUrl(a.attr("href"));
            n.setTitle(a.attr("title"));
            n.setCreateDate(new Date());
            newsSet.add(n);
        }
        // 4. Download every article.
        newsSet.forEach(this::pullContent);
        logger.info("搜狐新闻拉取完成!");
    }

    /** Downloads one article page, fills in content and image, and saves the item. */
    private void pullContent(News news) {
        logger.info("开始抽取搜狐新闻内容:{}", news.getUrl());
        try {
            Document newsHtml = getHtmlFromUrl(news.getUrl(), false);
            Element article = newsHtml.select("article.article").first();
            if (article == null) {
                // No article element on the page: report the failure explicitly instead of
                // relying on a NullPointerException being caught below.
                logger.error("新闻抽取失败:{}", news.getUrl());
                return;
            }
            news.setContent(article.text());
            // The image has to be looked up in the article's markup: text() strips every tag, so
            // searching the plain text could never find an <img>.
            news.setImage(NewsUtils.getImageFromContent(article.outerHtml()));
            newsService.saveNews(news);
            logger.info("抽取搜狐新闻《{}》成功!", news.getTitle());
        } catch (Exception e) {
            // Pass the exception to the logger so the stack trace lands in the application log.
            logger.error("新闻抽取失败:{}", news.getUrl(), e);
        }
    }
}
今日头条爬虫(详情页面的内容爬取尚不完善,待解决):
/**
 * Crawls the Toutiao home page with HtmlUnit (the page is rendered by JavaScript) and stores
 * the articles whose content could be extracted.
 */
@Component("toutiaoNewsPuller")
public class ToutiaoNewsPuller implements NewsPuller {
    private static final Logger logger = LoggerFactory.getLogger(ToutiaoNewsPuller.class);

    /** Site root that the relative {@code /group/...} links are appended to. */
    private static final String TOUTIAO_URL = "https://www.toutiao.com";

    @Autowired
    private NewsService newsService;

    @Value("${news.toutiao.url}")
    private String url;

    /**
     * Fetches the home page, merges the links that point to the same article into one news item,
     * downloads each article and saves the items that ended up with non-blank content.
     */
    @Override
    public void pullNews() {
        logger.info("开始拉取今日头条热门新闻!");
        // 1. Fetch the home page.
        Document html;
        try {
            html = getHtmlFromUrl(url, true);
        } catch (Exception e) {
            logger.error("获取今日头条主页失败!", e);
            return;
        }
        // 2. Parse the page into news items keyed by article URL. One article can be linked
        //    several times (image link, title link), so later links complete the first one.
        //    Selector syntax: https://www.open-open.com/jsoup/selector-syntax.htm
        Map<String, News> newsMap = new HashMap<>();
        for (Element a : html.select("a[href~=/group/.*]:not(.comment)")) {
            logger.info("<a>标签: \n{}", a);
            String href = TOUTIAO_URL + a.attr("href");
            String paragraphText = a.select("p").text();
            String title = StringUtils.isNotBlank(paragraphText) ? paragraphText : a.text();
            String image = a.select("img").attr("src");
            News known = newsMap.get(href);
            if (known == null) {
                News n = new News();
                n.setSource("今日头条");
                n.setUrl(href);
                n.setCreateDate(new Date());
                n.setImage(image);
                n.setTitle(title);
                newsMap.put(href, n);
            } else if (a.hasClass("img-wrap")) {
                known.setImage(image);
            } else if (a.hasClass("title")) {
                known.setTitle(title);
            }
        }
        logger.info("今日头条新闻标题拉取完成!");
        logger.info("开始拉取新闻内容...");
        // 3. Download the article pages.
        newsMap.values().forEach(this::pullContent);
        // 4. Save only the items whose content was actually extracted.
        newsMap.values()
                .stream()
                .filter(news -> StringUtils.isNotBlank(news.getContent()) && !"null".equals(news.getContent()))
                .forEach(newsService::saveNews);
        logger.info("今日头条新闻内容拉取完成!");
    }

    /**
     * Downloads one article page and copies the text of its {@code <article>} element into the
     * news item. The item is left without content when the page cannot be loaded.
     */
    private void pullContent(News news) {
        logger.info("抽取今日头条新闻《{}》", news.getTitle());
        Document contentHtml;
        try {
            contentHtml = getHtmlFromUrl(news.getUrl(), true);
        } catch (Exception e) {
            logger.error("获取今日头条新闻《{}》内容失败!", news.getTitle(), e);
            return;
        }
        // An earlier attempt parsed the inline "articleInfo" script with a regular expression;
        // the current approach reads the rendered <article> element instead.
        String content = contentHtml.select("article").text();
        logger.info("content: {}", content);
        news.setContent(content);
    }
}
/**
 * REST endpoints that trigger the crawlers and expose the stored news.
 */
@RestController
@RequestMapping("/news")
@Api(value = "新闻拉取API")
public class NewsController {
    private static final Logger logger = LoggerFactory.getLogger(NewsController.class);

    @Autowired
    @Qualifier("ifengNewsPuller")
    private NewsPuller ifengNewsPuller;

    @Autowired
    @Qualifier("netEasyNewsPuller")
    private NewsPuller neteasyNewsPuller;

    @Autowired
    @Qualifier("sohuNewsPuller")
    private NewsPuller sohuNewsPuller;

    @Autowired
    @Qualifier("toutiaoNewsPuller")
    private NewsPuller toutiaoNewsPuller;

    @Autowired
    private NewsService newsService;

    /** Runs the ifeng crawler; returns once the crawl has finished. */
    @ApiOperation(value = "爬虫拉取凤凰新闻")
    @GetMapping("/pull/ifeng")
    public void pullIfengNews() {
        ifengNewsPuller.pullNews();
    }

    /** Runs the NetEase crawler; returns once the crawl has finished. */
    @ApiOperation(value = "爬虫拉取网易新闻")
    @GetMapping("/pull/neteasy")
    public void pullNeteasyNews() {
        neteasyNewsPuller.pullNews();
    }

    /** Runs the Sohu crawler; returns once the crawl has finished. */
    @ApiOperation(value = "爬虫拉取搜狐新闻")
    @GetMapping("/pull/sohu")
    public void pullSohuNews() {
        sohuNewsPuller.pullNews();
    }

    /** Runs the Toutiao crawler; returns once the crawl has finished. */
    @ApiOperation(value = "爬虫拉取今日头条新闻")
    @GetMapping("/pull/toutiao")
    public void pullToutiaoNews() {
        toutiaoNewsPuller.pullNews();
    }

    /**
     * Returns one page of news from the given source, newest first.
     *
     * @param page     page number
     * @param pageSize rows per page
     * @param source   source key as understood by {@code NewsUtils.getSourceFromPathVariable}
     * @return the requested page; an empty list when the source key is unknown
     */
    @ApiOperation(value = "获取{source}新闻")
    @GetMapping("/{source}")
    public List<News> getToutiaoNews(@RequestParam Integer page, @RequestParam Integer pageSize, @PathVariable String source) {
        NewsExample example = exampleForSource(source);
        if (example == null) {
            return java.util.Collections.emptyList();
        }
        example.setOrderByClause("create_date desc");
        return newsService.searchNewsForPage(page, pageSize, example);
    }

    /**
     * Counts the stored news of the given source.
     *
     * @param source source key as understood by {@code NewsUtils.getSourceFromPathVariable}
     * @return the number of stored items; {@code 0} when the source key is unknown
     */
    @ApiOperation("获取{source}新闻总数")
    @GetMapping("/{source}/count")
    public Long getToutiaoCount(@PathVariable String source) {
        NewsExample example = exampleForSource(source);
        if (example == null) {
            return 0L;
        }
        return newsService.countByExample(example);
    }

    /**
     * Returns one page of news from all sources, newest first.
     *
     * @param page     page number
     * @param pageSize rows per page
     * @return the requested page
     */
    @ApiOperation(value = "获取所有新闻")
    @GetMapping
    public List<News> getNews(@RequestParam Integer page, @RequestParam Integer pageSize) {
        NewsExample example = new NewsExample();
        example.createCriteria();
        example.setOrderByClause("create_date desc");
        return newsService.searchNewsForPage(page, pageSize, example);
    }

    /**
     * Counts the stored news of all sources.
     *
     * @return the total number of stored items
     */
    @ApiOperation("获取新闻总数")
    @GetMapping("/count")
    public Long getCount() {
        NewsExample example = new NewsExample();
        example.createCriteria();
        return newsService.countByExample(example);
    }

    /**
     * Builds the query criteria for one source.
     *
     * @return the criteria, or {@code null} when the path segment maps to no known source.
     *         The null case is handled by the callers because the generated criteria presumably
     *         reject a null value with a RuntimeException, which would surface as HTTP 500.
     */
    private static NewsExample exampleForSource(String source) {
        String sourceName = NewsUtils.getSourceFromPathVariable(source);
        if (sourceName == null) {
            return null;
        }
        NewsExample example = new NewsExample();
        example.createCriteria().andSourceEqualTo(sourceName);
        return example;
    }
}
/**
 * Swagger 2 setup: documents every controller annotated with {@code @Api}.
 */
@Configuration
@EnableSwagger2
public class SwaggerConfig {

    /**
     * Builds the Swagger docket covering all paths of the {@code @Api}-annotated controllers,
     * with the default response messages switched off.
     *
     * @return the configured docket
     */
    @Bean
    public Docket createRestApi() {
        Docket docket = new Docket(DocumentationType.SWAGGER_2);
        docket.useDefaultResponseMessages(false);
        docket.apiInfo(apiInfo());
        return docket.select()
                .apis(RequestHandlerSelectors.withClassAnnotation(Api.class))
                .paths(PathSelectors.any())
                .build();
    }

    /** Title, description and version shown at the top of the generated documentation. */
    private ApiInfo apiInfo() {
        ApiInfoBuilder info = new ApiInfoBuilder();
        info.title("新闻爬虫API文档");
        info.description("使用Jsoup + HtmlUtil");
        info.version("1.0");
        return info.build();
    }
}
/**
 * Spring Boot entry point of the news crawler.
 */
@SpringBootApplication
public class NewscrawlerApplication {

    /**
     * Starts the application, then prints the Swagger UI address to standard output.
     *
     * @param args command-line arguments passed on to Spring Boot
     */
    public static void main(String[] args) {
        final String swaggerHint = "swagger2: http://localhost:8080/swagger-ui.html";
        SpringApplication.run(NewscrawlerApplication.class, args);
        System.out.println(swaggerHint);
    }
}