参考: 1、http://webmagic.io/docs/zh/ 2、http://blog.csdn.net/qq598535550/article/details/51287630
pom.xml文件如下
<!-- Maven build descriptor for the webmagicDemo crawler example. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>cn.hadron</groupId>
<artifactId>webmagicDemo</artifactId>
<version>0.0.1-SNAPSHOT</version>
<packaging>jar</packaging>
<name>webmagicDemo</name>
<url>http://maven.apache.org</url>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<!-- Unit testing (test scope only) -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>3.8.1</version>
<scope>test</scope>
</dependency>
<!-- WebMagic crawler framework: core module -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.5.2</version>
</dependency>
<!-- WebMagic crawler framework: extension module -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.5.2</version>
</dependency>
<!-- MySQL JDBC driver; see https://mvnrepository.com/artifact/mysql/mysql-connector-java -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.40</version>
</dependency>
</dependencies>
<build>
<finalName>webmagicDemo</finalName>
<plugins>
<!-- Compiler plugin: source encoding and JDK source/target version -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>2.3.2</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
<encoding>utf8</encoding>
</configuration>
</plugin>
</plugins>
</build>
</project>
sql语句如下
MariaDB [(none)]> create database webMagic;
Query OK, 1 row affected (0.02 sec)
MariaDB [(none)]> use webMagic;
MariaDB [webMagic]> create table csdnblog(
id int auto_increment primary key,
blogId int not null,
title varchar(255) not null,
blogDate varchar(16),
tags varchar(255),
category varchar(255),
view int ,
comments int,
copyright int,
url varchar(255)
);
Query OK, 0 rows affected (0.01 sec)
MariaDB [webMagic]>
(1)实体Bean
package cn.hadron.webmagic.csdnblog;
/**
 * Plain data bean holding the metadata of one CSDN blog article.
 *
 * <p>Each property corresponds to a column of the {@code csdnblog} table, created with:
 * <pre>
 * MariaDB [(none)]&gt; create database webMagic;
 * MariaDB [webMagic]&gt; create table csdnblog(
 *   id int auto_increment primary key,
 *   blogId int not null,
 *   title varchar(255) not null,
 *   blogDate varchar(16),
 *   tags varchar(255),
 *   category varchar(255),
 *   view int ,
 *   comments int,
 *   copyright int,
 *   url varchar(255)
 * );
 * </pre>
 *
 * @author Administrator
 */
public class CsdnBlog {
    private int blogId; // article id
    private String title; // article title
    private String blogDate; // publication date, kept as text
    private String tags; // tags, comma separated
    private String category; // categories, comma separated
    private int view; // number of readers
    private int comments; // number of comments
    private int copyright; // original-work flag
    private String url; // article URL

    /** @return the article URL */
    public String getUrl() {
        return url;
    }

    /** @param url the article URL */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the article title */
    public String getTitle() {
        return title;
    }

    /** @param title the article title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the tags as one comma-separated string */
    public String getTags() {
        return tags;
    }

    /** @param tags the tags as one comma-separated string */
    public void setTags(String tags) {
        this.tags = tags;
    }

    /** @return the categories as one comma-separated string */
    public String getCategory() {
        return category;
    }

    /** @param category the categories as one comma-separated string */
    public void setCategory(String category) {
        this.category = category;
    }

    /** @return the number of readers */
    public int getView() {
        return view;
    }

    /** @param view the number of readers */
    public void setView(int view) {
        this.view = view;
    }

    /** @return the number of comments */
    public int getComments() {
        return comments;
    }

    /** @param comments the number of comments */
    public void setComments(int comments) {
        this.comments = comments;
    }

    /** @return the original-work flag */
    public int getCopyright() {
        return copyright;
    }

    /** @param copyright the original-work flag */
    public void setCopyright(int copyright) {
        this.copyright = copyright;
    }

    /** @return the article id */
    public int getBlogId() {
        return blogId;
    }

    /** @param blogId the article id */
    public void setBlogId(int blogId) {
        this.blogId = blogId;
    }

    /** @return the publication date text */
    public String getBlogDate() {
        return blogDate;
    }

    /** @param blogDate the publication date text */
    public void setBlogDate(String blogDate) {
        this.blogDate = blogDate;
    }

    /**
     * Renders every property, so that printing the bean shows its content instead of the
     * default {@code ClassName@hash} identity string.
     *
     * @return a human-readable description of this article
     */
    @Override
    public String toString() {
        return "CsdnBlog [blogId=" + blogId + ", title=" + title + ", blogDate=" + blogDate
                + ", tags=" + tags + ", category=" + category + ", view=" + view
                + ", comments=" + comments + ", copyright=" + copyright + ", url=" + url + "]";
    }
}
(2)Blog入库操作
package cn.hadron.webmagic.csdnblog;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
public class CsdnBlogDao {
private Connection conn = null;
private Statement stmt = null;
public CsdnBlogDao() {
try {
Class.forName("com.mysql.jdbc.Driver");
String url = "jdbc:mysql://192.168.80.133:3306/webMagic?user=root&password=123456";
conn = DriverManager.getConnection(url);
stmt = conn.createStatement();
} catch (ClassNotFoundException e) {
e.printStackTrace();
} catch (SQLException e) {
e.printStackTrace();
}
}
public int add(CsdnBlog csdnBlog) {
try {
String sql = "INSERT INTO `webMagic`.`csdnblog` (`blogId`, `title`, `blogDate`, `tags`, `category`, `view`, `comments`, `copyright`,`url`) VALUES (?, ?, ?, ?, ?, ?, ?, ?,?);";
PreparedStatement ps = conn.prepareStatement(sql);
ps.setInt(1, csdnBlog.getBlogId());
ps.setString(2, csdnBlog.getTitle());
ps.setString(3, csdnBlog.getBlogDate());
ps.setString(4, csdnBlog.getTags());
ps.setString(5, csdnBlog.getCategory());
ps.setInt(6, csdnBlog.getView());
ps.setInt(7, csdnBlog.getComments());
ps.setInt(8, csdnBlog.getCopyright());
ps.setString(9, csdnBlog.getUrl());
return ps.executeUpdate();
} catch (SQLException e) {
e.printStackTrace();
}
return -1;
}
}
(3)爬数据
package cn.hadron.webmagic.csdnblog;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;
/**
 * CSDN blog crawler.
 *
 * <p>Crawls all articles of one CSDN user: list pages contribute further article and list-page
 * links, article pages are parsed into a {@link CsdnBlog} and stored through {@link CsdnBlogDao}.
 */
public class CsdnBlogPageProcessor implements PageProcessor {
    // CSDN user whose blog is crawled. Static, so it is shared by every processor instance.
    private static String username = "dabokele";
    // Number of article pages processed. Atomic because main() runs the spider with 5 threads.
    private static final AtomicInteger size = new AtomicInteger();
    // Site settings: retry count and sleep time between requests.
    private final Site site = Site.me().setRetryTimes(3).setSleepTime(5000);

    /** Creates a processor for the default user. */
    public CsdnBlogPageProcessor() {
    }

    /**
     * Creates a processor for the given user.
     *
     * @param username CSDN user name; stored in a static field, so it applies to all instances
     */
    public CsdnBlogPageProcessor(String username) {
        CsdnBlogPageProcessor.username = username;
    }

    /** @return the site settings used by the spider */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Core extraction logic: list pages queue more URLs, article pages are parsed and stored.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        String articleUrlPrefix = "http://blog\\.csdn\\.net/" + username + "/article/details/";
        String userPath = "/" + username + "/";
        String absoluteUserPath = "http://blog.csdn.net/" + username + "/";

        if (!page.getUrl().regex(articleUrlPrefix + "\\d+").match()) {
            // List page: queue every article link found in the article list area,
            // turning the relative URL into an absolute one.
            page.addTargetRequests(page.getHtml().xpath("//div[@id='article_list']").links()
                    .regex("/" + username + "/article/details/\\d+")
                    .replace(userPath, absoluteUserPath)
                    .all());
            // Queue the other list pages found in the pagination area.
            page.addTargetRequests(page.getHtml().xpath("//div[@id='papelist']").links()
                    .regex("/" + username + "/article/list/\\d+")
                    .replace(userPath, absoluteUserPath)
                    .all());
            return;
        }

        // Article page.
        size.incrementAndGet();
        CsdnBlog csdnBlog = new CsdnBlog();
        // Article id: the digits at the end of the URL (present because the URL matched above).
        csdnBlog.setBlogId(Integer.parseInt(page.getUrl().regex(articleUrlPrefix + "(\\d+)").get()));
        // Title
        csdnBlog.setTitle(
                page.getHtml().xpath("//div[@class='article_title']//span[@class='link_title']/a/text()").get());
        // Publication date
        csdnBlog.setBlogDate(
                page.getHtml().xpath("//div[@class='article_r']/span[@class='link_postdate']/text()").get());
        // Tags (possibly several, joined with ',')
        csdnBlog.setTags(listToString(page.getHtml()
                .xpath("//div[@class='article_l']/span[@class='link_categories']/a/allText()").all()));
        // Categories (possibly several, joined with ',')
        csdnBlog.setCategory(
                listToString(page.getHtml().xpath("//div[@class='category_r']/label/span/text()").all()));
        // Reader count; falls back to 0 when the page does not contain it.
        csdnBlog.setView(parseCount(page.getHtml().xpath("//div[@class='article_r']/span[@class='link_view']")
                .regex("(\\d+)人阅读").get()));
        // Comment count; falls back to 0 when the page does not contain it.
        csdnBlog.setComments(parseCount(page.getHtml()
                .xpath("//div[@class='article_r']/span[@class='link_comments']").regex("\\((\\d+)\\)").get()));
        // Original-work flag
        csdnBlog.setCopyright(page.getHtml().regex("bog_copyright").match() ? 1 : 0);
        // URL
        csdnBlog.setUrl(page.getUrl().toString());
        // Store the article.
        // NOTE(review): a new DAO (and thus a new connection) is created per article and never
        // closed here; consider reusing or closing it.
        new CsdnBlogDao().add(csdnBlog);
        // Echo the article to the console.
        System.out.println(csdnBlog);
    }

    /**
     * Parses a counter extracted from the page without failing the whole page when the value is
     * missing or malformed.
     *
     * @param text extracted digits, may be null
     * @return the parsed value, or 0 when {@code text} is null or not a valid int
     */
    private static int parseCount(String text) {
        if (text == null) {
            return 0;
        }
        try {
            return Integer.parseInt(text.trim());
        } catch (NumberFormatException e) {
            return 0;
        }
    }

    /**
     * Joins the list elements with ','.
     *
     * @param stringList the elements to join, may be null
     * @return the joined string, or null when {@code stringList} is null
     */
    public static String listToString(List<String> stringList) {
        if (stringList == null) {
            return null;
        }
        return String.join(",", stringList);
    }

    /**
     * Starts the crawl at the user's blog home page with 5 threads and reports the result.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        long startTime, endTime;
        System.out.println("【爬虫开始】请耐心等待一大波数据到你碗里来...");
        startTime = System.currentTimeMillis();
        // The constructor sets the static username that the start URL below is built from.
        CsdnBlogPageProcessor processor = new CsdnBlogPageProcessor("chengyuqiang");
        Spider.create(processor).addUrl("http://blog.csdn.net/" + username).thread(5).run();
        endTime = System.currentTimeMillis();
        System.out.println("【爬虫结束】共抓取" + size.get() + "篇文章,耗时约" + ((endTime - startTime) / 1000) + "秒,已保存到数据库,请查收!");
    }
}
【爬虫开始】请耐心等待一大波数据到你碗里来...
log4j:WARN No appenders could be found for logger (us.codecraft.webmagic.scheduler.QueueScheduler).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
get page: http://blog.csdn.net/chengyuqiang
cn.hadron.webmagic.csdnblog.CsdnBlog@3e7e6b2e
get page: http://blog.csdn.net/chengyuqiang/article/details/78379332
cn.hadron.webmagic.csdnblog.CsdnBlog@146f9832
cn.hadron.webmagic.csdnblog.CsdnBlog@5e1f28cf
cn.hadron.webmagic.csdnblog.CsdnBlog@4525f588
get page: http://blog.csdn.net/chengyuqiang/article/details/78383856
get page: http://blog.csdn.net/chengyuqiang/article/details/78392985
...
...
cn.hadron.webmagic.csdnblog.CsdnBlog@170dc779
get page: http://blog.csdn.net/chengyuqiang/article/details/53947362
cn.hadron.webmagic.csdnblog.CsdnBlog@5e6d81
get page: http://blog.csdn.net/chengyuqiang/article/details/53947507
cn.hadron.webmagic.csdnblog.CsdnBlog@ac68d76
get page: http://blog.csdn.net/chengyuqiang/article/details/53946911
cn.hadron.webmagic.csdnblog.CsdnBlog@7f82f115
get page: http://blog.csdn.net/chengyuqiang/article/details/53946664
cn.hadron.webmagic.csdnblog.CsdnBlog@528b9c88
get page: http://blog.csdn.net/chengyuqiang/article/details/53868915
【爬虫结束】共抓取182篇文章,耗时约215秒,已保存到数据库,请查收!
MariaDB [webMagic]> select count(1) from csdnblog;
+----------+
| count(1) |
+----------+
| 182 |
+----------+
1 row in set (0.01 sec)
MariaDB [webMagic]>