最近花时间学习了使用Java获取网站数据的方法,并亲自动手实践了一下:共获取3000+条数据,去除重复数据后剩余2000+条,并使用JFreeChart根据电影评分做出了几张简单的统计图。
电影评分统计图: JFreeChart生成图片
使用jsoup获取豆瓣电影排行榜的电影数据。该网站的数据是动态加载的,直接查看网页源代码看不到数据;可以通过页面JS所请求的JSON接口(如 /j/chart/top_list)获取相应的数据:
// Category link -> category name, as scraped from the chart landing page.
HashMap<String, String> urlandnames = new HashMap<String, String>();
MovieService movieService = new MovieService();
// Chart (ranking) page
String url = "http://movie.douban.com/chart";
try {
    Document chartPage = Jsoup.connect(url)
            .userAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36")
            .timeout(10000)
            .get();
    // Each anchor in the category list supplies the link (href) and the category name (text).
    for (Element link : chartPage.select("#content .types a")) {
        urlandnames.put(link.attr("href"), link.text());
    }
} catch (IOException e) {
    e.printStackTrace();
    System.out.println("获取urlandname出现错误!!");
}
// Walk every category link and gather the movies it lists.
List<Movie> allMovies = new ArrayList<Movie>();
for (String categoryUrl : urlandnames.keySet()) {
    allMovies.addAll(getMovieInfo(categoryUrl));
}
/**
 * Fetches the movies listed under one chart category.
 *
 * <p>Two JSON endpoints are queried: {@code /j/chart/top_list_count} for the number of
 * movies in the category, then {@code /j/chart/top_list} for the movies themselves.
 * A movie whose id is already in {@code cache} with the same title is skipped.
 * Nothing is written to the database here; the parsed movies are returned to the caller.
 *
 * @param url category link from the chart page; its 2nd and 3rd {@code &}-separated
 *            segments are reused as the query string
 *            (e.g. {@code type=18} and {@code interval_id=100:90})
 * @return the movies of this category that were not already cached; an empty list when
 *         the link is malformed or a response cannot be fetched or parsed
 */
private List<Movie> getMovieInfo(String url){
    List<Movie> listMovie = new ArrayList<Movie>();

    String[] tempurl = url.split("&");
    if (tempurl.length < 3) {
        // Without both query segments neither endpoint URL can be built.
        System.out.println("Unexpected category link, skipped: " + url);
        return listMovie;
    }
    String query = tempurl[1] + "&" + tempurl[2];
    JsonParser parser = new JsonParser();

    // e.g. http://movie.douban.com/j/chart/top_list_count?type=18&interval_id=100:90
    // -> {"playable_count":18,"total":32,"unwatched_count":32}
    JsonElement countJson = parseJsonOrNull(parser,
            fetchJsonBody("http://movie.douban.com/j/chart/top_list_count?" + query));
    if (countJson == null || !countJson.isJsonObject()
            || !countJson.getAsJsonObject().has("total")) {
        return listMovie;
    }
    int movienum = countJson.getAsJsonObject().get("total").getAsInt();
    System.out.println(movienum); // number of movies in this category

    // e.g. http://movie.douban.com/j/chart/top_list?type=18&interval_id=100:90&action=&start=0&limit=32
    // Asking for "total" entries in one page returns the whole category at once.
    JsonElement listJson = parseJsonOrNull(parser,
            fetchJsonBody("http://movie.douban.com/j/chart/top_list?" + query
                    + "&action=&start=0&limit=" + movienum));
    if (listJson == null || !listJson.isJsonArray()) {
        return listMovie;
    }

    for (JsonElement item : listJson.getAsJsonArray()) {
        if (!item.isJsonObject()) {
            continue;
        }
        JsonObject e = item.getAsJsonObject();
        // Title
        String name = e.get("title").getAsString();
        // Douban score
        float score = e.get("score").getAsFloat();
        // Release date
        String release_date = e.get("release_date").getAsString();
        // Genres, kept as the raw JSON array text
        String types = e.get("types").getAsJsonArray().toString();
        // Link to the movie page
        String movieUrl = e.get("url").getAsString();
        // Whether the movie can be played online
        String is_playable = e.get("is_playable").getAsString();

        // The cache key is the last path segment before the trailing slash
        // (the leading "/" is kept, as in the original keys).
        String withoutTrailingSlash = movieUrl.substring(0, movieUrl.lastIndexOf("/"));
        String keyID = withoutTrailingSlash.substring(withoutTrailingSlash.lastIndexOf("/"));

        // A single lookup: the same movie shows up under several categories.
        net.sf.ehcache.Element cached = cache.get(keyID);
        if (cached != null && name.equals(cached.getObjectValue())) {
            continue; // duplicate movie
        }
        cache.put(new net.sf.ehcache.Element(keyID, name));

        Movie movie = new Movie();
        movie.setName(name);
        movie.setTypes(types);
        movie.setRelease_date(release_date);
        movie.setScore(score);
        movie.setMovieUrl(movieUrl);
        movie.setIs_playable(is_playable);
        listMovie.add(movie);
    }
    return listMovie;
}

/** Downloads the response body of {@code requestUrl}; returns null when the request fails. */
private String fetchJsonBody(String requestUrl){
    try {
        return Jsoup.connect(requestUrl).timeout(10000).ignoreContentType(true).execute().body();
    } catch (Exception e) {
        // Kept broad, like the original list request, so one failed fetch never aborts the crawl.
        e.printStackTrace();
        return null;
    }
}

/** Parses {@code json}; returns null when it is null or not valid JSON. */
private JsonElement parseJsonOrNull(JsonParser parser, String json){
    if (json == null) {
        return null;
    }
    try {
        return parser.parse(json);
    } catch (com.google.gson.JsonParseException e) {
        e.printStackTrace();
        return null;
    }
}
/**
 * Handles GET requests for the chart pages.
 * Reads the "method" request parameter and looks up the five movie counts
 * that the chart-building code following this preamble draws.
 */
protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
// Chart selector; the code that follows compares it with "barChart", "pieChart" and "lineChart".
String method = request.getParameter("method");
// Debug trace of the requested chart.
System.out.println(method+"===================method");
MovieService movieService = new MovieService();
// Counts keyed "one".."five" (presumably forwarded from MovieDao.Count() - confirm in MovieService).
Map<String, Integer> map = movieService.Count();
// The bar chart labels these, in order, ">=9", ">=8.5", ">=8", ">=7.5" and "<7.5".
Integer one = map.get("one");
Integer two = map.get("two");
Integer three = map.get("three");
Integer four = map.get("four");
Integer five = map.get("five");
// Bar chart: one bar per score band.
// The literal goes first so a request without a "method" parameter selects nothing instead of throwing.
if ("barChart".equals(method)) {
    // A band missing from the count map is drawn as 0 rather than failing on unboxing.
    Integer[] counts = {one, two, three, four, five};
    double[][] data = new double[counts.length][1];
    for (int i = 0; i < counts.length; i++) {
        data[i][0] = counts[i] == null ? 0 : counts[i];
    }
    String[] rowKeys = {">=9", ">=8.5", ">=8", ">=7.5", "<7.5"};
    String[] columnKeys = {"评分"};
    CategoryDataset dataset = DatasetUtilities.createCategoryDataset(rowKeys, columnKeys, data);
    JFreeChart chart = ChartFactory.createBarChart3D(
            "电影评分柱状图",            // chart title
            "电影",                     // category-axis label
            "数量",                     // value-axis label
            dataset,                    // data
            PlotOrientation.VERTICAL,   // orientation
            true,                       // show legend
            false,                      // tooltips
            false                       // URLs
    );
    CategoryPlot plot = chart.getCategoryPlot();
    // Plot background and grid-line colours
    plot.setBackgroundPaint(Color.white);
    plot.setDomainGridlinePaint(Color.pink);
    plot.setRangeGridlinePaint(Color.pink);
    // Print each bar's value above it
    BarRenderer3D renderer = new BarRenderer3D();
    renderer.setBaseItemLabelGenerator(new StandardCategoryItemLabelGenerator());
    renderer.setBaseItemLabelsVisible(true);
    renderer.setBasePositiveItemLabelPosition(new ItemLabelPosition(ItemLabelAnchor.OUTSIDE12, TextAnchor.BASELINE_LEFT));
    renderer.setItemLabelAnchorOffset(10D);
    // Gap between bars of the same category
    renderer.setItemMargin(0.4);
    plot.setRenderer(renderer);

    FileOutputStream fos_jpg = null;
    try {
        // The image is written into the directory that getRealPath("/") resolves to (the web root).
        fos_jpg = new FileOutputStream(request.getSession().getServletContext().getRealPath("/") + "barChart.jpg");
        ChartUtilities.writeChartAsJPEG(fos_jpg, 1, chart, 700, 500, null);
    } catch (Exception e) {
        // Still best-effort, but the cause is no longer discarded.
        System.out.println("error");
        e.printStackTrace();
    } finally {
        // Close only what was actually opened.
        if (fos_jpg != null) {
            try {
                fos_jpg.close();
            } catch (IOException e) {
                System.out.println("error2");
                e.printStackTrace();
            }
        }
    }
    request.setAttribute("barChart", "barChart.jpg");
}
// NOTE(review): in this listing these declarations repeat the ones at the top of doGet;
// keep a single set when the chart fragments are combined into one method.
MovieService movieService = new MovieService();
Map<String, Integer> map = movieService.Count();
Integer one = map.get("one");
Integer two = map.get("two");
Integer three = map.get("three");
Integer four = map.get("four");
Integer five = map.get("five");
// Pie chart: share of movies in each score band.
// The literal goes first so a request without a "method" parameter selects nothing instead of throwing.
if ("pieChart".equals(method)) {
    DefaultPieDataset data = new DefaultPieDataset();
    data.setValue(">=9", one);
    data.setValue(">=8.5", two);
    data.setValue(">=8", three);
    data.setValue(">=7.5", four);
    data.setValue("<7.5", five);
    JFreeChart chart = ChartFactory.createPieChart3D(
            "评分饼状图",   // chart title
            data,
            true,          // show legend
            false,         // tooltips
            false          // URLs
    );
    // One cast is enough: the 3D plot also exposes the plain pie-plot settings used below.
    PiePlot3D pieplot3d = (PiePlot3D) chart.getPlot();
    pieplot3d.setLabelFont(new Font("宋体", Font.PLAIN, 12));
    pieplot3d.setNoDataMessage("无数据");
    pieplot3d.setCircular(true);
    pieplot3d.setLabelGap(0.02D);
    // Section label: "<band> <percentage>"
    pieplot3d.setLabelGenerator(new StandardPieSectionLabelGenerator("{0} {2}", NumberFormat.getNumberInstance(), new DecimalFormat("0.00%")));
    // Start angle
    pieplot3d.setStartAngle(120D);
    // Sections are laid out clockwise
    pieplot3d.setDirection(Rotation.CLOCKWISE);
    // Foreground alpha: 0 is fully transparent, 1 is opaque
    pieplot3d.setForegroundAlpha(0.7F);

    FileOutputStream fos_jpg = null;
    try {
        // The image is written into the directory that getRealPath("/") resolves to (the web root).
        fos_jpg = new FileOutputStream(request.getSession().getServletContext().getRealPath("/") + "pieChart.jpg");
        ChartUtilities.writeChartAsJPEG(fos_jpg, 1, chart, 700, 500, null);
    } catch (Exception e) {
        // Still best-effort, but the cause is no longer discarded.
        System.out.println("error");
        e.printStackTrace();
    } finally {
        // Close only what was actually opened.
        if (fos_jpg != null) {
            try {
                fos_jpg.close();
            } catch (IOException e) {
                System.out.println("error2");
                e.printStackTrace();
            }
        }
    }
    request.setAttribute("pieChart", "pieChart.jpg");
}
// Line chart: number of movies at each individual score.
// The literal goes first so a request without a "method" parameter selects nothing instead of throwing.
if ("lineChart".equals(method)) {
    XYSeriesCollection collection = new XYSeriesCollection();
    XYSeries series = new XYSeries("折线");
    // Keys are score strings such as "9.9"; values are the matching movie counts.
    // Plot exactly what the map holds instead of regenerating the keys from a counter.
    Map<String, Integer> map2 = movieService.lineChart();
    for (Map.Entry<String, Integer> entry : map2.entrySet()) {
        series.add(Double.parseDouble(entry.getKey()), entry.getValue());
    }
    collection.addSeries(series);
    JFreeChart chart = ChartFactory.createXYLineChart(
            "评分折线图",
            "评分",
            "数量",
            collection,
            PlotOrientation.VERTICAL,
            true,
            true,
            false);
    XYPlot plot = (XYPlot) chart.getPlot();
    // The plot's own renderer is configured in place, so it does not need to be set again.
    XYLineAndShapeRenderer renderer = (XYLineAndShapeRenderer) plot.getRenderer();
    // Draw a marker at every data point
    renderer.setBaseShapesVisible(true);
    // Print each point's value next to it
    renderer.setBaseItemLabelsVisible(true);
    renderer.setBasePositiveItemLabelPosition(new ItemLabelPosition(ItemLabelAnchor.OUTSIDE12, TextAnchor.BASELINE_CENTER));
    renderer.setBaseItemLabelGenerator(new StandardXYItemLabelGenerator());
    renderer.setBaseItemLabelFont(new Font("Dialog", Font.BOLD, 10));

    FileOutputStream fos_jpg = null;
    try {
        // The image is written into the directory that getRealPath("/") resolves to (the web root).
        fos_jpg = new FileOutputStream(request.getSession().getServletContext().getRealPath("/") + "lineChart.jpg");
        ChartUtilities.writeChartAsJPEG(fos_jpg, 1, chart, 700, 500, null);
    } catch (Exception e) {
        // Still best-effort, but the cause is no longer discarded.
        System.out.println("error");
        e.printStackTrace();
    } finally {
        // Close only what was actually opened.
        if (fos_jpg != null) {
            try {
                fos_jpg.close();
            } catch (IOException e) {
                System.out.println("error2");
                e.printStackTrace();
            }
        }
    }
    request.setAttribute("lineChart", "lineChart.jpg");
}
/**
 * JDBC access to the {@code movie} table.
 *
 * <p>Each public method obtains its own connection from {@code JdbcUtils} and releases it,
 * together with every statement and result set it opened, before returning.
 */
public class MovieDao {

    /** One statement text for all rows, so it is prepared once per {@link #save} call. */
    private static final String INSERT_SQL =
            "INSERT INTO movie(NAME,TYPES,release_date,score,movieUrl,is_playable) VALUES(?,?,?,?,?,?)";

    /** Result keys of {@link #Count()}, paired by index with {@link #COUNT_SQL}. */
    private static final String[] COUNT_KEYS = {"one", "two", "three", "four", "five"};

    /** One query per score band; standard {@code AND} replaces the MySQL-only {@code &&}. */
    private static final String[] COUNT_SQL = {
        "SELECT COUNT(1) FROM movie WHERE score>=9",
        "SELECT COUNT(1) FROM movie WHERE score>=8.5 AND score<9",
        "SELECT COUNT(1) FROM movie WHERE score>=8 AND score<8.5",
        "SELECT COUNT(1) FROM movie WHERE score>=7.5 AND score<8",
        "SELECT COUNT(1) FROM movie WHERE score<7.5"
    };

    /**
     * Inserts all given movies, one row each.
     *
     * @param listMovie movies to insert; an empty list inserts nothing
     * @throws RuntimeException wrapping the {@link SQLException} when an insert fails
     */
    public void save(List<Movie> listMovie){
        Connection connection = null;
        PreparedStatement statement = null;
        try {
            connection = JdbcUtils.getConnection();
            // Prepared once and re-bound per row; preparing inside the loop leaked a statement per movie.
            statement = connection.prepareStatement(INSERT_SQL);
            int i = 1;
            for (Movie movie : listMovie) {
                System.out.println("正在插入第"+(i++)+"条数据到数据库ing...");
                statement.setString(1, movie.getName());
                statement.setString(2, movie.getTypes());
                statement.setString(3, movie.getRelease_date());
                statement.setFloat(4, movie.getScore());
                statement.setString(5, movie.getMovieUrl());
                statement.setString(6, movie.getIs_playable());
                statement.executeUpdate();
            }
            System.out.println("保存数据完成");
        } catch (SQLException e) {
            System.out.println("保存数据出现错误 MovieDao error");
            e.printStackTrace();
            throw new RuntimeException(e);
        } finally {
            closeAll(null, statement, connection);
        }
    }

    /**
     * Loads every row of the {@code movie} table.
     *
     * @return all movies; an empty list when the table is empty
     * @throws RuntimeException wrapping the {@link SQLException} when the query fails
     */
    public List<Movie> findAll(){
        Connection connection = null;
        PreparedStatement statement = null;
        ResultSet resultSet = null;
        try {
            connection = JdbcUtils.getConnection();
            statement = connection.prepareStatement("select * from movie");
            resultSet = statement.executeQuery();
            List<Movie> list = new ArrayList<Movie>();
            while (resultSet.next()) {
                Movie movie = new Movie();
                movie.setId(resultSet.getInt("id"));
                movie.setName(resultSet.getString("name"));
                movie.setTypes(resultSet.getString("types"));
                movie.setRelease_date(resultSet.getString("release_date"));
                movie.setScore(resultSet.getFloat("score"));
                movie.setMovieUrl(resultSet.getString("movieUrl"));
                movie.setIs_playable(resultSet.getString("is_playable"));
                list.add(movie);
            }
            return list;
        } catch (SQLException e) {
            e.printStackTrace();
            throw new RuntimeException(e);
        } finally {
            closeAll(resultSet, statement, connection);
        }
    }

    /**
     * Counts the movies in each score band.
     *
     * @return counts keyed {@code "one"} (score &gt;= 9), {@code "two"} (8.5 to below 9),
     *         {@code "three"} (8 to below 8.5), {@code "four"} (7.5 to below 8) and
     *         {@code "five"} (below 7.5); a band whose query fails is left out after its
     *         stack trace is printed
     */
    public Map<String,Integer> Count(){
        Map<String,Integer> mapCount = new HashMap<String, Integer>();
        Connection conn = null;
        try {
            conn = JdbcUtils.getConnection();
            for (int i = 0; i < COUNT_KEYS.length; i++) {
                try {
                    mapCount.put(COUNT_KEYS[i], queryCount(conn, COUNT_SQL[i]));
                } catch (SQLException e) {
                    // Best effort: the remaining bands are still counted.
                    e.printStackTrace();
                }
            }
        } finally {
            closeAll(null, null, conn);
        }
        return mapCount;
    }

    /**
     * Counts the movies at each individual score from 9.9 down to 7.0.
     *
     * @return counts keyed by the score text ({@code "9.9"}, {@code "9.8"}, ... {@code "7.0"});
     *         a score whose query fails is left out after its stack trace is printed
     */
    public Map<String,Integer> lineChart(){
        Map<String,Integer> mapCount = new HashMap<String, Integer>();
        Connection conn = null;
        try {
            conn = JdbcUtils.getConnection();
            for (int number = 99; number >= 70; number--) {
                // 99 -> "9.9", 98 -> "9.8", ... 70 -> "7.0"
                String score = (number / 10) + "." + (number % 10);
                try {
                    // The literal comes only from the loop counter, never from external input.
                    mapCount.put(score, queryCount(conn, "SELECT COUNT(1) FROM movie WHERE score=" + score));
                } catch (SQLException e) {
                    // Best effort: the remaining scores are still counted.
                    e.printStackTrace();
                }
            }
        } finally {
            closeAll(null, null, conn);
        }
        return mapCount;
    }

    /** Runs a single-value COUNT query on {@code conn} and releases its statement and result set. */
    private int queryCount(Connection conn, String sql) throws SQLException {
        PreparedStatement stmt = null;
        ResultSet resultSet = null;
        try {
            stmt = conn.prepareStatement(sql);
            resultSet = stmt.executeQuery();
            return resultSet.next() ? resultSet.getInt(1) : 0;
        } finally {
            closeAll(resultSet, stmt, null);
        }
    }

    /**
     * Closes the given resources in result set, statement, connection order, skipping nulls.
     * Close failures are printed instead of thrown so they cannot hide the exception
     * that is already propagating from the caller's try block.
     */
    private static void closeAll(ResultSet resultSet, PreparedStatement statement, Connection connection) {
        if (resultSet != null) {
            try {
                resultSet.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
        if (statement != null) {
            try {
                statement.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
        if (connection != null) {
            try {
                connection.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }
}
两分钟抓取数据2000+并保存至数据库中,感觉还是挺慢的,有待优化代码
代码源码: GitHub:https://github.com/YanKuan-IT/DouBanMoviesInfo_DB.git
注:如有什么做得不对的地方,请指教。