java使用正则表达式抓取网页内容存为txt

前几天女友在网上看了一本电子书,想要下载下来,不过那个网站只能支持在线阅读,不提供下载,还好可以复制粘贴。

于是这个复制粘贴的任务便交给了我,看了一下网站url,单篇文章的HTML源码都很简单,作为一个程序员怎么可以重复地复制粘贴呢?

于是有了这个代码,比较简单:

package WEB;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Downloads a run of numbered pages ({@code <baseUrl><n>.<pageType>}), extracts the
 * fragments matched by {@link #regex()} from each page, strips markup from them and
 * collects the result so it can be saved as a plain-text file.
 *
 * @author 胡阳
 * @blog http://www.the5fire.com
 *
 */
public class WebGet {
	/** Non-greedy match for a single markup tag, used to strip tags from a fragment. */
	private static final Pattern TAG = Pattern.compile("<.*?>");

	/** Extension used when no page type was supplied. */
	private static final String DEFAULT_PAGE_TYPE = "html";

	/** Charset of the generated text file; fixed so output does not depend on the platform. */
	private static final String OUTPUT_CHARSET = "UTF-8";

	/** Base URL; the page number and extension are appended to it. */
	private String myUrl;
	/** Connection to the page opened by the most recent {@code init} call. */
	private HttpURLConnection con;
	/** Text extracted so far; it keeps growing across {@link #getContent(String)} calls. */
	private StringBuilder contextAll = new StringBuilder("");

	/** Number of pages to fetch; pages are numbered 1..pageCount inclusive. */
	private int pageCount = 0;
	/** File extension of the pages (without the dot); empty means {@code html}. */
	private String pageType = "";

	/** Creates an instance with no base URL and no pages to fetch. */
	public WebGet() {

	}

	/**
	 * Creates an instance for the given base URL with no pages to fetch.
	 *
	 * @param url base URL that page numbers are appended to
	 */
	public WebGet(String url) {
		this.myUrl = url;
	}

	/**
	 * Creates a fully configured instance.
	 *
	 * @param url       base URL that page numbers are appended to
	 * @param pageCount number of pages; pages 1..pageCount are fetched
	 * @param pageType  page file extension without the dot, e.g. {@code html}
	 */
	public WebGet(String url,int pageCount,String pageType) {
		this.myUrl = url;
		this.pageCount = pageCount;
		this.pageType = pageType;
	}

	/**
	 * Returns the regular expression whose matches are extracted from every page.
	 *
	 * <p>It is currently empty. An empty pattern matches the empty string at every
	 * position, so it has to be replaced with the real expression for the target
	 * site before the extracted text is meaningful.
	 *
	 * @return the extraction pattern
	 */
	public String regex() {
		String googleRegex = "";
		return googleRegex;
	}

	/**
	 * Sets the base URL and opens a connection to one page below it.
	 *
	 * @param url  base URL that the page number is appended to
	 * @param page page number (the file name without extension)
	 * @throws IOException if the URL is malformed or the connection cannot be opened
	 */
	public void init(String url, String page) throws IOException {
		// Honour the caller's URL; it used to be discarded for a hard-coded address.
		this.myUrl = url;
		this.init(page);
	}

	/**
	 * Opens a connection to one page below the configured base URL. Does nothing
	 * when no base URL is configured.
	 *
	 * @param page page number (the file name without extension)
	 * @throws IOException if the URL is malformed or the connection cannot be opened
	 */
	public void init(String page) throws IOException {
		if (myUrl != null && !myUrl.equals("")) {
			URL urlmy = new URL(myUrl + page + "." + pageExtension());
			con = (HttpURLConnection) urlmy.openConnection();
			// Redirects are not followed for this connection. The JVM-wide static
			// HttpURLConnection.setFollowRedirects(...) is deliberately left alone:
			// it affects every connection in the process, not just this one.
			con.setInstanceFollowRedirects(false);
			con.connect();
		}
	}

	/** Returns the page extension, falling back to {@code html} when none was given. */
	private String pageExtension() {
		if (pageType == null || pageType.equals("")) {
			return DEFAULT_PAGE_TYPE;
		}
		return pageType;
	}

	/**
	 * Writes a string to a text file, replacing any existing file. The file is
	 * always encoded as UTF-8.
	 *
	 * @param context  text to write
	 * @param filePath path of the file to create or overwrite
	 * @return {@code true} once the text has been written
	 * @throws IOException if the file cannot be opened or written
	 */
	public boolean writeTxt(String context,String filePath) throws IOException {
		System.out.println("开始写文件。。");
		OutputStreamWriter osw = new OutputStreamWriter(new FileOutputStream(
				filePath), OUTPUT_CHARSET);
		try {
			osw.write(context, 0, context.length());
			osw.flush();
		} finally {
			// Release the file handle even when writing fails.
			osw.close();
		}

		return true;
	}

	/**
	 * Fetches pages 1..pageCount, extracts every {@link #regex()} match from each
	 * page, removes markup tags and spaces from the match and appends it to the
	 * collected text.
	 *
	 * @param codeType charset of the pages, e.g. GB2312 or UTF-8
	 * @return all text collected so far, or an empty string when pageCount is below 1
	 * @throws IOException if a page cannot be fetched or decoded
	 * @throws IllegalStateException if pages are requested but no base URL is configured
	 */
	public String getContent(String codeType) throws IOException{
		if(pageCount < 1){
			return "";
		}
		if (myUrl == null || myUrl.equals("")) {
			throw new IllegalStateException("no base URL configured");
		}
		System.out.println("开始抓取内容。。。。。");
		// The pattern is the same for every page, so compile it once.
		Pattern pattern = Pattern.compile(regex());
		// Inclusive upper bound: pageCount pages means pages 1..pageCount.
		for (int i = 1; i <= pageCount; i++) {
			System.out.println("抓取第 " + i + "页");
			this.init(String.valueOf(i));
			Matcher matcher = pattern.matcher(readPage(codeType));

			while (matcher.find()) {
				String title = TAG.matcher(matcher.group()).replaceAll("")
						.replace(" ", "");

				contextAll.append(title).append("\n\t");
			}
			System.out.println("完成:" + i + "页");
			System.out.println("");
		}

		return contextAll.toString();
	}

	/**
	 * Reads the whole body of the currently open connection. Lines are joined
	 * without separators so that a pattern can match across source line breaks.
	 */
	private String readPage(String codeType) throws IOException {
		BufferedReader br = new BufferedReader(new InputStreamReader(con
				.getInputStream(), codeType));
		try {
			StringBuilder sb = new StringBuilder();
			String s;
			while ((s = br.readLine()) != null) {
				sb.append(s);
			}
			return sb.toString();
		} finally {
			// Closing the stream frees the connection's resources for the next page.
			br.close();
		}
	}

	/**
	 * Downloads the book and saves it as a text file. Failures propagate so the
	 * JVM reports them and exits with a non-zero status.
	 *
	 * @param args unused
	 * @throws IOException if fetching or writing fails
	 */
	public static void main(String[] args) throws IOException {

		// 226 keeps the pages this program has always fetched (1..226): the old loop
		// bound was exclusive, so the former argument 227 stopped at page 226.
		// NOTE(review): confirm the real number of pages on the site.
		WebGet wg = new WebGet("http://www.tianyabook.com/qita/hougeixue/",226,"html");
		if (wg.writeTxt(wg.getContent("GB2312"),"D:\\houhei.txt")) {
			System.out.println("完成");
		}

	}
}

本文参与腾讯云自媒体分享计划,欢迎正在阅读的你也加入,一起分享。

发表于

我来说两句

0 条评论
登录 后参与评论

扫码关注云+社区

领取腾讯云代金券