为了进行Hive与Spark的开发,想找一个大规模数据集进行测试,于是找到了搜狗搜索引擎的日志数据。网上公开的数据大约有一个月,共5000多万条,用来做测试应该可以满足要求。
搜索引擎查询日志库设计为包括约1个月(2008年6月)Sogou搜索引擎部分网页查询需求及用户点击情况的网页查询日志数据集合。为进行中文搜索引擎用户行为分析的研究者提供基准研究语料。(网址为:http://www.sogou.com/labs/dl/q.html)
在百度云盘上找到了一个分享,于是转载到自己云盘里,也在这边分享一下:链接:http://pan.baidu.com/s/12VPue 密码:jn39。
首先把数据导入MySQL。由于一直在Ubuntu环境上做实验,于是采用Eclipse + Java来开发,虽然效率比较低,但是将就用吧。下面附上主要代码。由于日志是按文本行来处理的,有些行在切分时会出错,因此采取简单的策略,直接过滤掉不满足要求的行。之后又把数据迁移到Hive做了实验,效率还是挺高的。
public static void main(String [] args) {
// The name of the file to open.
String fileName = "/home/Downloads/SougouQ/access_log.20080629.decode.filter";
// The name of the file to open.
String newfileName = "/home/Downloads/SougouQ/20080629.csv";
// This will reference one line at a time
String line = null;
String filePath = "/home/Downloads/SougouQ";
File logifles = new File(filePath);
File[] fs = logifles.listFiles();
String dateString ="",newdatestring;
Connection con = null;
Statement st = null;
String url = "jdbc:mysql://127.0.0.1:3306/dblog";
String user = "root";
String password = "ndscbigdata";
try
{
//## connect DB
// con = DriverManager.getConnection(url, user, password);
con=DriverManager.getConnection(url+"?user="+user+"&password="+password+"&useUnicode=true&characterEncoding=utf-8");
System.out.println("connect db success!");
// String insertsql = "INSERT INTO sougouquery(visitTime,userID, visitKeyword,rankIndex,clickIndex,clickUrl)"
// + " values(?,?,?,?,?,?)";
String insertsql2 = "INSERT INTO sougouquery(visitTime,userID, visitKeyword,rankIndex,clickIndex,clickUrl) values('%s','%s','%s',%d,%d,'%s')";
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
String sqlex="";
PreparedStatement preparedStmt = null;
String[] parts = null;
//## for each files (30 days log file)
for(int i=0; i
{
fileName = fs[i].getName();
if(fileName.length()<10)
continue;
System.out.println(fs[i].getAbsolutePath());
System.out.println(fileName);
//## extract date(20080601)
dateString = fileName.substring(11, 19);
newdatestring = String.format("%s-%s-%s", dateString.substring(0,4),dateString.substring(4,6),dateString.substring(6,8));
System.out.println(dateString);
System.out.println(newdatestring);
// FileReader reads text files in the default encoding.
//## read file
FileInputStream fis = new FileInputStream(fs[i].getAbsolutePath());
InputStreamReader isr = new InputStreamReader(fis, "GB2312");
BufferedReader bufferedReader = new BufferedReader(isr);
//
//## new file name
newfileName = filePath+"/"+dateString+".csv";
System.out.println(newfileName);
//## write file
// FileOutputStream fos = new FileOutputStream(newfileName);
// OutputStreamWriter osw = new OutputStreamWriter(fos, "GB2312");
// BufferedWriter bufferedWriter = new BufferedWriter(osw);
while((line = bufferedReader.readLine()) != null)
{
//System.out.println(line);
// bufferedWriter.write(newdatestring +" "+line);
// System.out.println(newdatestring +" "+line);
// bufferedWriter.newLine();
// break;
// java.sql.Date startDate = new java.sql.Date()
parts = line.split("\\s");
if(parts.length!=6)
continue;
// the mysql insert statement
// create the mysql insert preparedstatement
// preparedStmt = con.prepareStatement(insertsql);
// preparedStmt.setDate(1,sdf.parse(newdatestring+" "+parts[0]));
// preparedStmt.setString (2,parts[1]);
// preparedStmt.setString (3, parts[2]);
// preparedStmt.setInt(4, Integer.parseInt(parts[3]));
// preparedStmt.setInt(5, Integer.parseInt(parts[4]));
// preparedStmt.setString (6, parts[5]);
sqlex = String.format(insertsql2, newdatestring+" "+parts[0],parts[1],parts[2],Integer.parseInt(parts[3]),Integer.parseInt(parts[4]),parts[5]);
try
{
st = con.createStatement();
//System.out.println(sqlex);
st.executeUpdate(sqlex);
// execute the preparedstatement
//preparedStmt.execute();
}
catch (SQLException ex) {
System.out.println(ex.getMessage());
continue;
}
}
// // Always close files.
bufferedReader.close();
// // Always close files.
// bufferedWriter.close();
}
}
// Note that write() does not automatically
// append a newline character.
// bufferedWriter.write("Hello there,");
// bufferedWriter.write(" here is some text.");
// bufferedWriter.newLine();
// bufferedWriter.write("We are writing");
// bufferedWriter.write(" the text to the file.");
catch(FileNotFoundException ex) {
System.out.println(
"Unable to open file '" +
fileName + "'");
}
catch(IOException ex) {
System.out.println(
"Error reading file '"
+ fileName + "'");
// Or we could just do this:
// ex.printStackTrace();
}
catch (SQLException ex) {
System.out.println(ex.getMessage());
}
finally {
try {
if (st != null) {
st.close();
}
if (con != null) {
con.close();
}
} catch (SQLException ex) {
System.out.println(ex.getMessage());
}
}
}