需求
题目:电影搜索引擎。功能:根据文档数据集,模拟用户输入一个电影关键词后,搜索到哪些文档。难度系数:1.2。 1. 生成模拟数据集:用C++、Java、Python等语言编写程序,按以下格式要求生成一个较大的模拟数据集(不少于10000条数据)。 文档ID 关键词ID1:出现次数 关键词ID2:出现次数 关键词ID3:出现次数 关键词ID4:出现次数 …… 注意:关键词ID1:6,表示关键词ID1这一列的值为6(即出现次数)。 2. HBase编程:编写Java程序,实现本题功能,即在HBase中创建一个表doc,把数据集中的数据全部插入到doc表,然后再输入一个电影关键词,按以下格式输出查询结果。尽可能详细描述程序的编译、在IDE环境下测试、打包、执行完整过程,附相应的截图及程序执行结果。 文档ID1:出现次数 文档ID2:出现次数 文档ID3:出现次数 3. 测试:在HBase的Shell中测试上述Java程序的执行结果。要求尽可能描述详细的测试过程和结果。
/**
 * Opens a new HBase connection using a hard-coded cluster configuration.
 *
 * <p>The configuration sets the ZooKeeper quorum hosts, the ZooKeeper client port
 * ({@code 2181}) and the parent znode ({@code /hbase-unsecure}). A freshly constructed
 * {@code User.SecureHadoopUser} is handed to {@code ConnectionFactory.createConnection}.
 * Every call goes through the factory, so no connection is shared between calls.
 *
 * @return the connection produced by {@code ConnectionFactory.createConnection}
 * @throws IOException declared by the signature; propagated from the calls made here
 */
public static Connection getConnection() throws IOException {
    Configuration clusterConf = HBaseConfiguration.create();
    clusterConf.set("zookeeper.znode.parent", "/hbase-unsecure");
    clusterConf.set("hbase.zookeeper.property.clientPort", "2181");
    clusterConf.set("hbase.zookeeper.quorum", "ov-dapqahbase-03,ov-dapqahbase-02,ov-dapqahbase-04");
    return ConnectionFactory.createConnection(clusterConf, new User.SecureHadoopUser());
}
/**
 * Loads the generated data set into the {@code nlp_graph:doc} table as an inverted index.
 *
 * <p>Each line returned by {@code constructData()} has the form
 * {@code docId,keywordId:count,keywordId:count,...}. For every {@code keywordId:count}
 * entry one cell is written: row key = keyword id, column family {@code f1},
 * qualifier = document id, value = occurrence count. Looking up a keyword row therefore
 * yields one cell per document that contains the keyword.
 *
 * <p>The table is expected to exist already; the commented-out snippet inside the method
 * shows how it was created.
 *
 * @throws Exception if generating the data or talking to HBase fails
 */
public static void insert() throws Exception {
    List<String> lines = constructData();
    // One-time table creation, kept for reference:
    // Admin admin = hbase.getAdmin();
    // HTableDescriptor hTableDescriptor = new HTableDescriptor(TableName.valueOf("nlp_graph:doc"));
    // HColumnDescriptor hColumnDescriptor = new HColumnDescriptor("f1");
    // hTableDescriptor.addFamily(hColumnDescriptor);
    // admin.createTable(hTableDescriptor);
    // admin.close();
    final int batchSize = 1000;
    // Bytes.toBytes encodes as UTF-8, matching the Bytes.toString decoding used by search().
    byte[] family = Bytes.toBytes("f1");
    // try-with-resources closes the table and the connection even when a put fails.
    try (Connection hbase = getConnection();
         Table hTable = hbase.getTable(TableName.valueOf("nlp_graph:doc"))) {
        List<Put> batch = new ArrayList<>();
        for (String line : lines) {
            String[] fields = line.split(",");
            byte[] docId = Bytes.toBytes(fields[0]);
            for (int i = 1; i < fields.length; i++) {
                String[] entry = fields[i].split(":");
                Put put = new Put(Bytes.toBytes(entry[0]));
                put.addColumn(family, docId, Bytes.toBytes(entry[1]));
                batch.add(put);
            }
            // Send puts in batches instead of issuing one call per cell.
            if (batch.size() >= batchSize) {
                hTable.put(batch);
                batch = new ArrayList<>();
            }
        }
        if (!batch.isEmpty()) {
            hTable.put(batch);
        }
    }
}
/**
 * Looks up a keyword in the {@code nlp_graph:doc} table and lists the documents containing it.
 *
 * <p>The keyword is used as the row key. Every cell of that row is rendered as
 * {@code qualifier:value}; with the layout written by {@code insert()} this is
 * {@code documentId:occurrenceCount}.
 *
 * @param str the keyword id to search for (used as the row key)
 * @return one {@code qualifier:value} string per cell of the row; an empty list when the
 *         row does not exist
 * @throws IOException if the HBase lookup fails
 */
public static List<String> search(String str) throws IOException {
    List<String> res = new ArrayList<>();
    // Bytes.toBytes encodes as UTF-8, matching the Bytes.toString decoding below.
    Get get = new Get(Bytes.toBytes(str));
    // try-with-resources closes the table and the connection on every path.
    try (Connection hbase = getConnection();
         Table table = hbase.getTable(TableName.valueOf("nlp_graph:doc"))) {
        Result result = table.get(get);
        List<Cell> cells = result.listCells();
        // listCells() returns null for a missing row; treat that as "no matches".
        if (cells == null) {
            return res;
        }
        for (Cell cell : cells) {
            String qualifier = Bytes.toString(CellUtil.cloneQualifier(cell));
            String value = Bytes.toString(CellUtil.cloneValue(cell));
            res.add(qualifier + ":" + value);
        }
    }
    return res;
}
/**
 * Entry point: searches for keyword {@code "1"} and prints each result on its own line.
 *
 * @param args command-line arguments; not used
 * @throws Exception if the search fails
 */
public static void main(String[] args) throws Exception {
    // insert(); // insert the data
    for (String hit : search("1")) {
        System.out.println(hit);
    }
}
/**
 * Builds the simulated data set in memory.
 *
 * <p>Produces 20000 lines, one per document id from 0 to 19999. Each line has the form
 * {@code docId,keywordId:count,...} and covers the keyword ids "1" through "6"; the count
 * written for a keyword is its numeric id times ten plus one (so every line ends up as
 * {@code docId,1:11,2:21,3:31,4:41,5:51,6:61}).
 *
 * @return the generated lines in ascending document-id order
 * @throws FileNotFoundException declared by the signature; nothing in this method raises it
 */
public static List<String> constructData() throws FileNotFoundException {
    List<String> keywordIds = Arrays.asList("1", "2", "3", "4", "5", "6");
    List<String> lines = new ArrayList<>();
    int docId = 0;
    while (docId < 20000) {
        StringBuilder line = new StringBuilder();
        line.append(docId);
        // Prefix each entry with the separator so no trailing comma has to be trimmed.
        for (String keywordId : keywordIds) {
            int count = Integer.parseInt(keywordId) * 10 + 1;
            line.append(",").append(keywordId).append(":").append(count);
        }
        lines.add(line.toString());
        docId++;
    }
    return lines;
}