先计数单词数量存到hdfs文件上,这个是以前的就做过的
package com.my.myhnase.mapreduce;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class WordCount {
//计数变量
private static final IntWritable ONE = new IntWritable(1);
/**
*
* @author 汤高
* Mapper<LongWritable, Text, Text, IntWritable>中 LongWritable,IntWritable是Hadoop数据类型表示长整型和整形
*
* LongWritable, Text表示输入类型 (比如本应用单词计数输入是 偏移量(字符串中的第一个单词的其实位置),对应的单词(值))
* Text, IntWritable表示输出类型 输出是单词 和他的个数
* 注意:map函数中前两个参数LongWritable key, Text value和输出类型不一致
* 所以后面要设置输出类型 要使他们一致
*/
//Map过程
public static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
/***
*
*/
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
throws IOException, InterruptedException {
//默认的map的value是每一行,我这里自定义的是以空格分割
String[] vs = value.toString().split("\\s");
for (String v : vs) {
//写出去
context.write(new Text(v), ONE);
}
}
}
//Reduce过程
/***
* @author 汤高
* Text, IntWritable输入类型,从map过程获得 既map的输出作为Reduce的输入
* Text, IntWritable输出类型
*/
public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
@Override
protected void reduce(Text key, Iterable<IntWritable> values,
Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
int count=0;
for(IntWritable v:values){
count+=v.get();//单词个数加一
}
context.write(key, new IntWritable(count));
}
}
public static void main(String[] args) {
Configuration conf=new Configuration();
try {
//得到一个Job 并设置名字
Job job=Job.getInstance(conf,"wordcount1");
//设置Jar 使本程序在Hadoop中运行
job.setJarByClass(WordCount.class);
//设置Map处理类
job.setMapperClass(WordCountMapper.class);
//设置map的输出类型,因为不一致,所以要设置
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
//设置Reduce处理类
job.setReducerClass(WordCountReducer.class);
//设置输入和输出目录
FileInputFormat.addInputPath(job, new Path("hdfs://192.168.52.140:9000/in_2/"));
FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.52.140:9000/myhbase"+System.currentTimeMillis()));
//启动运行
System.exit(job.waitForCompletion(true) ? 0:1);
} catch (IOException e) {
e.printStackTrace();
} catch (ClassNotFoundException e) {
e.printStackTrace();
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
然后从hdfs读取刚刚产生的文件存到hbase表中
package com.my.myhnase.mapreduce;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
//从hdfs读取文件存到hbase
public class WordCountHbaseMapreduce01 {
public static void main(String[] args) throws Exception {
System.exit(run());
}
public static int run() throws Exception {
Configuration conf = new Configuration();
conf = HBaseConfiguration.create(conf);
conf.set("hbase.zookeeper.quorum", "192.168.52.140");
Job job = Job.getInstance(conf, "wordcount");
job.setJarByClass(WordCountHbaseMapreduce01.class);
job.setInputFormatClass(KeyValueTextInputFormat.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(
"hdfs://192.168.52.140:9000/myhbase1463572723056"));
// 把数据写入Hbase数据库
TableMapReduceUtil.initTableReducerJob("word",
MyHbaseReducer.class, job);
checkTable(conf);
return job.waitForCompletion(true) ? 0 : 1;
}
//创建表
private static void checkTable(Configuration conf) throws Exception {
Connection con = ConnectionFactory.createConnection(conf);
Admin admin = con.getAdmin();
TableName tn = TableName.valueOf("word");
if (!admin.tableExists(tn)){
HTableDescriptor htd = new HTableDescriptor(tn);
HColumnDescriptor hcd = new HColumnDescriptor("wordcount");
htd.addFamily(hcd);
admin.createTable(htd);
System.out.println("表不存在,新创建表成功....");
}
}
//把数据存到hbase
public static class MyHbaseReducer extends TableReducer<Text, Text, ImmutableBytesWritable>{
@Override
protected void reduce(Text key, Iterable<Text> values,
Reducer<Text, Text, ImmutableBytesWritable, Mutation>.Context context)
throws IOException, InterruptedException {
// T一定要先tostring然后再转为byte,不然会词会有点不准确
Put put=new Put(key.toString().getBytes());
put.addColumn(Bytes.toBytes("wordcount"), Bytes.toBytes("num"), values.iterator().next().getBytes());
context.write(new ImmutableBytesWritable(key.getBytes()), put);
}
}
}
从hbase表读取数据存到hdfs文件中
package com.my.myhnase.mapreduce;
import java.io.IOException;
import java.util.Date;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
//从hbase读取内容到hdfs文件上
public class WordCountHbaseMapreduce02 {
public static class MyHBaseMap02 extends TableMapper<Text, Text>{
public static void main(String[] args) throws Exception {
System.exit(run());
}
@Override
protected void map(ImmutableBytesWritable key, Result value,
Mapper<ImmutableBytesWritable, Result, Text, Text>.Context context)
throws IOException, InterruptedException {
String word=null;
String num=null;
List<Cell> cs=value.listCells();
for(Cell cell:cs){
word=Bytes.toString(CellUtil.cloneRow(cell));
num=Bytes.toString(CellUtil.cloneValue(cell));
}
context.write(new Text(word), new Text(num));
}
public static int run() throws Exception {
Configuration conf = new Configuration();
conf = HBaseConfiguration.create(conf);
conf.set("hbase.zookeeper.quorum", "192.168.52.140");
Job job = Job.getInstance(conf, "wordcount2");
job.setJarByClass(WordCountHbaseMapreduce02.class);
Scan scan = new Scan();
//取对业务有用的数据 tags, nickname
scan.addColumn(Bytes.toBytes("wordcount"), Bytes.toBytes("num"));
//数据来源 hbase
//TableInputFormat.addColumns(scan, columns);
//ImmutableBytesWritable来自hbase数据的类型
TableMapReduceUtil.initTableMapperJob("word", scan, MyHBaseMap02.class,
Text.class, Text.class, job);
FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.52.140:9000/hadoop_hbase_out" + new Date().getTime()));
return job.waitForCompletion(true) ? 0 : 1;
}
}
}
从hbase表读取数据存到hbase表
package com.my.myhnase.mapreduce;
import java.io.IOException;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
public class WordCountHbaseMapreduce03 {
public static void main(String[] args) throws Exception {
System.exit(run());
}
public static class MyHBaseMap03 extends TableMapper<ImmutableBytesWritable, ImmutableBytesWritable>{
@Override
protected void map(ImmutableBytesWritable key, Result value,
Mapper<ImmutableBytesWritable, Result, ImmutableBytesWritable, ImmutableBytesWritable>.Context context)
throws IOException, InterruptedException {
// TODO Auto-generated method stub
ImmutableBytesWritable word=null;
ImmutableBytesWritable num=null;
List<Cell> cs=value.listCells();
for(Cell cell:cs){
word=new ImmutableBytesWritable(CellUtil.cloneRow(cell));
num=new ImmutableBytesWritable(CellUtil.cloneValue(cell));
}
context.write(word, num);
}
}
private static int run() throws Exception {
Configuration conf = new Configuration();
conf = HBaseConfiguration.create(conf);
conf.set("hbase.zookeeper.quorum", "192.168.52.140");
Job job = Job.getInstance(conf, "wordcount3");
job.setJarByClass(WordCountHbaseMapreduce03.class);
Scan scan = new Scan();
//取对业务有用的数据 tags, nickname
scan.addColumn(Bytes.toBytes("wordcount"), Bytes.toBytes("num"));
//数据来源 hbase
//TableInputFormat.addColumns(scan, columns);
//ImmutableBytesWritable来自hbase数据的类型
TableMapReduceUtil.initTableMapperJob("word", scan, MyHBaseMap03.class,
ImmutableBytesWritable.class, ImmutableBytesWritable.class, job);
createTable(conf);
TableMapReduceUtil.initTableReducerJob("word2", MyHBaseReduce03.class, job);
return job.waitForCompletion(true) ? 0 : 1;
}
private static void createTable(Configuration conf) throws IOException{
Connection con = ConnectionFactory.createConnection(conf);
Admin admin = con.getAdmin();
TableName tn = TableName.valueOf("word2");
if (!admin.tableExists(tn)){
HTableDescriptor htd = new HTableDescriptor(tn);
HColumnDescriptor hcd = new HColumnDescriptor("wordcount");
htd.addFamily(hcd);
admin.createTable(htd);
System.out.println("表不存在,新创建表成功....");
}
}
public static class MyHBaseReduce03 extends TableReducer<ImmutableBytesWritable, ImmutableBytesWritable,ImmutableBytesWritable>{
@Override
protected void reduce(ImmutableBytesWritable key, Iterable<ImmutableBytesWritable> values,
Reducer<ImmutableBytesWritable, ImmutableBytesWritable, ImmutableBytesWritable, Mutation>.Context context)
throws IOException, InterruptedException {
// TODO Auto-generated method stub
Put put=new Put(key.get());
put.addColumn(Bytes.toBytes("wordcount"), Bytes.toBytes("num"), values.iterator().next().get());
context.write(new ImmutableBytesWritable(key.get()), put);
}
}
}