Integrating MapReduce with the New HBase API: WordCount Examples

First, count the words and write the result to a file on HDFS. This is the same plain WordCount job we have written before.

package com.my.myhnase.mapreduce;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordCount {
    // the constant 1 emitted for every word occurrence
    private static final IntWritable ONE = new IntWritable(1);
    /**
     * @author 汤高
     * In Mapper<LongWritable, Text, Text, IntWritable>, LongWritable and IntWritable are the
     * Hadoop writable types for long and int.
     *
     * LongWritable, Text are the input types: for word count the key is the byte offset of the
     * line within the file and the value is the line itself.
     * Text, IntWritable are the output types: the word and its count.
     * Note: the map output types (Text, IntWritable) differ from the map input types
     * (LongWritable, Text), so they must be declared explicitly on the job below.
     */
    // Map phase
    public static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            // each map() call receives one line; split it on whitespace
            String[] vs = value.toString().split("\\s+");
            for (String v : vs) {
                // emit (word, 1)
                context.write(new Text(v), ONE);
            }
        }
    }
    // Reduce phase
    /**
     * @author 汤高
     * Text, IntWritable are the input types: the map output becomes the reduce input.
     * Text, IntWritable are the output types: the word and its total count.
     */
    public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values,
                Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            int count = 0;
            for (IntWritable v : values) {
                count += v.get(); // accumulate the counts for this word
            }
            context.write(key, new IntWritable(count));
        }

    }

    public static void main(String[] args) {

        Configuration conf = new Configuration();
        try {
            // create a Job and give it a name
            Job job = Job.getInstance(conf, "wordcount1");
            // set the jar so the job can run on the Hadoop cluster
            job.setJarByClass(WordCount.class);
            // set the Mapper class
            job.setMapperClass(WordCountMapper.class);
            // declare the map output types explicitly, because they differ from the map input types
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);
            // set the Reducer class
            job.setReducerClass(WordCountReducer.class);
            // set the input and output paths
            FileInputFormat.addInputPath(job, new Path("hdfs://192.168.52.140:9000/in_2/"));
            FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.52.140:9000/myhbase" + System.currentTimeMillis()));
            // submit the job and wait for it to finish
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        } catch (IOException | ClassNotFoundException | InterruptedException e) {
            e.printStackTrace();
        }
    }


}
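
After this job finishes, its output directory contains the usual part files in "word<TAB>count" text form, which is exactly the layout the next job's KeyValueTextInputFormat expects. The small utility below is not part of the original jobs; it is a minimal verification sketch for inspecting that output, and the hard-coded path is a hypothetical example that you would replace with the timestamped directory your own run produced.

import java.io.BufferedReader;
import java.io.InputStreamReader;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Minimal verification sketch: print the lines of the WordCount output so the
// "word<TAB>count" format can be confirmed before it is loaded into HBase.
public class PrintWordCountOutput {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.52.140:9000");
        FileSystem fs = FileSystem.get(conf);
        // hypothetical output directory; use the timestamped path your run created
        Path outDir = new Path("/myhbase1463572723056");
        for (FileStatus status : fs.listStatus(outDir)) {
            if (!status.getPath().getName().startsWith("part-")) {
                continue; // skip _SUCCESS and other marker files
            }
            try (BufferedReader reader =
                    new BufferedReader(new InputStreamReader(fs.open(status.getPath())))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    System.out.println(line); // prints lines such as "hello<TAB>3"
                }
            }
        }
        fs.close();
    }
}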

Next, read the file just produced on HDFS and store the counts into an HBase table.

package com.my.myhnase.mapreduce;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;

// read the word counts from HDFS and store them into an HBase table
public class WordCountHbaseMapreduce01 {

    public static void main(String[] args) throws Exception {
        System.exit(run());
    }

    public static int run() throws Exception {
        Configuration conf = new Configuration();
        conf = HBaseConfiguration.create(conf);
        conf.set("hbase.zookeeper.quorum", "192.168.52.140");

        Job job = Job.getInstance(conf, "wordcount");
        job.setJarByClass(WordCountHbaseMapreduce01.class);

        // KeyValueTextInputFormat splits each line of the previous job's output at the first tab,
        // so the key is the word and the value is its count; no Mapper class is set, so the
        // identity Mapper forwards these (Text, Text) pairs straight to the reducer
        job.setInputFormatClass(KeyValueTextInputFormat.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(
                "hdfs://192.168.52.140:9000/myhbase1463572723056"));

        // write the results into the HBase table 'word'
        TableMapReduceUtil.initTableReducerJob("word",
                MyHbaseReducer.class, job);
        // make sure the target table exists before the job starts
        checkTable(conf);
        return job.waitForCompletion(true) ? 0 : 1;

    }
    // create the output table if it does not exist yet
    private static void checkTable(Configuration conf) throws Exception {
        Connection con = ConnectionFactory.createConnection(conf);
        Admin admin = con.getAdmin();
        TableName tn = TableName.valueOf("word");
        if (!admin.tableExists(tn)) {
            HTableDescriptor htd = new HTableDescriptor(tn);
            HColumnDescriptor hcd = new HColumnDescriptor("wordcount");
            htd.addFamily(hcd);
            admin.createTable(htd);
            System.out.println("Table did not exist; created it.");
        }
        admin.close();
        con.close();
    }
    // reducer that writes each (word, count) pair into HBase
    public static class MyHbaseReducer extends TableReducer<Text, Text, ImmutableBytesWritable> {

        @Override
        protected void reduce(Text key, Iterable<Text> values,
                Reducer<Text, Text, ImmutableBytesWritable, Mutation>.Context context)
                        throws IOException, InterruptedException {
            // convert the Text objects to Strings first and then to bytes with Bytes.toBytes();
            // Text.getBytes() returns the backing array, which can contain stale trailing bytes
            // and would make the stored counts slightly wrong
            byte[] row = Bytes.toBytes(key.toString());
            Put put = new Put(row);
            put.addColumn(Bytes.toBytes("wordcount"), Bytes.toBytes("num"),
                    Bytes.toBytes(values.iterator().next().toString()));
            context.write(new ImmutableBytesWritable(row), put);
        }
    }

}
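
Once this job has run, the 'word' table should contain one row per word, with the count stored in the wordcount:num column. The following is a minimal verification sketch (not part of the original post), assuming the same ZooKeeper quorum and table layout as above; it simply scans the table and prints each row.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

// Minimal verification sketch: scan the 'word' table and print each row key
// together with its 'wordcount:num' value.
public class ScanWordTable {

    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "192.168.52.140");
        try (Connection con = ConnectionFactory.createConnection(conf);
             Table table = con.getTable(TableName.valueOf("word"));
             ResultScanner scanner = table.getScanner(new Scan())) {
            for (Result result : scanner) {
                String word = Bytes.toString(result.getRow());
                String num = Bytes.toString(
                        result.getValue(Bytes.toBytes("wordcount"), Bytes.toBytes("num")));
                System.out.println(word + " = " + num);
            }
        }
    }
}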

Next, read the data back out of the HBase table and write it to a file on HDFS.

package com.my.myhnase.mapreduce;

import java.io.IOException;
import java.util.Date;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


// read the word counts back out of HBase and write them to an HDFS file
public class WordCountHbaseMapreduce02 {

    public static void main(String[] args) throws Exception {
        System.exit(run());
    }

    // Map phase: read each row of the 'word' table and emit (word, count) as text
    public static class MyHBaseMap02 extends TableMapper<Text, Text> {

        @Override
        protected void map(ImmutableBytesWritable key, Result value,
                Mapper<ImmutableBytesWritable, Result, Text, Text>.Context context)
                        throws IOException, InterruptedException {
            String word = null;
            String num = null;
            List<Cell> cs = value.listCells();
            for (Cell cell : cs) {
                word = Bytes.toString(CellUtil.cloneRow(cell));
                num = Bytes.toString(CellUtil.cloneValue(cell));
            }
            if (word != null && num != null) {
                context.write(new Text(word), new Text(num));
            }
        }
    }

    public static int run() throws Exception {
        Configuration conf = new Configuration();
        conf = HBaseConfiguration.create(conf);
        conf.set("hbase.zookeeper.quorum", "192.168.52.140");

        Job job = Job.getInstance(conf, "wordcount2");
        job.setJarByClass(WordCountHbaseMapreduce02.class);

        Scan scan = new Scan();
        // only read the column this job actually needs: wordcount:num
        scan.addColumn(Bytes.toBytes("wordcount"), Bytes.toBytes("num"));

        // the data source is HBase; ImmutableBytesWritable is the row-key type HBase hands to the mapper
        TableMapReduceUtil.initTableMapperJob("word", scan, MyHBaseMap02.class,
                Text.class, Text.class, job);
        // no Reducer class is set, so the identity reducer writes the (word, count) pairs
        // as tab-separated text lines into this output directory
        FileOutputFormat.setOutputPath(job, new Path(
                "hdfs://192.168.52.140:9000/hadoop_hbase_out" + new Date().getTime()));

        return job.waitForCompletion(true) ? 0 : 1;
    }

}

Finally, read the data from one HBase table and store it into another HBase table.

package com.my.myhnase.mapreduce;

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;


// read the word counts from the 'word' table and copy them into a second table, 'word2'
public class WordCountHbaseMapreduce03 {


    public static void main(String[] args) throws Exception {
        System.exit(run());
    }


    // Map phase: read each row of 'word' and emit the row key and the stored count as raw bytes
    public static class MyHBaseMap03 extends TableMapper<ImmutableBytesWritable, ImmutableBytesWritable> {

        @Override
        protected void map(ImmutableBytesWritable key, Result value,
                Mapper<ImmutableBytesWritable, Result, ImmutableBytesWritable, ImmutableBytesWritable>.Context context)
                        throws IOException, InterruptedException {
            ImmutableBytesWritable word = null;
            ImmutableBytesWritable num = null;
            List<Cell> cs = value.listCells();
            for (Cell cell : cs) {
                word = new ImmutableBytesWritable(CellUtil.cloneRow(cell));
                num = new ImmutableBytesWritable(CellUtil.cloneValue(cell));
            }
            if (word != null && num != null) {
                context.write(word, num);
            }
        }
    }

    private static int run() throws Exception {
        Configuration conf = new Configuration();
        conf = HBaseConfiguration.create(conf);
        conf.set("hbase.zookeeper.quorum", "192.168.52.140");

        Job job = Job.getInstance(conf, "wordcount3");
        job.setJarByClass(WordCountHbaseMapreduce03.class);


        Scan scan = new Scan();
        // only read the column this job actually needs: wordcount:num
        scan.addColumn(Bytes.toBytes("wordcount"), Bytes.toBytes("num"));

        // the data source is HBase; ImmutableBytesWritable is the row-key type HBase hands to the mapper
        TableMapReduceUtil.initTableMapperJob("word", scan, MyHBaseMap03.class,
                ImmutableBytesWritable.class, ImmutableBytesWritable.class, job);

        // make sure the output table exists, then point the reducer at it
        createTable(conf);
        TableMapReduceUtil.initTableReducerJob("word2", MyHBaseReduce03.class, job);


        return job.waitForCompletion(true) ? 0 : 1;
    }

    // create the output table 'word2' if it does not exist yet
    private static void createTable(Configuration conf) throws IOException {
        Connection con = ConnectionFactory.createConnection(conf);
        Admin admin = con.getAdmin();
        TableName tn = TableName.valueOf("word2");
        if (!admin.tableExists(tn)) {
            HTableDescriptor htd = new HTableDescriptor(tn);
            HColumnDescriptor hcd = new HColumnDescriptor("wordcount");
            htd.addFamily(hcd);
            admin.createTable(htd);
            System.out.println("Table did not exist; created it.");
        }
        admin.close();
        con.close();
    }

    // reducer that writes each (word, count) pair into the 'word2' table
    public static class MyHBaseReduce03 extends TableReducer<ImmutableBytesWritable, ImmutableBytesWritable, ImmutableBytesWritable> {

        @Override
        protected void reduce(ImmutableBytesWritable key, Iterable<ImmutableBytesWritable> values,
                Reducer<ImmutableBytesWritable, ImmutableBytesWritable, ImmutableBytesWritable, Mutation>.Context context)
                        throws IOException, InterruptedException {
            // copyBytes() returns exactly the wrapped bytes; get() can expose a larger
            // backing array and corrupt the stored row key or count
            Put put = new Put(key.copyBytes());
            put.addColumn(Bytes.toBytes("wordcount"), Bytes.toBytes("num"),
                    values.iterator().next().copyBytes());
            context.write(key, put);
        }
    }

}
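
If the job succeeded, 'word2' should now mirror 'word'. As a quick sanity check, the sketch below (again an illustrative addition, assuming the same cluster settings as above) counts the rows in both tables so the two can be compared.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;

// Minimal sanity-check sketch: count the rows of 'word' and 'word2' and print both totals.
public class CompareWordTables {

    private static long countRows(Connection con, String name) throws Exception {
        try (Table table = con.getTable(TableName.valueOf(name));
             ResultScanner scanner = table.getScanner(new Scan())) {
            long rows = 0;
            for (Result ignored : scanner) {
                rows++;
            }
            return rows;
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "192.168.52.140");
        try (Connection con = ConnectionFactory.createConnection(conf)) {
            System.out.println("word  rows: " + countRows(con, "word"));
            System.out.println("word2 rows: " + countRows(con, "word2"));
        }
    }
}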
