The Inverted File Index Algorithm and Its Hadoop Implementation

triplebee
Published 2018-01-12 14:49:36

What is an inverted index of files?

In short, it is an algorithm used by search engines. With an inverted index, you can quickly retrieve the list of documents that contain a given word. An inverted index consists mainly of two parts: the "words" (terms) and the corresponding "inverted files" (the documents in which each word appears).

For a more detailed explanation, this blog post covers it well: http://blog.csdn.net/hguisu/article/details/7962350
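To make the idea concrete, here is a minimal in-memory sketch in Java (not part of the original post; the class name SimpleInvertedIndex and the sample documents are hypothetical) that maps each word to the set of documents containing it:

import java.util.*;

// Minimal in-memory inverted index sketch: word -> set of documents containing it.
public class SimpleInvertedIndex
{
    private final Map<String, Set<String>> index = new HashMap<>();

    // Record that every whitespace-separated word in `content` appears in `docName`.
    public void addDocument(String docName, String content)
    {
        for (String word : content.split("\\s+"))
        {
            index.computeIfAbsent(word, k -> new TreeSet<>()).add(docName);
        }
    }

    // Return the documents that contain `word` (empty set if none).
    public Set<String> lookup(String word)
    {
        return index.getOrDefault(word, Collections.emptySet());
    }

    public static void main(String[] args)
    {
        SimpleInvertedIndex idx = new SimpleInvertedIndex();
        idx.addDocument("doc1.txt", "hadoop mapreduce inverted index");
        idx.addDocument("doc2.txt", "hadoop hbase");
        System.out.println(idx.lookup("hadoop")); // [doc1.txt, doc2.txt]
    }
}

The MapReduce version below computes the same word-to-documents mapping, but distributes the work and additionally records per-document occurrence counts.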

1. MapReduce design approach

The whole process consists of three stages: map, combiner, and reduce. The key and value types for each stage are shown in the table below:

            InputKey    InputValue    OutputKey    OutputValue
Map         Object      Text          Text         Text
Combiner    Text        Text          Text         Text
Reduce      Text        Text          Text         Text

The input files are read with the default TextInputFormat. The three stages work as follows:

Map: tokenize each line of input; the output key is "word:document" and the output value is the occurrence count, here the Text string "1";

Combiner: for each input key, parse the values as integers and sum them, then move the document name from the key into the value; the output key is the "word" and the output value is "document:count";

Reduce: for each input key, split each value on the colon to extract the occurrence count, sum the counts while also counting the number of documents, and compute the average number of occurrences per document; the output key is the word together with its average count, and the output value is "document:count;…".

2. MapReduce code snippets

The Map code is as follows:
public static class Map extends Mapper<Object,Text,Text,Text>
{
    private Text valueInfo = new Text();
    private Text keyInfo = new Text();
    private FileSplit split;
    public void map(Object key, Text value, Context context) throws IOException, InterruptedException
    {
        split = (FileSplit) context.getInputSplit();
        StringTokenizer stk = new StringTokenizer(value.toString()); // split the line into words
        while (stk.hasMoreElements()) // while there are still words
        {
            String name = split.getPath().getName(); // get the file name
            int splitIndex = name.indexOf("."); // position of the dot in the file name
            keyInfo.set(stk.nextToken() + ":" + name.substring(0, splitIndex)); // word:file name without extension
            valueInfo.set("1"); // set the output value to "1"
            context.write(keyInfo, valueInfo); // write to the context
        }
    }
}
The Combiner code is as follows:
public static class Combiner extends Reducer<Text,Text,Text,Text>
{
    Text info = new Text();
    public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException
    {
        int sum = 0;
        for (Text value : values)
        {
            sum += Integer.parseInt(value.toString()); // accumulate occurrences of the word in this document
        }
        int splitIndex = key.toString().indexOf(":"); // position of the colon in the key
        info.set(key.toString().substring(splitIndex + 1) + ":" + sum); // set the value to document:count
        key.set(key.toString().substring(0, splitIndex)); // set the key to the word
        context.write(key, info); // write to the context
    }
}

The Reduce code is as follows:
public static class Reduce extends Reducer<Text,Text,Text,Text>
{
    private Text result = new Text();
    public void reduce(Text key, Iterable<Text> values, Context contex) throws IOException, InterruptedException
    {
        String fileList = new String();
        double sum = 0, cnt = 0;
        for (Text value : values)
        {
            cnt++; // count the number of documents the word appears in
            fileList += value.toString() + ";"; // separate document:count entries with semicolons
            int splitIndex = value.toString().indexOf(":");
            sum += Integer.parseInt(value.toString().substring(splitIndex + 1)); // accumulate the total occurrences
        }
        sum /= cnt; // average occurrences per document
        result.set(fileList); // set the output value
        key.set(key.toString() + '\t' + String.format("%.2f", sum)); // set the output key
        contex.write(key, result); // write to the context
    }
}
The final output key here is the word followed by its average occurrence count, and the value is "document:count;…".
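For illustration only, with hypothetical document names and counts rather than actual results from the run, one output line would look like:

江湖	3.50	书剑恩仇录:5;神雕侠侣:2;

that is, the word, its average count, and the semicolon-separated document:count list, separated by tab characters.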

Development environment: IntelliJ IDEA + Maven + Java 1.8

The inverted index was run over a collection of wuxia novels; a screenshot of the entry for "江湖" in the output file is shown below:

The complete code is as follows:

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.hadoop.hbase.client.HTable;


public class InvertedIndex
{
    private static Configuration conf2 = null;
    static
    {
        conf2 = HBaseConfiguration.create();
    }

    public static void addData(String tableName, String rowKey, String family,
                               String qualifier, String value )throws Exception
    {
        try
        {
            HTable table = new HTable(conf2, tableName);
            Put put = new Put(Bytes.toBytes(rowKey));
            put.add(Bytes.toBytes(family), Bytes.toBytes(qualifier), Bytes.toBytes(value));
            table.put(put);
            System.out.println("insert success!");
        }
        catch (IOException e)
        {
            e.printStackTrace();
        }
    }

    public static class Map extends Mapper<Object,Text,Text,Text>
    {
        private Text valueInfo = new Text();
        private Text keyInfo = new Text();
        private FileSplit split;
        public void map(Object key, Text value,Context context) throws IOException, InterruptedException
        {
            split = (FileSplit) context.getInputSplit();
            StringTokenizer stk = new StringTokenizer(value.toString());
            while (stk.hasMoreElements())
            {
                String name = split.getPath().getName();
                int splitIndex = name.indexOf(".");
                keyInfo.set(stk.nextToken() + ":" + name.substring(0, splitIndex));
                valueInfo.set("1");
                context.write(keyInfo, valueInfo);
            }
        }
    }

    public static class Combiner extends Reducer<Text,Text,Text,Text>
    {
        Text info = new Text();
        public void reduce(Text key, Iterable<Text> values,Context context) throws IOException, InterruptedException
        {
            int sum = 0;
            for (Text value : values)
            {
                sum += Integer.parseInt(value.toString());
            }
            int splitIndex = key.toString().indexOf(":");
            info.set(key.toString().substring(splitIndex+1) + ":" + sum);
            key.set(key.toString().substring(0,splitIndex));
            context.write(key, info);
        }
    }

    public static class Reduce extends Reducer<Text,Text,Text,Text>
    {
        private Text result = new Text();
        public void reduce(Text key, Iterable<Text> values,Context contex) throws IOException, InterruptedException
        {
            // build the document list
            String fileList = new String();
            double sum = 0 , cnt = 0;
            for (Text value : values)
            {
                cnt++;
                fileList += value.toString() + ";";
                int splitIndex = value.toString().indexOf(":");
                sum += Integer.parseInt(value.toString().substring(splitIndex+1));
            }
            sum /= cnt;

            result.set(fileList);
            //key.set(key.toString() + '\t' + String.format("%.2f", sum));
            try
            {
                addData("test", key.toString(), "BigData", "aveNum", String.format("%.2f", sum));
            }
            catch (Exception e)
            {
                e.printStackTrace();
            }
            contex.write(key, result);
        }
    }

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException
    {
        Configuration conf = new Configuration(); // configuration object
        Job job = new Job(conf, "InvertedIndex"); // create the job
        job.setJarByClass(InvertedIndex.class); // job class

        job.setMapperClass(Map.class); // mapper settings
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setCombinerClass(Combiner.class); // combiner settings

        job.setReducerClass(Reduce.class); // reducer settings
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        //FileInputFormat.addInputPath(job, new Path("/data/wuxia_novels/"));//路径设置
        //FileOutputFormat.setOutputPath(job, new Path("/user/2016st28/exp2/"));
        FileInputFormat.addInputPath(job, new Path("/input/exp2/")); // path settings
        FileOutputFormat.setOutputPath(job, new Path("/output/test/"));

        System.exit(job.waitForCompletion(true)?0:1);
    }
}