什么是文件的倒排索引?
简单讲就是一种搜索引擎的算法。通过倒排索引,可以根据单词快速获取包含这个单词的文档列表。倒排索引主要由两个部分组成:“单词词典”和对应的“倒排文件”。
详细解释有一篇博客说得挺好:http://blog.csdn.net/hguisu/article/details/7962350
整个过程包含map、combiner、reduce三个阶段,它们各自对应的key和value类型如下表所示:
阶段 | InputKey | InputValue | OutputKey | OutputValue |
---|---|---|---|---|
Map | Object | Text | Text | Text |
Combiner | Text | Text | Text | Text |
Reduce | Text | Text | Text | Text |
使用默认的TextInputFormat读入文件,三个部分的具体操作如下:
Map:将每一行的内容分词,输出key为“单词:文章”,输出value为“出现次数”,这里是Text类型的“1”;
Combiner:针对每一个输入key,将value值转为int数值累加,并将key中的文章放入value,输出key为“单词”,输出value为“文章:出现次数;……”;
Reduce:针对每一个输入key,以冒号分割,将value值中的出现次数取出来累加,并记录文章数量,计算出平均出现次数,输出key为“单词 平均出现次数”,输出value为“文章:出现次数;……”
2. MapReduce的代码片段
Map代码如下:
public static class Map extends Mapper<Object,Text,Text,Text>
{
private TextvalueInfo = new Text();
private TextkeyInfo = new Text();
privateFileSplit split;
public void map(Object key, Text value,Context context) throws IOException,InterruptedException
{
split =(FileSplit) context.getInputSplit();
StringTokenizerstk = new StringTokenizer(value.toString());//单词分割
while(stk.hasMoreElements()) //还有单词
{
Stringname = split.getPath().getName();//获取文件名
intsplitIndex = name.indexOf(".");//获取文件名中点的位置
keyInfo.set(stk.nextToken()+ ":" + name.substring(0, splitIndex));//单词:去后缀文件名
valueInfo.set("1");//outputValue置为1
context.write(keyInfo,valueInfo);//写入context
}
}
}
Combiner代码如下:
public static class Combiner extends Reducer<Text,Text,Text,Text>
{
Text info =new Text();
public void reduce(Text key,Iterable<Text> values,Context context)
throwsIOException, InterruptedException
{
int sum = 0;
for (Textvalue : values)
{
sum +=Integer.parseInt(value.toString());//累加同单词在同文章中出现次数
}
intsplitIndex = key.toString().indexOf(":");//获取key中的冒号位置
info.set(key.toString().substring(splitIndex+1)+ ":" + sum);//设置value为文章:次数
key.set(key.toString().substring(0,splitIndex));//设置key为单词
context.write(key,info);//写入context
}
}
Reduce代码如下:
public static class Reduce extends Reducer<Text,Text,Text,Text>
{
private Textresult = new Text();
public void reduce(Text key, Iterable<Text> values,Context contex)
throwsIOException, InterruptedException
{
StringfileList = new String();
double sum =0 , cnt = 0;
for (Textvalue : values)
{
cnt++;//统计出现的文章数
fileList+= value.toString() + ";";//文章次数之间加分号
intsplitIndex = value.toString().indexOf(":");
sum +=Integer.parseInt(value.toString().substring(splitIndex+1));//统计出现总次数
}
sum /= cnt;//计算平均次数
result.set(fileList);//设置value值
key.set(key.toString()+ '\t' + String.format("%.2f", sum));//设置key值
contex.write(key,result);//写入context
}
}
这里最终输出的key是“单词平均出现次数”,
Value是“文章:出现次数;……”。
开发环境: IntelliJ IDEA + Maven + Java 1.8
对武侠小说集合进行倒排索引,输出文件中“江湖”一词的截图如下:
完整代码如下:
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.commons.lang.ObjectUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.hbase.client.HTable;
public class InvertedIndex
{
    // HBase configuration shared by all addData() calls; built once in the static block.
    private static Configuration conf2 = null;
    static
    {
        conf2 = HBaseConfiguration.create();
    }

    /**
     * Writes one cell (family:qualifier = value) into the given row of an HBase table.
     * IOExceptions from the put are caught and logged (best-effort write, as in the
     * original); the table handle is always released.
     *
     * @param tableName HBase table to write to
     * @param rowKey    row key of the target row
     * @param family    column family name
     * @param qualifier column qualifier name
     * @param value     cell value to store
     * @throws Exception if releasing the table handle fails
     */
    public static void addData(String tableName, String rowKey, String family,
            String qualifier, String value) throws Exception
    {
        // NOTE(review): opening a new HTable per call is expensive when invoked once
        // per reduce key; consider reusing a shared connection — TODO confirm.
        HTable table = null;
        try
        {
            table = new HTable(conf2, tableName);
            Put put = new Put(Bytes.toBytes(rowKey));
            put.add(Bytes.toBytes(family), Bytes.toBytes(qualifier), Bytes.toBytes(value));
            table.put(put);
            System.out.println("insert success!");
        }
        catch (IOException e)
        {
            e.printStackTrace();
        }
        finally
        {
            // Fix: the original leaked the HTable handle on every call.
            if (table != null)
            {
                table.close();
            }
        }
    }

    /**
     * Mapper: tokenizes each input line and emits ("word:document", "1") for every
     * word occurrence. The document name is the input file name without extension.
     */
    public static class Map extends Mapper<Object,Text,Text,Text>
    {
        private Text valueInfo = new Text();
        private Text keyInfo = new Text();
        private FileSplit split;

        @Override
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException
        {
            split = (FileSplit) context.getInputSplit();
            // The file name is invariant for the whole split, so compute it once
            // instead of once per token (hoisted out of the loop).
            String name = split.getPath().getName();
            int splitIndex = name.indexOf(".");
            // Fix: a file name without a dot made substring(0, -1) throw
            // StringIndexOutOfBoundsException; fall back to the full name.
            String docName = (splitIndex == -1) ? name : name.substring(0, splitIndex);
            StringTokenizer stk = new StringTokenizer(value.toString());
            while (stk.hasMoreTokens())
            {
                keyInfo.set(stk.nextToken() + ":" + docName); // key = word:document
                valueInfo.set("1");                           // one occurrence
                context.write(keyInfo, valueInfo);
            }
        }
    }

    /**
     * Combiner: sums the per-(word:document) counts and re-keys the pair as
     * key = "word", value = "document:count".
     *
     * NOTE(review): this combiner changes the key, which violates the Hadoop
     * combiner contract — a combiner may run zero or more times, and if it is
     * skipped the reducer receives "word:document" keys instead of "word".
     * Kept as-is to preserve the published behavior; TODO confirm intent.
     */
    public static class Combiner extends Reducer<Text,Text,Text,Text>
    {
        Text info = new Text();

        @Override
        public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException
        {
            // Total occurrences of this word in this document.
            int sum = 0;
            for (Text value : values)
            {
                sum += Integer.parseInt(value.toString());
            }
            int splitIndex = key.toString().indexOf(":");
            info.set(key.toString().substring(splitIndex + 1) + ":" + sum); // value = document:count
            key.set(key.toString().substring(0, splitIndex));               // key = word
            context.write(key, info);
        }
    }

    /**
     * Reducer: for each word, concatenates "document:count;" entries, computes the
     * average occurrence count across documents, stores that average into HBase
     * (table "test", family "BigData", qualifier "aveNum", row = the word), and
     * emits (word, "document:count;...").
     */
    public static class Reduce extends Reducer<Text,Text,Text,Text>
    {
        private Text result = new Text();

        @Override
        public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException
        {
            // Fix: StringBuilder instead of repeated String concatenation in the
            // loop (the original was O(n^2) in the number of documents).
            StringBuilder fileList = new StringBuilder();
            double sum = 0, cnt = 0;
            for (Text value : values)
            {
                cnt++; // number of documents containing the word
                fileList.append(value.toString()).append(";");
                int splitIndex = value.toString().indexOf(":");
                sum += Integer.parseInt(value.toString().substring(splitIndex + 1)); // total occurrences
            }
            sum /= cnt; // average occurrences per document
            result.set(fileList.toString());
            try
            {
                addData("test", key.toString(), "BigData", "aveNum", String.format("%.2f", sum));
            }
            catch (Exception e)
            {
                e.printStackTrace();
            }
            context.write(key, result);
        }
    }

    /**
     * Job driver: wires the Map/Combiner/Reduce classes, sets Text key/value types,
     * reads from /input/exp2/ and writes to /output/test/.
     */
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException
    {
        Configuration conf = new Configuration();
        // Job.getInstance(conf, ...) is preferred on Hadoop 2+; the constructor is
        // kept here for compatibility with the project's Hadoop version.
        Job job = new Job(conf, "InvertedIndex");
        job.setJarByClass(InvertedIndex.class);
        job.setMapperClass(Map.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setCombinerClass(Combiner.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path("/input/exp2/"));
        FileOutputFormat.setOutputPath(job, new Path("/output/test/"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}