要建立词典,最基本的应该有词典的描述信息、词典索引文件以及词典数据文件。 /// <summary> /// 索引文件 /// </summary> string idxFile = "dic.idx"; /// <summary> /// 数据文件 /// </summary> string dictfile = "dic.dict"; /// <summary> /// 词典信息文件 /// </summary> string ifoFile = "dic.ifo"; 我们建立对应的三个类
详细的代码如下:
/// <summary>
/// Explanation payload of a single dictionary entry.
/// </summary>
class DictWord
{
    /// <summary>
    /// The explanation text of the entry.
    /// </summary>
    public string Description { get; set; }
}
/// <summary>
/// Index record of a dictionary entry: the word plus the location of its
/// explanation inside the data file.
/// </summary>
class DictIndex
{
    /// <summary>
    /// The word (headword) this record belongs to.
    /// </summary>
    public string Word { get; set; }

    /// <summary>
    /// Position in the data file at which the explanation starts.
    /// </summary>
    public int Offset { get; set; }

    /// <summary>
    /// Size of the explanation data.
    /// </summary>
    public int DataSize { get; set; }
}
/// <summary>
/// Descriptive information about a dictionary.
/// </summary>
class DictInfo
{
    /// <summary>
    /// Name of the dictionary.
    /// </summary>
    public string BookName { get; set; }

    /// <summary>
    /// Number of words contained in the dictionary.
    /// </summary>
    public int WordCount { get; set; }

    /// <summary>
    /// The current offset.
    /// </summary>
    public int CurrentOffset { get; set; }
}
数据结构说明:
建立词典比较简单,首先,定义几个变量来存储词典相关信息: DictInfo info; SortedList<string, DictIndex> indexs; List<DictWord> words;
ps: SortedList能直接排序,不用我们再手动排序了
然后我们来看添加词语:
/// <summary>
/// Adds a word and its explanation to the dictionary being built.
/// </summary>
/// <param name="word">The word; it becomes the key of the sorted index.</param>
/// <param name="description">The explanation text stored for the word.</param>
/// <exception cref="ArgumentException">An entry for <paramref name="word"/> already exists.</exception>
public void Add(string word, string description)
{
    // Measure the UTF-8 payload once: it is this entry's data size and also
    // the amount by which the running offset advances.
    int dataSize = Encoding.UTF8.GetByteCount(description);

    // Register the index entry first. SortedList.Add throws on a duplicate key;
    // failing here leaves words, WordCount and CurrentOffset untouched, so the
    // offsets recorded in the index keep matching the data written by Save.
    indexs.Add(word, new DictIndex { Word = word, Offset = info.CurrentOffset, DataSize = dataSize });
    words.Add(new DictWord { Description = description });

    // One more word in the dictionary.
    info.WordCount++;
    // The next explanation starts right after this one.
    info.CurrentOffset += dataSize;
}
非常简单,就是添加索引,同时把词典的数量加1,并把当前偏移加上该解释的UTF-8字节数
最后来看怎么存储到文件:
/// <summary>
/// Writes the dictionary to disk: the info file, the data file and the index file.
/// </summary>
/// <remarks>
/// Every index record is 136 bytes: a 128-byte zero-padded UTF-8 word field
/// followed by the 4-byte offset and the 4-byte data size.
/// </remarks>
public void Save()
{
    // Info file: one "key=value" line per field.
    StringBuilder infoBuilder = new StringBuilder();
    infoBuilder.AppendLine(string.Format("BookName={0}", info.BookName));
    infoBuilder.AppendLine(string.Format("WordCount={0}", info.WordCount));
    infoBuilder.AppendLine(string.Format("CurrentOffset={0}", info.CurrentOffset));
    File.WriteAllText(ifoFile, infoBuilder.ToString(), Encoding.UTF8);

    // Data file: the explanations back to back, in the order they were added.
    using (BinaryWriter dictWriter = new BinaryWriter(File.Open(dictfile, FileMode.Create)))
    {
        foreach (var word in words)
        {
            dictWriter.Write(Encoding.UTF8.GetBytes(word.Description));
        }
    }

    // Index file: fixed-size records, enumerated in the sorted order of the list.
    using (BinaryWriter idxWriter = new BinaryWriter(File.Open(idxFile, FileMode.Create)))
    {
        foreach (var index in indexs)
        {
            // The word field holds at most 128 bytes; shorter words stay zero-padded.
            byte[] wordField = new byte[128];
            byte[] wordData = Encoding.UTF8.GetBytes(index.Key);
            int length = Math.Min(wordField.Length, wordData.Length);

            // When the word has to be cut, never cut through a multi-byte UTF-8
            // sequence: step back while the first dropped byte is a continuation
            // byte (10xxxxxx), otherwise the word decodes with a replacement character.
            while (length > 0 && length < wordData.Length && (wordData[length] & 0xC0) == 0x80)
            {
                length--;
            }

            Array.Copy(wordData, wordField, length);
            idxWriter.Write(wordField);
            idxWriter.Write(index.Value.Offset);
            idxWriter.Write(index.Value.DataSize);
        }
    }
}
这里注意下:word 最多能存 128 个字节,每个 index 区的大小为 128+4+4 = 136 字节
前面做这么多准备,不都是为了查询吗?木有查询,神马都是浮云!
前面说到了,索引文件存储的是排序好的词语列表,所以查询就比较简单了 先给出两个辅助方法: idxStream = new FileStream(idxFile, FileMode.Open); idxReader = new BinaryReader(idxStream); dictStream = new FileStream(dictfile, FileMode.Open); dictReader = new BinaryReader(dictStream); (1) 获取指定位置的索引
/// <summary>
/// Reads the index record stored at the given position of the index file.
/// </summary>
/// <param name="wordIndex">Zero-based position of the record in the index file.</param>
/// <returns>The word, offset and data size read from that record.</returns>
public DictIndex GetWordIndex(int wordIndex)
{
    const int WordFieldSize = 128;
    // Record layout: word field + 4-byte offset + 4-byte data size = 136 bytes.
    const int RecordSize = WordFieldSize + 4 + 4;

    // Multiply as long: evaluated in int, wordIndex * 136 overflows for large
    // indices before it is widened for Seek. One absolute seek is enough.
    idxStream.Seek((long)wordIndex * RecordSize, SeekOrigin.Begin);

    byte[] wordField = idxReader.ReadBytes(WordFieldSize);
    var dicIndex = new DictIndex();
    // The word field is zero-padded; drop the padding after decoding.
    dicIndex.Word = Encoding.UTF8.GetString(wordField).Replace("\0", "");
    dicIndex.Offset = idxReader.ReadInt32();
    dicIndex.DataSize = idxReader.ReadInt32();
    return dicIndex;
}
(2)获取指定索引对应的词语解释
/// <summary>
/// Reads the explanation that an index record points to.
/// </summary>
/// <param name="dictIndex">Index record supplying the offset and size of the data.</param>
/// <returns>The explanation decoded as UTF-8, with any NUL characters removed.</returns>
public string GetWordDescription(DictIndex dictIndex)
{
    // A single absolute seek covers both the zero and the non-zero offset case.
    dictStream.Seek(dictIndex.Offset, SeekOrigin.Begin);

    byte[] payload = dictReader.ReadBytes(dictIndex.DataSize);
    string text = Encoding.UTF8.GetString(payload);
    return text.Replace("\0", "");
}
现在开始二分查找:
/// <summary>
/// Looks a word up with a binary search over the sorted index file and returns
/// its explanation.
/// </summary>
/// <param name="word">The word to look up.</param>
/// <returns>
/// "[word]" followed by a line break and the explanation. When there is no exact
/// match, the last record probed by the search is returned instead; for an empty
/// dictionary the word and the explanation are both empty.
/// </returns>
public string GetDescription(string word)
{
    int low = 0;
    // Inclusive upper bound: the last valid record is WordCount - 1. Starting at
    // WordCount lets the search probe one record past the end of the index file,
    // where reading the offset fails with EndOfStreamException.
    int high = info.WordCount - 1;
    DictIndex w = new DictIndex();
    while (low <= high)
    {
        int mid = low + (high - low) / 2;
        w = GetWordIndex(mid);

        // Compare once per probe and branch on the sign.
        int order = string.Compare(w.Word, word);
        if (order == 0)
        {
            break;
        }

        if (order > 0)
        {
            high = mid - 1;
        }
        else
        {
            low = mid + 1;
        }
    }
    return "[" + w.Word + "]\n" + GetWordDescription(w);
}
此部分完整代码:
/// <summary>
/// A small file-backed dictionary made of three files: an info file, an index
/// file of fixed-size records sorted by word, and a data file holding the
/// explanations. Create it with a name to build a dictionary (Add, then Save),
/// or without arguments to query the saved files (GetDescription).
/// </summary>
class Dict : IDisposable
{
    /// <summary>Bytes reserved for the zero-padded UTF-8 word in an index record.</summary>
    const int WordFieldSize = 128;

    /// <summary>Size of one index record: word field + 4-byte offset + 4-byte data size.</summary>
    const int IndexRecordSize = WordFieldSize + 4 + 4;

    DictInfo info;
    // The generic arguments are required: Save enumerates key/value pairs and
    // reads Description from the list items.
    SortedList<string, DictIndex> indexs;
    List<DictWord> words;

    /// <summary>
    /// Index file.
    /// </summary>
    string idxFile = "dic.idx";

    /// <summary>
    /// Data file.
    /// </summary>
    string dictfile = "dic.dict";

    /// <summary>
    /// Dictionary info file.
    /// </summary>
    string ifoFile = "dic.ifo";

    BinaryReader idxReader;
    FileStream idxStream;
    BinaryReader dictReader;
    FileStream dictStream;

    /// <summary>
    /// Opens the saved dictionary files for querying.
    /// </summary>
    public Dict()
    {
        LoadDictInfo();
        try
        {
            // Queries only read, so read access is sufficient.
            idxStream = new FileStream(idxFile, FileMode.Open, FileAccess.Read);
            idxReader = new BinaryReader(idxStream);
            dictStream = new FileStream(dictfile, FileMode.Open, FileAccess.Read);
            dictReader = new BinaryReader(dictStream);
        }
        catch
        {
            // Do not leave the first file open when opening the second one fails.
            Dispose();
            throw;
        }
    }

    /// <summary>
    /// Starts a new, empty dictionary that is filled with Add and written with Save.
    /// </summary>
    /// <param name="name">Name of the dictionary.</param>
    public Dict(string name)
    {
        info = new DictInfo { BookName = name, WordCount = 0, CurrentOffset = 0 };
        indexs = new SortedList<string, DictIndex>();
        words = new List<DictWord>();
    }

    /// <summary>
    /// Looks a word up with a binary search over the sorted index file and
    /// returns its explanation.
    /// </summary>
    /// <param name="word">The word to look up.</param>
    /// <returns>
    /// "[word]" followed by a line break and the explanation. When there is no
    /// exact match, the last record probed by the search is returned instead;
    /// for an empty dictionary the word and the explanation are both empty.
    /// </returns>
    public string GetDescription(string word)
    {
        int low = 0;
        // Inclusive upper bound: the last valid record is WordCount - 1. Starting
        // at WordCount lets the search probe one record past the end of the file.
        int high = info.WordCount - 1;
        DictIndex w = new DictIndex();
        while (low <= high)
        {
            int mid = low + (high - low) / 2;
            w = GetWordIndex(mid);

            // NOTE(review): string.Compare(string, string) and the default
            // SortedList comparer are both culture-sensitive, so the index should
            // be built and queried under the same culture - confirm for deployment.
            int order = string.Compare(w.Word, word);
            if (order == 0)
            {
                break;
            }

            if (order > 0)
            {
                high = mid - 1;
            }
            else
            {
                low = mid + 1;
            }
        }
        return "[" + w.Word + "]\n" + GetWordDescription(w);
    }

    /// <summary>
    /// Reads the index record stored at the given position of the index file.
    /// </summary>
    /// <param name="wordIndex">Zero-based position of the record in the index file.</param>
    /// <returns>The word, offset and data size read from that record.</returns>
    public DictIndex GetWordIndex(int wordIndex)
    {
        // Multiply as long so large indices cannot overflow int before the seek.
        idxStream.Seek((long)wordIndex * IndexRecordSize, SeekOrigin.Begin);

        byte[] wordField = idxReader.ReadBytes(WordFieldSize);
        var dicIndex = new DictIndex();
        // The word field is zero-padded; drop the padding after decoding.
        dicIndex.Word = Encoding.UTF8.GetString(wordField).Replace("\0", "");
        dicIndex.Offset = idxReader.ReadInt32();
        dicIndex.DataSize = idxReader.ReadInt32();
        return dicIndex;
    }

    /// <summary>
    /// Reads the explanation that an index record points to.
    /// </summary>
    /// <param name="dictIndex">Index record supplying the offset and size of the data.</param>
    /// <returns>The explanation decoded as UTF-8, with any NUL characters removed.</returns>
    public string GetWordDescription(DictIndex dictIndex)
    {
        // One absolute seek covers both the zero and the non-zero offset case.
        dictStream.Seek(dictIndex.Offset, SeekOrigin.Begin);
        byte[] data = dictReader.ReadBytes(dictIndex.DataSize);
        return Encoding.UTF8.GetString(data).Replace("\0", "");
    }

    /// <summary>
    /// Adds a word and its explanation to the dictionary being built.
    /// </summary>
    /// <param name="word">The word; it becomes the key of the sorted index.</param>
    /// <param name="description">The explanation text stored for the word.</param>
    /// <exception cref="ArgumentException">An entry for <paramref name="word"/> already exists.</exception>
    public void Add(string word, string description)
    {
        // Measure the UTF-8 payload once: it is this entry's data size and also
        // the amount by which the running offset advances.
        int dataSize = Encoding.UTF8.GetByteCount(description);

        // Register the index entry first. SortedList.Add throws on a duplicate
        // key; failing here leaves words, WordCount and CurrentOffset untouched,
        // so the recorded offsets keep matching the data written by Save.
        indexs.Add(word, new DictIndex { Word = word, Offset = info.CurrentOffset, DataSize = dataSize });
        words.Add(new DictWord { Description = description });

        info.WordCount++;
        info.CurrentOffset += dataSize;
    }

    /// <summary>
    /// Loads the dictionary info from the first three lines of the info file.
    /// </summary>
    void LoadDictInfo()
    {
        var infos = File.ReadAllLines(ifoFile);
        info = new DictInfo
        {
            BookName = ReadInfoValue(infos[0], "BookName"),
            WordCount = int.Parse(ReadInfoValue(infos[1], "WordCount")),
            CurrentOffset = int.Parse(ReadInfoValue(infos[2], "CurrentOffset")),
        };
    }

    /// <summary>
    /// Returns the trimmed value of a "key=value" line. Only a leading "key=" is
    /// removed, so a value that itself contains that text is kept intact.
    /// </summary>
    static string ReadInfoValue(string line, string key)
    {
        string prefix = key + "=";
        string value = line.StartsWith(prefix, StringComparison.Ordinal)
            ? line.Substring(prefix.Length)
            : line;
        return value.Trim();
    }

    /// <summary>
    /// Writes the dictionary to disk: the info file, the data file and the index file.
    /// </summary>
    public void Save()
    {
        // Info file: one "key=value" line per field.
        StringBuilder infoBuilder = new StringBuilder();
        infoBuilder.AppendLine(string.Format("BookName={0}", info.BookName));
        infoBuilder.AppendLine(string.Format("WordCount={0}", info.WordCount));
        infoBuilder.AppendLine(string.Format("CurrentOffset={0}", info.CurrentOffset));
        File.WriteAllText(ifoFile, infoBuilder.ToString(), Encoding.UTF8);

        // Data file: the explanations back to back, in the order they were added.
        using (BinaryWriter dictWriter = new BinaryWriter(File.Open(dictfile, FileMode.Create)))
        {
            foreach (var word in words)
            {
                dictWriter.Write(Encoding.UTF8.GetBytes(word.Description));
            }
        }

        // Index file: fixed-size records, enumerated in the sorted order of the list.
        using (BinaryWriter idxWriter = new BinaryWriter(File.Open(idxFile, FileMode.Create)))
        {
            foreach (var index in indexs)
            {
                // Shorter words stay zero-padded up to the field size.
                byte[] wordField = new byte[WordFieldSize];
                byte[] wordData = Encoding.UTF8.GetBytes(index.Key);
                int length = Math.Min(wordField.Length, wordData.Length);

                // When the word has to be cut, never cut through a multi-byte
                // UTF-8 sequence: step back while the first dropped byte is a
                // continuation byte (10xxxxxx).
                while (length > 0 && length < wordData.Length && (wordData[length] & 0xC0) == 0x80)
                {
                    length--;
                }

                Array.Copy(wordData, wordField, length);
                idxWriter.Write(wordField);
                idxWriter.Write(index.Value.Offset);
                idxWriter.Write(index.Value.DataSize);
            }
        }
    }

    /// <summary>
    /// Closes the index and data files opened for querying. Safe to call more
    /// than once, and on an instance that was created for building.
    /// </summary>
    public void Dispose()
    {
        if (idxReader != null)
        {
            idxReader.Close();
            idxReader = null;
        }
        if (idxStream != null)
        {
            idxStream.Close();
            idxStream = null;
        }
        if (dictReader != null)
        {
            dictReader.Close();
            dictReader = null;
        }
        if (dictStream != null)
        {
            dictStream.Close();
            dictStream = null;
        }
    }
}
如图所示
文件夹中放置了许多文本文件,内容为词语的解释
首先、建立词典:
// Build the dictionary: every file in the folder becomes one entry whose word
// is the file name without its fixed suffix and whose explanation is the file text.
Dict dic = new Dict("病症词典");
var sourceFolder = new DirectoryInfo(@"G:\Users\Administrator\Desktop\新建文件夹 (3)\新建文件夹 (3)");
foreach (FileInfo entry in sourceFolder.GetFiles())
{
    Console.WriteLine(entry.FullName);
    string headword = entry.Name.Replace("的症状.txt", "");
    string explanation = File.ReadAllText(entry.FullName);
    dic.Add(headword, explanation);
}
// Write the info, data and index files.
dic.Save();
然后、把玩一番:
// Interactive lookup: read a word, print what the dictionary returns for it and
// how long the lookup took.
var dict = new Dict();
while (true)
{
    Console.Write("请输入词语:");
    var w = Console.ReadLine();
    if (w == null)
    {
        // ReadLine returns null once the input is exhausted (end of a redirected
        // stream, Ctrl+Z / Ctrl+D); without this check the loop would spin forever.
        break;
    }

    Stopwatch sw = Stopwatch.StartNew();
    Console.WriteLine("找到词语:");
    Console.WriteLine(dict.GetDescription(w));
    sw.Stop();
    Console.WriteLine("耗时:" + sw.ElapsedMilliseconds + "ms");
}
运行结果:
到此为止,谢谢收看!