编辑距离的算法是首先由俄国科学家Levenshtein提出的,故又叫 Levenshtein Distance。一个字符串可以通过增加一个字符,删除一个字符,替换一个字符得到另外一个字符串,假设,我们把从字符串A转换成字符串B,前面3种操 作所执行的最少次数称为AB相似度 如 abc adc 度为 1 ababababa babababab 度为 2 abcd acdb 度为2
using System;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
namespace Levenshtein
{
/// <summary>
/// 分析完成事件委托
/// </summary>
/// <param name="sim">相似度</param>
public delegate void AnalyzerCompletedHander(double sim);
/// <summary>
/// 文章相似度工具
/// </summary>
public class LevenshteinDistance:IDisposable
{
private string str1;
private string str2;
private int[,] index;
int k;
Task<double> task;
/// <summary>
/// 分析完成事件
/// </summary>
public event AnalyzerCompletedHander AnalyzerCompleted;
/// <summary>
/// 获取或设置文章1
/// </summary>
public string Str1
{
get { return str1; }
set
{
str1 = Format(value);
index = new int[str1.Length, str2.Length];
}
}
/// <summary>
/// 获取或设置文章2
/// </summary>
public string Str2
{
get { return str2; }
set
{
str2 = Format(value);
index = new int[str1.Length, str2.Length];
}
}
/// <summary>
/// 运算总次数
/// </summary>
public int TotalTimes
{
get { return str1.Length * str2.Length; }
}
/// <summary>
/// 是否完成
/// </summary>
public bool IsCompleted
{
get { return task.IsCompleted; }
}
/// <summary>
/// 实例化
/// </summary>
/// <param name="str1">文章1</param>
/// <param name="str2">文章2</param>
public LevenshteinDistance(string str1, string str2)
{
this.str1 = Format(str1);
this.str2 = Format(str2);
index = new int[str1.Length, str2.Length];
}
public LevenshteinDistance()
{
}
/// <summary>
/// 异步开始任务
/// </summary>
public void Start()
{
task = new Task<double>(Analyzer);
task.Start();
task.ContinueWith(o => Completed(o.Result));
}
/// <summary>
/// 同步开始任务
/// </summary>
/// <returns>相似度</returns>
public double StartAyns()
{
task = new Task<double>(Analyzer);
task.Start();
task.Wait();
return task.Result;
}
private void Completed(double s)
{
if (AnalyzerCompleted != null)
{
AnalyzerCompleted(s);
}
}
private double Analyzer()
{
if (str1.Length == 0 || str2.Length == 0)
return 0;
for (int i = 0; i < str1.Length; i++)
{
for (int j = 0; j < str2.Length; j++)
{
k = str1[i] == str2[j] ? 0 : 1;
if (i == 0&&j==0)
{
continue;
}
else if (i == 0)
{
index[i, j] = k + index[i, j - 1];
continue;
}
else if (j == 0)
{
index[i, j] = k + index[i - 1, j];
continue;
}
int temp = Min(index[i, j - 1],
index[i - 1, j],
index[i - 1, j - 1]);
index[i, j] = temp + k;
}
}
float similarty = 1 - (float)index[str1.Length - 1, str2.Length - 1]
/ (str1.Length > str2.Length ? str1.Length : str2.Length);
return similarty;
}
private string Format(string str)
{
str = Regex.Replace(str, @"[^a-zA-Z0-9\u4e00-\u9fa5\s]", "");
return str;
}
private int Min(int a, int b, int c)
{
int temp = a < b ? a : b;
temp = temp < c ? temp : c;
return temp;
}
public void Dispose()
{
task.Dispose();
}
}
}</pre>
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。