You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

153 lines
4.7 KiB

  1. using System;
  2. namespace Apewer.Internals
  3. {
  4. /// <summary>Levenshtein 距离:两个字符串之间,由一个转换成另一个所需的最少编辑操作次数。</summary>
  5. internal class Levenshtein
  6. {
  7. #region 结果
  8. private double _Rate;
  9. private int _Times;
  10. private int _Distance;
  11. private TimeSpan _Duration;
  12. /// <summary>相似度,取值范围 0 ~ 1。若不同,则趋近于 0;若完全相同,则为 1。</summary>
  13. public double Rate { get { return _Rate; } }
  14. /// <summary>对比次数。</summary>
  15. public int Times { get { return _Times; } }
  16. /// <summary>距离。</summary>
  17. public int Distance { get { return _Distance; } }
  18. /// <summary>时长。</summary>
  19. public TimeSpan Duration { get { return _Duration; } }
  20. #endregion
  21. #region 计算
  22. /// <summary>字符串 1。</summary>
  23. private char[] _ArrChar1;
  24. /// <summary>字符串 2。</summary>
  25. private char[] _ArrChar2;
  26. /// <summary>开始时间。</summary>
  27. private DateTime _BeginTime;
  28. /// <summary>结束时间。</summary>
  29. private DateTime _EndTime;
  30. /// <summary>算法矩阵。</summary>
  31. private int[,] _Matrix;
  32. /// <summary>矩阵列数。</summary>
  33. private int _Column;
  34. /// <summary>矩阵行数。</summary>
  35. private int _Row;
  36. private Levenshtein()
  37. {
  38. InitVariable();
  39. }
  40. /// <summary>初始化变量。</summary>
  41. private void InitVariable()
  42. {
  43. _Rate = 0;
  44. _Times = 0;
  45. _Distance = 0;
  46. _Duration = new TimeSpan(0);
  47. }
  48. /// <summary>初始化算法基本信息。</summary>
  49. private void InitVariable(string arg1, string arg2)
  50. {
  51. InitVariable();
  52. _ArrChar1 = arg1.ToCharArray();
  53. _ArrChar2 = arg2.ToCharArray();
  54. _Row = _ArrChar1.Length + 1;
  55. _Column = _ArrChar2.Length + 1;
  56. _Matrix = new int[_Row, _Column];
  57. }
  58. /// <summary>初始化矩阵的第一行和第一列。</summary>
  59. private void InitMatrix()
  60. {
  61. for (int i = 0; i < _Column; i++) _Matrix[0, i] = i;
  62. for (int i = 0; i < _Row; i++) _Matrix[i, 0] = i;
  63. }
  64. /// <summary>取三个数中的最小值。</summary>
  65. private int Minimum(int argFirst, int argSecond, int argThird)
  66. {
  67. int intMin = argFirst;
  68. if (argSecond < intMin) intMin = argSecond;
  69. if (argThird < intMin) intMin = argThird;
  70. return intMin;
  71. }
  72. /// <summary>计算相似度。</summary>
  73. private void Compute()
  74. {
  75. // 开始时间。
  76. _BeginTime = DateTime.Now;
  77. // 初始化矩阵的第一行和第一列。
  78. this.InitMatrix();
  79. int intCost = 0;
  80. for (int i = 1; i < _Row; i++)
  81. {
  82. for (int j = 1; j < _Column; j++)
  83. {
  84. if (_ArrChar1[i - 1] == _ArrChar2[j - 1])
  85. {
  86. intCost = 0;
  87. }
  88. else
  89. {
  90. intCost = 1;
  91. }
  92. // 关键步骤,计算当前位置值为左边 + 1、上面 + 1、左上角 + intCost 中的最小值。
  93. // 循环遍历到最后 _Matrix[_Row - 1, _Column - 1] 即为两个字符串的距离。
  94. _Matrix[i, j] = this.Minimum(_Matrix[i - 1, j] + 1, _Matrix[i, j - 1] + 1, _Matrix[i - 1, j - 1] + intCost);
  95. _Times++;
  96. }
  97. }
  98. // 结束时间。
  99. _EndTime = DateTime.Now;
  100. // 相似率,移动次数小于最长的字符串长度的 20% 算同一题。
  101. int intLength = _Row > _Column ? _Row : _Column;
  102. _Rate = 1 - (double)_Matrix[_Row - 1, _Column - 1] / intLength;
  103. _Duration = _EndTime - _BeginTime;
  104. _Distance = _Matrix[_Row - 1, _Column - 1];
  105. }
  106. #endregion
  107. #region 方法
  108. /// <summary>计算两个字符串的相似度。</summary>
  109. public static Levenshtein Compute(string arg1, string arg2)
  110. {
  111. var vString1 = string.IsNullOrEmpty(arg1) ? "" : arg1;
  112. var vString2 = string.IsNullOrEmpty(arg2) ? "" : arg2;
  113. var vInstance = new Levenshtein();
  114. vInstance.InitVariable(vString1, vString2);
  115. vInstance.Compute();
  116. return vInstance;
  117. }
  118. #endregion
  119. }
  120. }