-
한글 일치율 비교#5. HammingDistance, JaroWinklerDistance 알고리즘 추가c# Winform 개발/글자비교 2021. 6. 23. 14:51
HammingDistance : 두 문자열을 같은 위치의 글자끼리 비교하여 서로 다른 글자의 개수를 구하는 방식
JaroWinklerDistance : LevenshteinDistance 와 마찬가지로 편집거리 알고리즘
먼저 아래 두 클래스를 추가한 후,
/// <summary>
/// Position-by-position character comparison of two strings.
/// </summary>
public static class HammingDistance
{
    /// <summary>
    /// Counts the positions at which <paramref name="s"/> and <paramref name="t"/>
    /// hold different characters. When the strings differ in length, every
    /// character of the longer string that has no counterpart in the shorter one
    /// is counted as a mismatch, so a string is never reported as identical to
    /// one of its proper prefixes.
    /// </summary>
    /// <param name="s">First string.</param>
    /// <param name="t">Second string.</param>
    /// <returns>
    /// Number of differing positions plus the absolute length difference;
    /// 0 only when both strings are equal.
    /// </returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="s"/> or <paramref name="t"/> is null.
    /// </exception>
    public static int GetHammingDistance(string s, string t)
    {
        if (s == null)
        {
            throw new ArgumentNullException(nameof(s));
        }
        if (t == null)
        {
            throw new ArgumentNullException(nameof(t));
        }

        int sharedLength = Math.Min(s.Length, t.Length);

        // Unpaired trailing characters cannot match anything, so they start
        // the count; the loop then adds mismatches inside the shared prefix.
        int distance = Math.Abs(s.Length - t.Length);
        for (int i = 0; i < sharedLength; ++i)
        {
            if (s[i] != t[i])
            {
                ++distance;
            }
        }

        return distance;
    }
}

/// <summary>
/// Jaro-Winkler string similarity (<see cref="proximity"/>) and its
/// complement (<see cref="distance"/>).
/// </summary>
public static class JaroWinklerDistance
{
    /* The Winkler modification will not be applied unless the
     * percent match was at or above the mWeightThreshold percent
     * without the modification.
     * Winkler's paper used a default value of 0.7
     */
    private static readonly double mWeightThreshold = 0.7;

    /* Size of the prefix to be considered by the Winkler modification.
     * Winkler's paper used a default value of 4
     */
    private static readonly int mNumChars = 4;

    /// <summary>
    /// Returns the Jaro-Winkler distance between the specified
    /// strings. The distance is symmetric and will fall in the
    /// range 0 (perfect match) to 1 (no match).
    /// </summary>
    /// <param name="aString1">First String</param>
    /// <param name="aString2">Second String</param>
    /// <returns><c>1.0 - proximity(aString1, aString2)</c>.</returns>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public static double distance(string aString1, string aString2)
    {
        return 1.0 - proximity(aString1, aString2);
    }

    /// <summary>
    /// Returns the Jaro-Winkler similarity between the specified
    /// strings. The value is symmetric and will fall in the
    /// range 0 (no match) to 1 (perfect match).
    /// </summary>
    /// <param name="aString1">First String</param>
    /// <param name="aString2">Second String</param>
    /// <returns>
    /// 1.0 when both strings are empty, 0.0 when they share no matching
    /// character, otherwise the Jaro weight, boosted by the common-prefix
    /// bonus when that weight exceeds the threshold.
    /// </returns>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public static double proximity(string aString1, string aString2)
    {
        if (aString1 == null)
        {
            throw new ArgumentNullException(nameof(aString1));
        }
        if (aString2 == null)
        {
            throw new ArgumentNullException(nameof(aString2));
        }

        int lLen1 = aString1.Length;
        int lLen2 = aString2.Length;
        if (lLen1 == 0)
        {
            return lLen2 == 0 ? 1.0 : 0.0;
        }

        // A character only counts as matching if it occurs within this many
        // positions of its index in the other string.
        int lSearchRange = Math.Max(0, Math.Max(lLen1, lLen2) / 2 - 1);

        // default initialized to false
        bool[] lMatched1 = new bool[lLen1];
        bool[] lMatched2 = new bool[lLen2];

        int lNumCommon = 0;
        for (int i = 0; i < lLen1; ++i)
        {
            int lStart = Math.Max(0, i - lSearchRange);
            int lEnd = Math.Min(i + lSearchRange + 1, lLen2);
            for (int j = lStart; j < lEnd; ++j)
            {
                // Each character of the second string may be claimed only once.
                if (lMatched2[j] || aString1[i] != aString2[j])
                {
                    continue;
                }

                lMatched1[i] = true;
                lMatched2[j] = true;
                ++lNumCommon;
                break;
            }
        }

        if (lNumCommon == 0)
        {
            return 0.0;
        }

        // Walk the matched characters of both strings in order; each position
        // where they disagree is half of a transposition.
        int lNumHalfTransposed = 0;
        int k = 0;
        for (int i = 0; i < lLen1; ++i)
        {
            if (!lMatched1[i])
            {
                continue;
            }

            while (!lMatched2[k])
            {
                ++k;
            }

            if (aString1[i] != aString2[k])
            {
                ++lNumHalfTransposed;
            }

            ++k;
        }

        int lNumTransposed = lNumHalfTransposed / 2;

        double lNumCommonD = lNumCommon;
        double lWeight = (lNumCommonD / lLen1
                          + lNumCommonD / lLen2
                          + (lNumCommon - lNumTransposed) / lNumCommonD) / 3.0;

        if (lWeight <= mWeightThreshold)
        {
            return lWeight;
        }

        // Winkler modification: reward a shared prefix of up to mNumChars characters.
        int lMax = Math.Min(mNumChars, Math.Min(lLen1, lLen2));
        int lPos = 0;
        while (lPos < lMax && aString1[lPos] == aString2[lPos])
        {
            ++lPos;
        }

        if (lPos == 0)
        {
            return lWeight;
        }

        return lWeight + 0.1 * lPos * (1.0 - lWeight);
    }
}
시군구/읍면동 데이터( items )와 비교하면 일치율을 구할 수 있습니다.
// Minimum match percentage an entry must reach to be kept. Parsed once up
// front because the text box value does not change while the loops run.
double matchThreshold = Double.Parse(textBox2.Text);

// Pass 1: Hamming distance, timed by swHD.
swHD.Start();
foreach (var pair in items)
{
    // The value is a '|'-separated record; value[0] is used as the result key
    // and value[1] is parsed as a length (presumably the length of pair.Key —
    // confirm against the code that builds items).
    string[] value = pair.Value.ToString().Split('|');

    // Normalize against the longer of the two lengths.
    int totalLen = Math.Max(int.Parse(value[1]), tbLen);
    int dis = HammingDistance.GetHammingDistance(pair.Key, tb);
    double percent = (double)(totalLen - dis) / totalLen * 100;

    if (percent >= matchThreshold)
    {
        detectHD.Add(value[0], percent);
    }
}
swHD.Stop();

// Pass 2: Jaro-Winkler, timed by swJWD. The similarity is already in the
// range 0..1, so no length normalization is needed here.
swJWD.Start();
foreach (var pair in items)
{
    string[] value = pair.Value.ToString().Split('|');
    double percent = JaroWinklerDistance.proximity(pair.Key, tb) * 100;

    if (percent >= matchThreshold)
    {
        detectJWD.Add(value[0], percent);
    }
}
swJWD.Stop();
결과 화면
'c# Winform 개발 > 글자비교' 카테고리의 다른 글
한글 일치율 비교#4. 텍스트 비교 및 일치율 추출 (0) 2021.06.22 한글 일치율 비교#3. LevenshteinDistance (0) 2021.06.22 한글 일치율 비교#2. 한글분리 (초성,중성,종성) (0) 2021.06.22 한글 일치율 비교#1. 전국 읍면동 데이터 Map에 담기 (0) 2021.06.22