ABOUT ME

-

Today
-
Yesterday
-
Total
-
  • 한글 일치율 비교#5. HammingDistance, JaroWinklerDistance 알고리즘 추가
    c# Winform 개발/글자비교 2021. 6. 23. 14:51

     

    HammingDistance : 글자개수로 비교

    JaroWinklerDistance : LevenshteinDistance 와 마찬가지로 편집거리 알고리즘

     

    클래스를 우선추가한후

     

    public static class HammingDistance
        {
            public static int GetHammingDistance(string s, string t)
            {
                /*
                if (s.Length != t.Length)
                {
                    throw new Exception("Strings must be equal length");
                }
                */
    
                int distance =
                    s.ToCharArray()
                    .Zip(t.ToCharArray(), (c1, c2) => new { c1, c2 })
                    .Count(m => m.c1 != m.c2);
    
                return distance;
            }
        }
    
        public static class JaroWinklerDistance
        {
            /* The Winkler modification will not be applied unless the 
             * percent match was at or above the mWeightThreshold percent 
             * without the modification. 
             * Winkler's paper used a default value of 0.7
             */
            private static readonly double mWeightThreshold = 0.7;
    
            /* Size of the prefix to be concidered by the Winkler modification. 
             * Winkler's paper used a default value of 4
             */
            private static readonly int mNumChars = 4;
    
    
            /// <summary>
            /// Returns the Jaro-Winkler distance between the specified  
            /// strings. The distance is symmetric and will fall in the 
            /// range 0 (perfect match) to 1 (no match). 
            /// </summary>
            /// <param name="aString1">First String</param>
            /// <param name="aString2">Second String</param>
            /// <returns></returns>
            public static double distance(string aString1, string aString2)
            {
                return 1.0 - proximity(aString1, aString2);
            }
    
    
            /// <summary>
            /// Returns the Jaro-Winkler distance between the specified  
            /// strings. The distance is symmetric and will fall in the 
            /// range 0 (no match) to 1 (perfect match). 
            /// </summary>
            /// <param name="aString1">First String</param>
            /// <param name="aString2">Second String</param>
            /// <returns></returns>
            public static double proximity(string aString1, string aString2)
            {
                int lLen1 = aString1.Length;
                int lLen2 = aString2.Length;
                if (lLen1 == 0)
                    return lLen2 == 0 ? 1.0 : 0.0;
    
                int lSearchRange = Math.Max(0, Math.Max(lLen1, lLen2) / 2 - 1);
    
                // default initialized to false
                bool[] lMatched1 = new bool[lLen1];
                bool[] lMatched2 = new bool[lLen2];
    
                int lNumCommon = 0;
                for (int i = 0; i < lLen1; ++i)
                {
                    int lStart = Math.Max(0, i - lSearchRange);
                    int lEnd = Math.Min(i + lSearchRange + 1, lLen2);
                    for (int j = lStart; j < lEnd; ++j)
                    {
                        if (lMatched2[j]) continue;
                        if (aString1[i] != aString2[j])
                            continue;
                        lMatched1[i] = true;
                        lMatched2[j] = true;
                        ++lNumCommon;
                        break;
                    }
                }
                if (lNumCommon == 0) return 0.0;
    
                int lNumHalfTransposed = 0;
                int k = 0;
                for (int i = 0; i < lLen1; ++i)
                {
                    if (!lMatched1[i]) continue;
                    while (!lMatched2[k]) ++k;
                    if (aString1[i] != aString2[k])
                        ++lNumHalfTransposed;
                    ++k;
                }
                // System.Diagnostics.Debug.WriteLine("numHalfTransposed=" + numHalfTransposed);
                int lNumTransposed = lNumHalfTransposed / 2;
    
                // System.Diagnostics.Debug.WriteLine("numCommon=" + numCommon + " numTransposed=" + numTransposed);
                double lNumCommonD = lNumCommon;
                double lWeight = (lNumCommonD / lLen1
                                 + lNumCommonD / lLen2
                                 + (lNumCommon - lNumTransposed) / lNumCommonD) / 3.0;
    
                if (lWeight <= mWeightThreshold) return lWeight;
                int lMax = Math.Min(mNumChars, Math.Min(aString1.Length, aString2.Length));
                int lPos = 0;
                while (lPos < lMax && aString1[lPos] == aString2[lPos])
                    ++lPos;
                if (lPos == 0) return lWeight;
                return lWeight + 0.1 * lPos * (1.0 - lWeight);
    
            }
    
    
        }

     

    시군구/읍면동 데이터( items )와 비교하면 일치율을 구할 수 있습니다. 

     

    swHD.Start();
                foreach (var pair in items)
                {
                    string[] value = pair.Value.ToString().Split('|');
                    int totalLen = int.Parse(value[1]) > tbLen ? int.Parse(value[1]) : tbLen;
                    int dis = HammingDistance.GetHammingDistance(pair.Key, tb);
    
                    double percent = (double)(totalLen - dis) / totalLen * 100;
    
                    if (percent >= Double.Parse(textBox2.Text))
                    {
                        detectHD.Add(value[0], percent);
                    }
                }
                swHD.Stop();
    
                swJWD.Start();
                foreach (var pair in items)
                {
                    string[] value = pair.Value.ToString().Split('|');
                    int totalLen = int.Parse(value[1]) > tbLen ? int.Parse(value[1]) : tbLen;
                    double dis = JaroWinklerDistance.distance(pair.Key, tb);
    
                    double percent = (1-dis) * 100;
    
                    if (percent >= Double.Parse(textBox2.Text))
                    {
                        detectJWD.Add(value[0], percent);
                    }
                }
                swJWD.Stop();

     

    결과 화면

     

    300x250

    댓글

Designed by Tistory.