DFA在C#中的實現:過濾敏感詞

大雄45發表於2021-12-14
導讀:DFA即Deterministic Finite Automaton,也就是確定有窮自動機,它是通過event和當前的state得到下一個state,即event+state=nextstate。在實現敏感詞過濾的演算法中,我們必須要減少運算,而DFA演算法中幾乎沒有什麼計算,有的只是狀態的轉換。

DFA在C#中的實現:過濾敏感詞

敏感詞、文字過濾是一個網站必不可少的功能,如何設計一個好的、高效的過濾演算法是非常有必要的。

在實現文字過濾的演算法中,DFA是唯一比較好的實現演算法。DFA即Deterministic Finite Automaton,也就是確定有窮自動機,它是通過event和當前的state得到下一個state,即event+state=nextstate。在實現敏感詞過濾的演算法中,我們必須要減少運算,而DFA演算法中幾乎沒有什麼計算,有的只是狀態的轉換。

下面看一下在C#中的實現方式。

1、構建敏感詞庫類
/// <summary>
/// Builds the in-memory lexicon on first use. Every sensitive word (plus its
/// Traditional Chinese conversion when that differs) is bucketed by its first
/// character; the rest of the word is stored in that bucket's WordGroup.
/// </summary>
/// <returns>
/// true when the lexicon is available (already built, or built by this call);
/// false when the word source returned null, in which case the lexicon is left
/// unset so that a later call retries the load.
/// </returns>
private bool LoadDictionary() 
       { 
           // Already built by an earlier call: nothing to do. 
           if (_memoryLexicon != null) 
           { 
               return true; 
           } 

           var words = new SensitiveWordBll().GetAllWords(); 
           if (words == null) 
           { 
               // Do not publish an empty lexicon here; otherwise every later call 
               // would report success while matching nothing. 
               return false; 
           } 

           // Collect each word together with its Traditional Chinese variant. 
           var wordList = new List<string>(); 
           foreach (string word in words) 
           { 
               wordList.Add(word); 
               var chineseWord = Microsoft.VisualBasic.Strings.StrConv(word, 
                   Microsoft.VisualBasic.VbStrConv.TraditionalChinese, 0); 
               if (word != chineseWord) 
               { 
                   wordList.Add(chineseWord); 
               } 
           } 

           // One slot per possible char value. char.MaxValue itself ('\uFFFF') is a 
           // valid index, so the array needs char.MaxValue + 1 entries. 
           var lexicon = new WordGroup[char.MaxValue + 1]; 
           foreach (var word in wordList) 
           { 
               if (word.Length == 0) 
               { 
                   continue; 
               } 

               var group = lexicon[word[0]]; 
               if (group == null) 
               { 
                   group = new WordGroup(); 
                   lexicon[word[0]] = group; 
               } 
               // Only the tail is stored; the first character is implied by the slot. 
               group.Add(word.Substring(1)); 
           } 

           // Publish only once the lexicon is fully populated. 
           _memoryLexicon = lexicon; 
           return true; 
       }
2、構建敏感詞檢測類
/// <summary>
/// Tries to match <paramref name="blackWord"/> (the remainder of a sensitive word,
/// i.e. without its first character) against _sourceText, starting at the position
/// right after _cursor. Characters rejected by IsChs/IsNum/IsAlphabet are skipped,
/// and up to 4 consecutive mismatching source characters are tolerated before the
/// match is abandoned. As a side effect, _nextCursor and _wordlenght are updated;
/// _wordlenght counts the source characters consumed after _cursor.
/// </summary>
/// <param name="blackWord">Characters of the sensitive word that follow its first character.</param>
/// <returns>
/// The value of the local flag "found" when the loop ends: true if the last comparison
/// performed was a match, false otherwise (including when blackWord is empty, because
/// the loop body never runs).
/// </returns>
private bool Check(string blackWord) 
     { 
         _wordlenght = 0; 
         // Cursor to the next source position to examine 
         _nextCursor = _cursor + 1; 
         var found = false; 
         // Number of consecutive mismatches tolerated so far (reset on every match) 
         var continueCheck = 0; 
         // Walk each character of the word and try to match it 
         for (var i = 0; i < blackWord.Length; i++) 
         { 
             // Offset used to skip over special characters 
             var offset = 0; 
             if (_nextCursor >= _sourceText.Length) 
             { 
                 // Source text is exhausted before the whole word was matched. 
                 // NOTE(review): inside this loop i < blackWord.Length, so this condition 
                 // is always true and found is always reset here. 
                 if (i - 1 < blackWord.Length - 1) 
                     found = false; 
                 break; 
             } 
             else 
             { 
                 // While the next character is not Chinese, a digit or a letter (per the 
                 // IsChs/IsNum/IsAlphabet helpers), increase the offset by 1 
                 for (var y = _nextCursor; y < _sourceText.Length; y++) 
                 { 
                     if (!IsChs(_sourceText[y]) && !IsNum(_sourceText[y]) && !IsAlphabet(_sourceText[y])) 
                     { 
                         offset++; 
                         // Skipping special characters: stop if the next cursor would reach 
                         // or pass the end of the string 
                         if (_nextCursor + offset >= _sourceText.Length) 
                             break; 
                         // The skipped character still counts toward the matched span length 
                         _wordlenght++; 
                     } 
                     else break; 
                 } 
                 // Only special characters remained up to the end of the source: no match 
                 if (_nextCursor + offset >= _sourceText.Length) 
                 { 
                     found = false; 
                     break; 
                 } 
                 if (blackWord[i] == _sourceText[_nextCursor + offset]) 
                 { 
                     found = true; 
                     continueCheck = 0; 
                 } 
                 else 
                 { 
                     // On a mismatch, keep trying for up to 4 more source characters 
                     if (continueCheck < 4 && _nextCursor < _sourceText.Length - 1) 
                     { 
                         continueCheck++; 
                         // Undo the loop's i++ so the same word character is retried 
                         // against the next source position 
                         i--; 
                     } 
                     else 
                     { 
                         found = false; 
                         break; 
                     } 
                 } 
             } 
             // Advance past the character just examined and any skipped special characters 
             _nextCursor = _nextCursor + 1 + offset; 
             _wordlenght++; 
         } 
         return found; 
     } 
 }
3、測試與使用方法
// Body of the filtering routine (its signature is not part of this excerpt). 
// Scans the source text character by character, masks every sensitive word found 
// with ReplaceChar and records the original fragments in _illegalWords. 
_illegalWords = new List<string>(); 
          // Nothing to filter: neither the argument nor the stored text has content. 
          if (string.IsNullOrEmpty(sourceText) && string.IsNullOrEmpty(_sourceText)) 
          { 
              return sourceText; 
          } 
 
          // A non-empty argument replaces the previously stored source text. 
          if (!string.IsNullOrEmpty(sourceText)) 
          { 
              _sourceText = sourceText; 
          } 
          _cursor = 0; 
          // Without a lexicon there is nothing to match: return the text unchanged. 
          if (!LoadDictionary()) 
          { 
              return _sourceText; 
          } 
 
          var tempString = _sourceText.ToCharArray(); 
          var sourceTextDbc = ToDBC(SourceText); 
          for (var i = 0; i < SourceText.Length; i++) 
          { 
              // Look up the words whose first character is the current character. 
              var group = _memoryLexicon[sourceTextDbc[i]]; 
              if (group != null) 
              { 
                  for (var z = 0; z < group.Count(); z++) 
                  { 
                      string word = group.GetWord(z); 
                      bool matched; 
                      if (word.Length == 0) 
                      { 
                          // One-character sensitive word: Check is not called, so reset 
                          // _wordlenght here. Otherwise a stale value left by an earlier 
                          // Check would mask too many characters or run past the array. 
                          _wordlenght = 0; 
                          matched = true; 
                      } 
                      else 
                      { 
                          matched = Check(word); 
                      } 
 
                      if (!matched) 
                      { 
                          continue; 
                      } 
 
                      // Caller only wants to know whether anything matches. 
                      if (isFirstCheckedReturn) 
                      { 
                          return null; 
                      } 
 
                      // Capture the matched fragment before it is overwritten. 
                      var matchLength = _wordlenght + 1; 
                      var blackword = new string(tempString, _cursor, matchLength); 
                      for (var pos = 0; pos < matchLength; pos++) 
                      { 
                          tempString[pos + _cursor] = ReplaceChar; 
                      } 
                      _illegalWords.Add(blackword); 
 
                      // Jump past the masked span; the increments below add the last step. 
                      _cursor = _cursor + _wordlenght; 
                      i = i + _wordlenght; 
                      break; 
                  } 
              } 
              _cursor++; 
          } 
          return new string(tempString); 
// Usage demo and rough benchmark: run the DFA-based filter, then a naive 
// string.Replace pass over the same text, printing elapsed milliseconds and the 
// resulting text for each. 
var filter = new SensitiveWordFilter(); 
           filter.SourceText = "dddddd"; 
           var sourceText = filter.SourceText; 
           filter.ResetMemoryLexicon(); 

           // Stopwatch is monotonic and high-resolution; DateTime.Now is neither, 
           // which makes it unreliable for timing short operations. 
           var filterTimer = System.Diagnostics.Stopwatch.StartNew(); 
           var ss = filter.Filter(); 
           filterTimer.Stop(); 
           Console.WriteLine(filterTimer.Elapsed.TotalMilliseconds); 
           Console.WriteLine(ss); 

           // Baseline: replace every word from the word-list file, one by one. 
           var words = System.IO.File.ReadAllLines(@"D:\Recv\敏感詞庫大全.txt", System.Text.Encoding.UTF8); 
           var ssx = sourceText; 
           var replaceTimer = System.Diagnostics.Stopwatch.StartNew(); 
           foreach (var word in words) 
           { 
               if (word.Length > 0) 
               { 
                   // Mask with as many '*' as the word has characters. 
                   ssx = ssx.Replace(word, new string('*', word.Length)); 
               } 
           } 
           replaceTimer.Stop(); 
           Console.WriteLine(replaceTimer.Elapsed.TotalMilliseconds); 
           Console.WriteLine(ssx);

原文來自:

來自 “ ITPUB部落格 ” ,連結:http://blog.itpub.net/69955379/viewspace-2847587/,如需轉載,請註明出處,否則將追究法律責任。

相關文章