基於PHP + TRIE樹實現敏感詞過濾演算法

walways發表於2019-04-16
  • 公司新專案的素材編輯功能需要提供敏感詞過濾,於是上網查了一下:多數實現都基於 Trie 樹演算法,但用 PHP 寫的很少,其中一部分還存在 bug。因此,我在別人實現的基礎上進行了修正與完善。

敏感詞過濾演算法實現


/**
 * A single node of the trie.
 *
 * Each node holds one character, the child nodes reachable from it, and a
 * flag telling whether the path from the root to this node spells a
 * complete word.
 */
class TreeMap
{
    /**
     * The character stored in this node.
     *
     * @var mixed
     */
    public $data;

    /**
     * Child nodes, keyed by the character each child stores. An array is
     * used because a node may have any number of children.
     *
     * @var array
     */
    public $children = [];

    /**
     * True when this node is the last character of a stored word.
     *
     * @var bool
     */
    public $isEndingChar = false;

    /**
     * Create a node for the given character.
     *
     * @param mixed $data The character this node represents.
     */
    public function __construct($data)
    {
        $this->data = $data;
    }
}

/**
 * Sensitive-word filter backed by a trie of TreeMap nodes.
 *
 * Words are inserted character by character (multibyte-aware via the mb_*
 * functions); a text is then scanned from left to right and every position
 * is tested against the trie.
 */
class TrieTree
{
    /**
     * Root node of the trie. The root stores the placeholder character '/'
     * and never represents a word itself.
     *
     * @var TreeMap
     * @author qpf
     */
    public $trieTreeMap;

    public function __construct()
    {
        $this->trieTreeMap = new TreeMap('/');
    }

    /**
     * Get the root node of the sensitive-word trie.
     *
     * @return TreeMap
     * @author qpf
     */
    public function getTreeMap()
    {
        return $this->trieTreeMap;
    }

    /**
     * Add sensitive words to the trie.
     *
     * Empty strings are skipped so that the root node is never flagged as
     * the end of a word.
     *
     * @param array $wordsList List of words to register.
     * @author qpf
     */
    public function addWords(array $wordsList)
    {
        foreach ($wordsList as $words) {
            $len = mb_strlen($words);
            if ($len === 0) {
                continue;
            }

            // Walk down from the root, creating missing nodes on the way.
            $node = $this->trieTreeMap;
            for ($i = 0; $i < $len; $i++) {
                $char = mb_substr($words, $i, 1);
                if (!isset($node->children[$char])) {
                    $node->children[$char] = new TreeMap($char);
                }
                $node = $node->children[$char];
            }
            $node->isEndingChar = true;
        }
    }

    /**
     * Find the sensitive words contained in a text.
     *
     * The text is scanned left to right; after a hit, scanning resumes right
     * after the matched word, so matches never overlap. A word occurring
     * several times is returned several times.
     *
     * @param string $txt          Text to scan.
     * @param bool   $longestMatch When false (default) the shortest word
     *                             starting at a position wins; when true the
     *                             longest one does.
     * @return array Matched words in order of appearance.
     * @author qpf
     */
    public function search($txt, $longestMatch = false)
    {
        $wordsList = [];
        $txtLength = mb_strlen($txt);
        $i = 0;
        while ($i < $txtLength) {
            $wordLength = $this->checkWord($txt, $i, $txtLength, $longestMatch);
            if ($wordLength > 0) {
                $wordsList[] = mb_substr($txt, $i, $wordLength);
                $i += $wordLength; // continue right after the matched word
            } else {
                $i++;
            }
        }
        return $wordsList;
    }

    /**
     * Test whether a sensitive word starts at the given position.
     *
     * Only complete words count: a prefix of a stored word that never
     * reaches an end-of-word node yields 0, whatever the start position.
     *
     * @param string $txt          Text being scanned.
     * @param int    $beginIndex   Character offset at which to start matching.
     * @param int    $length       Character length of $txt.
     * @param bool   $longestMatch Keep descending after the first complete
     *                             word to look for a longer one.
     * @return int Length in characters of the matched word, or 0 if none.
     */
    private function checkWord($txt, $beginIndex, $length, $longestMatch = false)
    {
        $matchedLength = 0; // length of the last complete word seen
        $depth = 0;         // number of characters consumed so far
        $node = $this->trieTreeMap;
        for ($i = $beginIndex; $i < $length; $i++) {
            $char = mb_substr($txt, $i, 1);
            if (!isset($node->children[$char])) {
                break; // no stored word continues with this character
            }
            $node = $node->children[$char];
            $depth++;
            if ($node->isEndingChar === true) {
                $matchedLength = $depth;
                if (!$longestMatch) {
                    break;
                }
            }
        }
        return $matchedLength;
    }

}

// Demo: register a few sample words, scan a sentence, dump the matches, stop.
$sensitiveWords = ['白粉', '白粉人', '白粉人嫩', '不該大'];
$filter = new TrieTree();
$filter->addWords($sensitiveWords);

$sample = "白粉啊,白粉人,我不該大啊";
var_dump($filter->search($sample));
exit;

複製程式碼

相關文章