Java使用DFA演算法過濾敏感詞

bug毁灭者發表於2024-03-09
程式碼示例如下:
import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.util.ReUtil;
import cn.hutool.core.util.StrUtil;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import java.util.*;

/**
 * Sensitive-word filter built on a character trie (the "DFA" approach).
 *
 * <p>Typical use: lower-case / normalise the dictionary yourself, pass the
 * sensitive words <em>and</em> the white-list words to
 * {@link #initKeyWordAndWhiteList(List)} to build the trie, then call
 * {@link #findAllNew} with the text. White-list words live in the same trie so
 * that the longest match wins; a longest match that is a white-list word is
 * simply not reported.
 *
 * <p>Trie layout (kept stable so that previously built maps stay usable):
 * every node is a {@code Map} whose {@code Character} keys lead to child
 * nodes and whose {@code "isEnd"} key holds {@code "1"} when a dictionary
 * word ends on that node and {@code "0"} otherwise.
 */
public class SensitiveWordUtils {
    /** Match mode: stop scanning as soon as the shortest dictionary word is found. */
    public static int minMatchTYpe = 1;

    /** Match mode: keep scanning so that the longest dictionary word wins. */
    public static int maxMatchType = 2;

    /**
     * Regular expression for a run of ASCII English letters. Kept public for
     * existing callers. The upper-case range must end at {@code Z}: a range
     * ending at lower-case {@code z} would also accept the six punctuation
     * characters that sit between 'Z' and 'a' in ASCII.
     */
    public static final String englishLletter = "[a-zA-Z]+";

    /** Node key that stores the end-of-word marker. */
    private static final String END_KEY = "isEnd";
    /** Marker value: a dictionary word ends on this node. */
    private static final String END_YES = "1";
    /** Marker value: no dictionary word ends on this node. */
    private static final String END_NO = "0";
    /** Value of the Integer switches (ignoreCase, ignoreSpace, specialScanWay) that means "on". */
    private static final int FLAG_ON = 1;
    /** Key of the hit set inside the map returned by {@link #findAllNew}. */
    private static final String HIT_WORDS_KEY = "allHitWord";

    /**
     * Builds the trie from the given words.
     *
     * <p>Each word is trimmed; {@code null} entries are skipped so that one bad
     * entry no longer discards the whole dictionary.
     *
     * @param sensitiveWords sensitive words plus white-list words, already
     *                       normalised (e.g. lower-cased) by the caller
     * @return the trie root, or {@code null} when the list is {@code null} or empty
     */
    @SuppressWarnings("rawtypes") // raw Map kept so existing callers compile unchanged
    public static Map initKeyWordAndWhiteList(List<String> sensitiveWords) {
        if (sensitiveWords == null || sensitiveWords.isEmpty()) {
            return null;
        }
        Set<String> keyWordSet = new HashSet<>();
        for (String word : sensitiveWords) {
            if (word != null) {
                keyWordSet.add(word.trim());
            }
        }
        return addSensitiveWordAndWhiteListToHashMap(keyWordSet);
    }

    /**
     * Inserts every word of the set into a fresh trie.
     *
     * @param keyWordSet non-null words; empty strings add nothing
     * @return the trie root
     */
    @SuppressWarnings("unchecked") // values under Character keys are always node maps created below
    private static Map<Object, Object> addSensitiveWordAndWhiteListToHashMap(Set<String> keyWordSet) {
        Map<Object, Object> root = new HashMap<>();
        for (String word : keyWordSet) {
            if (word.isEmpty()) {
                continue;
            }
            Map<Object, Object> node = root;
            for (int i = 0; i < word.length(); i++) {
                Character ch = word.charAt(i);
                Object existing = node.get(ch);
                if (existing != null) {
                    node = (Map<Object, Object>) existing;
                } else {
                    Map<Object, Object> child = new HashMap<>();
                    child.put(END_KEY, END_NO);
                    node.put(ch, child);
                    node = child;
                }
            }
            // The node reached by the last character terminates a word.
            node.put(END_KEY, END_YES);
        }
        return root;
    }

    /**
     * Scans the text and collects every sensitive word that is hit.
     *
     * <p>A {@code null} switch is treated as "off". A {@code null} text, trie or
     * {@code wordMap} yields an empty hit set; a {@code null} white list is
     * treated as empty.
     *
     * @param text             text to check
     * @param sensitiveWordMap trie built by {@link #initKeyWordAndWhiteList(List)}
     * @param wordMap          normalised sensitive word -> original sensitive word
     * @param wordWhiteMap     normalised white-list word -> original white-list word
     * @param ignoreCase       1 to match case-insensitively, 0 otherwise
     * @param ignoreSpace      1 to skip space characters in the text, 0 otherwise
     * @param specialScanWay   1 for exact matching (a hit containing English letters
     *                         is rejected when it is directly preceded or followed
     *                         by an English letter), 0 otherwise
     * @return a map whose {@code "allHitWord"} entry holds the original spelling of
     *         every sensitive word found (never {@code null})
     */
    @SuppressWarnings("rawtypes") // raw Map kept so existing callers compile unchanged
    public static Map<String, Set<String>> findAllNew(String text, Map sensitiveWordMap, Map<String, String> wordMap, Map<String, String> wordWhiteMap, Integer ignoreCase, Integer ignoreSpace, Integer specialScanWay) {
        Map<String, Set<String>> result = new HashMap<>();
        Set<String> hitWords = new HashSet<>();
        result.put(HIT_WORDS_KEY, hitWords);
        if (text == null || text.isEmpty() || sensitiveWordMap == null || wordMap == null) {
            return result;
        }

        boolean foldCase = isEnabled(ignoreCase);
        boolean stripBlanks = isEnabled(ignoreSpace);
        boolean exactMatch = isEnabled(specialScanWay);

        for (int i = 0; i < text.length(); i++) {
            int length = checkSensitiveWordNew(text, i, maxMatchType, sensitiveWordMap, ignoreCase, ignoreSpace);
            if (length <= 0) {
                continue;
            }
            // Exact matching: a rejected hit does not consume text, scanning resumes at i + 1.
            if (exactMatch && isPartOfEnglishWord(text, i, i + length)) {
                continue;
            }
            String normalized = normalizeHit(text.substring(i, i + length), foldCase, stripBlanks);
            boolean whiteListed = wordWhiteMap != null && wordWhiteMap.containsKey(normalized);
            if (!whiteListed && wordMap.containsKey(normalized)) {
                hitWords.add(wordMap.get(normalized));
            }
            // Jump past the matched span (the loop increment adds the final +1).
            i += length - 1;
        }
        return result;
    }

    /**
     * Walks the trie from {@code beginIndex} and returns the length of the match.
     *
     * <p>The returned length always ends on the last complete dictionary word
     * seen. This also covers the case where the text ends (or diverges) in the
     * middle of a longer entry: with "Xinjiang" and "Xinjiang production and ..."
     * in the dictionary, a text ending in "Xinjiang production" still hits
     * "Xinjiang".
     *
     * @param txt              text to check
     * @param beginIndex       index in the text where matching starts
     * @param matchType        {@link #minMatchTYpe} or {@link #maxMatchType}
     * @param sensitiveWordMap trie built by {@link #initKeyWordAndWhiteList(List)}
     * @param ignoreCase       1 to lower-case each text character before lookup
     * @param ignoreSpace      1 to skip space characters (they still count towards
     *                         the returned length)
     * @return number of text characters covered by the hit, or 0 when nothing matched
     */
    @SuppressWarnings("rawtypes") // raw Map mirrors the public signatures
    private static int checkSensitiveWordNew(String txt, int beginIndex, int matchType, Map sensitiveWordMap, Integer ignoreCase, Integer ignoreSpace) {
        if (txt == null || sensitiveWordMap == null) {
            return 0;
        }
        boolean foldCase = isEnabled(ignoreCase);
        boolean skipSpaces = isEnabled(ignoreSpace);

        int scanned = 0;           // characters consumed so far, skipped spaces included
        int lastCompleteMatch = 0; // value of 'scanned' at the last end-of-word node
        Map<?, ?> node = sensitiveWordMap;
        for (int i = beginIndex; i < txt.length(); i++) {
            char ch = txt.charAt(i);
            if (skipSpaces && Character.isSpaceChar(ch)) {
                scanned++;
                continue;
            }
            if (foldCase) {
                ch = Character.toLowerCase(ch);
            }
            Object next = node.get(ch);
            if (!(next instanceof Map)) {
                break;
            }
            node = (Map<?, ?>) next;
            scanned++;
            if (END_YES.equals(node.get(END_KEY))) {
                lastCompleteMatch = scanned;
                if (minMatchTYpe == matchType) {
                    break;
                }
            }
        }
        return lastCompleteMatch;
    }

    /** Returns true when an Integer switch is set to "on" (1); null counts as off. */
    private static boolean isEnabled(Integer flag) {
        return flag != null && flag == FLAG_ON;
    }

    /** Returns true for an ASCII English letter. */
    private static boolean isEnglishLetter(char c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    }

    /**
     * Exact-match rule: the span [start, end) contains an English letter and is
     * glued to another English letter on either side, i.e. it is only a fragment
     * of a longer English word.
     *
     * NOTE(review): the previous implementation also had a branch for a hit
     * preceded by a literal backslash followed by 'n', but that branch was
     * shadowed by the generic "preceded by an English letter" branch and had no
     * effect. The effective behaviour is kept here; confirm whether such a prefix
     * was meant to be allowed.
     */
    private static boolean isPartOfEnglishWord(String text, int start, int end) {
        boolean hasLetter = false;
        for (int i = start; i < end && !hasLetter; i++) {
            hasLetter = isEnglishLetter(text.charAt(i));
        }
        if (!hasLetter) {
            return false;
        }
        boolean letterAfter = end < text.length() && isEnglishLetter(text.charAt(end));
        boolean letterBefore = start > 0 && isEnglishLetter(text.charAt(start - 1));
        return letterAfter || letterBefore;
    }

    /** Normalises a matched span the same way the caller normalised the dictionary keys. */
    private static String normalizeHit(String hit, boolean foldCase, boolean stripBlanks) {
        String normalized = hit;
        if (foldCase) {
            // Locale.ROOT keeps the result independent of the JVM default locale.
            normalized = normalized.toLowerCase(Locale.ROOT);
        }
        if (stripBlanks) {
            normalized = removeBlankChars(normalized);
        }
        return normalized;
    }

    /** Removes every blank character from the value. */
    private static String removeBlankChars(String value) {
        StringBuilder kept = new StringBuilder(value.length());
        for (int i = 0; i < value.length(); i++) {
            char c = value.charAt(i);
            if (!isBlankChar(c)) {
                kept.append(c);
            }
        }
        return kept.toString();
    }

    /**
     * Blank definition intended to mirror the Hutool helper this class used
     * before: whitespace, space separators and a handful of invisible characters.
     */
    private static boolean isBlankChar(char c) {
        return Character.isWhitespace(c)
                || Character.isSpaceChar(c)
                || c == '\ufeff'
                || c == '\u202a'
                || c == '\u0000'
                || c == '\u3164'
                || c == '\u2800'
                || c == '\u180e';
    }

    /**
     * Demo: builds a dictionary with a white list and prints the hits for a sample text.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Exact matching on.
        int specialScanWay = 1;
        // Case-insensitive matching on.
        int ignoreCase = 1;
        // Original sensitive words.
        List<String> wordList = new ArrayList<>();
        wordList.add("臺獨");
        wordList.add("Xinjiang");
        wordList.add("Xinjiang production and construction Corps");
        // Original white-list words.
        List<String> allWhiteWordList = new ArrayList<>();
        allWhiteWordList.add("一臺獨立");
        // Normalised sensitive word -> original sensitive word.
        Map<String, String> wordMap = new HashMap<>();
        // Normalised white-list word -> original white-list word.
        Map<String, String> wordWhiteMap = new HashMap<>();
        // Everything that goes into the trie: sensitive words plus white-list words, normalised.
        List<String> newWordList = new ArrayList<>();
        for (String item : wordList) {
            String word = ignoreCase == 1 ? item.toLowerCase(Locale.ROOT) : item;
            wordMap.put(word, item);
            newWordList.add(word);
        }
        for (String item : allWhiteWordList) {
            String word = ignoreCase == 1 ? item.toLowerCase(Locale.ROOT) : item;
            wordWhiteMap.put(word, item);
            newWordList.add(word);
        }
        String text = "這是一段測試文字,xiNJiang production,大膽臺獨分子,這是一臺獨立的計算機";
        Map sensitiveWordMap = SensitiveWordUtils.initKeyWordAndWhiteList(newWordList);
        Map<String, Set<String>> resultMap = SensitiveWordUtils.findAllNew(text, sensitiveWordMap, wordMap, wordWhiteMap, ignoreCase, 0, specialScanWay);
        System.out.println("resultMap = " + resultMap.toString());
    }
}

相關文章