import cn.hutool.core.collection.CollUtil; import cn.hutool.core.util.ReUtil; import cn.hutool.core.util.StrUtil; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import java.util.*; public class SensitiveWordUtils { //最小匹配模式 public static int minMatchTYpe = 1; //最大匹配模式 public static int maxMatchType = 2; //英文字母正則式 public static final String englishLletter = "[a-zA-z]+"; /** * @description: 初始化詞庫 * @date: 2024/3/9 10:50 * @param sensitiveWords * @return java.util.Map */ public static Map initKeyWordAndWhiteList(List<String> sensitiveWords) { if(CollUtil.isEmpty(sensitiveWords)){ return null; } try{ Set<String> keyWordSet = new HashSet<String>(); for(String s: sensitiveWords){ keyWordSet.add(s.trim()); } return addSensitiveWordAndWhiteListToHashMap(keyWordSet); } catch (Exception e) { e.printStackTrace(); } return null; } /** * @description: 構建詞庫 * @date: 2024/3/9 10:51 * @param keyWordSet * @return java.util.HashMap */ private static HashMap addSensitiveWordAndWhiteListToHashMap(Set<String> keyWordSet){ HashMap sensitiveWordMap = new HashMap(keyWordSet.size()); String key = null; Map nowMap = null; Map<String, String> newWorMap = null; Iterator<String> iterator = keyWordSet.iterator(); while (iterator.hasNext()) { key = iterator.next(); nowMap = sensitiveWordMap; for (int i = 0; i < key.length(); i++) { char keyChar = key.charAt(i); Object wordMap = nowMap.get(keyChar); if(wordMap != null){ nowMap = (Map) wordMap; }else{ newWorMap = new HashMap<String, String>(); newWorMap.put("isEnd", "0"); nowMap.put(keyChar, newWorMap); nowMap = newWorMap; } if(i == key.length() - 1){ nowMap.put("isEnd", "1"); } } } return sensitiveWordMap; } /** * @description: 敏感詞匹配 * @date: 2024/3/9 10:52 * @param text 待檢測文字 * @param sensitiveWordMap 構建後的敏感詞詞庫map * @param wordMap 處理後的敏感詞map * @param wordWhiteMap 處理後的白名單map * @param ignoreCase 是否忽略大小寫 1是 0否 * @param ignoreSpace 是否忽略空格 1是 0否 * @param specialScanWay 是否精確匹配 1是 0否 * @return java.util.Map<java.lang.String,java.util.Set<java.lang.String>> */ public static Map<String, Set<String>> findAllNew(String text, Map sensitiveWordMap, Map<String, String> wordMap, Map<String, String> wordWhiteMap, Integer ignoreCase, Integer ignoreSpace, Integer specialScanWay) { Map<String, Set<String>> result = Maps.newHashMap(); Set<String> allSensitiveWordList = new HashSet<String>(); long txtLength = text.length(); for (int i = 0; i < txtLength; i++) { int length = checkSensitiveWordNew(text, i, maxMatchType, sensitiveWordMap, ignoreCase, ignoreSpace); //處理精準匹配 if (null != specialScanWay && specialScanWay == CommonConstant.Numbers.NUMBER_1 && length > CommonConstant.Numbers.NUMBER_0) { String subStr = StrUtil.sub(text, i, i + length); if (ReUtil.count(englishLletter, subStr) > CommonConstant.Numbers.NUMBER_0) { //取前一個字元 String beforeSubStr = StrUtil.sub(text, i - 1, i); //取後一個字元 String afterSubStr = StrUtil.sub(text, i + length, i + length + 1); //命中文字是頂行,且往後取一位,若是英文,不屬於命中敏感詞 if(i == CommonConstant.Numbers.NUMBER_0 && ReUtil.count(englishLletter, afterSubStr) > CommonConstant.Numbers.NUMBER_0){ length = CommonConstant.Numbers.NUMBER_0; //命中文字往後取一位,只要是任意英文單詞,不屬於命中敏感詞 }else if(ReUtil.count(englishLletter, afterSubStr) > CommonConstant.Numbers.NUMBER_0){ length = CommonConstant.Numbers.NUMBER_0; //命中文字往前取一位等於n,且往前再取一位不等於 ‘\’,不屬於命中敏感詞 }else if((i - 1) >= CommonConstant.Numbers.NUMBER_0 && ReUtil.count(englishLletter, beforeSubStr) > CommonConstant.Numbers.NUMBER_0 && StrUtil.equals(beforeSubStr, "n") && !StrUtil.equals(StrUtil.sub(text, i - 2, i - 1), "\\")){ length = CommonConstant.Numbers.NUMBER_0; //命中文字往前取一位為任意英文字元,不屬於命中敏感詞 }else if((i - 1) >= CommonConstant.Numbers.NUMBER_0 && ReUtil.count(englishLletter, beforeSubStr) > CommonConstant.Numbers.NUMBER_0) { length = CommonConstant.Numbers.NUMBER_0; } } } if (length > 0) { String keyWord = text.substring(i, i + length); String newKeyWord = ""; if (CommonConstant.Numbers.NUMBER_1 == ignoreCase && CommonConstant.Numbers.NUMBER_1 == ignoreSpace) { newKeyWord = keyWord.toLowerCase(); newKeyWord = StrUtil.cleanBlank(newKeyWord); } else if (CommonConstant.Numbers.NUMBER_1 == ignoreCase) { newKeyWord = keyWord.toLowerCase(); } else if (CommonConstant.Numbers.NUMBER_1 == ignoreSpace) { newKeyWord = StrUtil.cleanBlank(keyWord); } else { newKeyWord = keyWord; } if(wordMap.containsKey(newKeyWord) && !wordWhiteMap.containsKey(newKeyWord)){ allSensitiveWordList.add(wordMap.get(newKeyWord)); } i = i + length - 1; } } result.put("allHitWord", allSensitiveWordList); return result; } /** * @description: 從詞庫map中進行匹配 * @date: 2024/3/9 10:49 * @param text 待檢測文字 * @param beginIndex 文字下標開始位置 * @param sensitiveWordMap 構建後的敏感詞詞庫map * @param ignoreCase 是否忽略大小寫 1是 0否 * @param ignoreSpace 是否忽略空格 1是 0否 * @return int 返回命中的字元長度 */ private static int checkSensitiveWordNew(String txt, int beginIndex, int matchType, Map sensitiveWordMap, Integer ignoreCase, Integer ignoreSpace) { boolean flag = false; int matchFlag = 0; int firstMatchFlag = 0; char word = 0; Map nowMap = sensitiveWordMap; for(int i = beginIndex; i < txt.length(); i++){ word = txt.charAt(i); if(CommonConstant.Numbers.NUMBER_1 == ignoreSpace && Character.isSpaceChar(word)){ matchFlag++; continue; } if(CommonConstant.Numbers.NUMBER_1 == ignoreCase){ word = Character.toLowerCase(word); } nowMap = (Map)nowMap.get(word); if(nowMap != null){ matchFlag++; if ("1".equals(nowMap.get("isEnd"))){ flag = true; firstMatchFlag = matchFlag; if(minMatchTYpe == matchType){ break; } } }else{ // 解決敏感詞內嵌問題 如 Xinjiang和Xinjiang Independenc兩個詞彙 若文字為Xinjiang Inefb 則不會命中,邏輯上應命中Xinjiang if(matchFlag > firstMatchFlag){ matchFlag = firstMatchFlag; } break; } } if(!flag){ matchFlag = 0; } return matchFlag; } public static void main(String[] args) { //精確匹配 int specialScanWay = 1; //忽略大小寫 int ignoreCase = 1; //原始敏感詞詞庫列表 List<String> wordList = new ArrayList<>(); wordList.add("臺獨"); wordList.add("Xinjiang"); wordList.add("Xinjiang production and construction Corps"); //原始白名單列表 List<String> allWhiteWordList = new ArrayList<>(); allWhiteWordList.add("一臺獨立"); //構建新的敏感詞詞庫map Map<String, String> wordMap = Maps.newHashMap(); //構建新的白名單map Map<String, String> wordWhiteMap = Maps.newHashMap(); //最新詞庫列表(整合原始詞庫和白名單 並進行大小寫處理) List<String> newWordList = Lists.newArrayList(); wordList.forEach(item->{ String word = item; //處理大小寫 if(1 == ignoreCase){ word = item.toLowerCase(); } wordMap.put(word, item); newWordList.add(word); }); if(CollUtil.isNotEmpty(allWhiteWordList)){ allWhiteWordList.forEach(item->{ String word = item; //處理大小寫 if(1 == ignoreCase){ word = item.toLowerCase(); } wordWhiteMap.put(word, item); newWordList.add(word); }); } String text = "這是一段測試文字,xiNJiang production,大膽臺獨分子,這是一臺獨立的計算機"; Map sensitiveWordMap = SensitiveWordUtils.initKeyWordAndWhiteList(newWordList); Map<String, Set<String>> resultMap = SensitiveWordUtils.findAllNew(text, sensitiveWordMap, wordMap, wordWhiteMap, ignoreCase, 0, specialScanWay); System.out.println("resultMap = " + resultMap.toString()); } }