赞
踩
确定性有穷自动机,用于正则表达式的匹配,最长左子式匹配
- /**
- * 检测敏感词
- *
- * @param scriptText
- * @param matchType
- * @return
- */
- public static Set<String> checkSensitiveWord(String scriptText, int matchType) {
- Set<String> sensitiveWordSet = new HashSet<>();
- for (int i = 0; i < scriptText.length(); i++) {
- int length = testSensitiveWord(scriptText, i, matchType, sensitiveWordMap);
- if (length > 0) {
- sensitiveWordSet.add(scriptText.substring(i, i + length));
- i = i + length - 1;
- }
- }
- return sensitiveWordSet;
- }
构建敏感词map
- public static void initSensitiveWordMap(List<WordSenstive> wordSenstives) {
- log.info("开始初始化敏感词map");
- List<String> collect = wordSenstives.stream().map(a -> a.getSenstiveWord()).collect(Collectors.toList());
- Set<String> keyWordSet = new HashSet<String>(collect);
- Map<String, String> newWorMap = null;
- String key = null;
- Map nowMap = null;
- sensitiveWordMap = new HashMap(keyWordSet.size());
- Iterator<String> iterator = keyWordSet.iterator();
- while (iterator.hasNext()) {
- key = iterator.next();
- if (key == null) {
- continue;
- }
- nowMap = sensitiveWordMap;
- for (int i = 0; i < key.length(); i++) {
- char keyChar = key.charAt(i);
- Object wordMap = nowMap.get(keyChar);
- if (wordMap != null) {
- nowMap = (Map) wordMap;
- } else {
- newWorMap = new HashMap<String, String>();
- newWorMap.put("isEnd", "0");
- nowMap.put(keyChar, newWorMap);
- nowMap = newWorMap;
- }
- if (i == key.length() - 1) {
- nowMap.put("deepCount", i + 1 + "");
- nowMap.put("isEnd", "1");
- }
- }
- }
- log.info("敏感词map构建完成");
- }
匹配敏感词
- private static int testSensitiveWord(String scriptText, int index, int matchType, Map sensitiveWordMap) {
- boolean flag = false;
- int matchFlag = 0;
- char word = 0;
- Map nowMap = sensitiveWordMap;
- for (int i = index; i < scriptText.length(); i++) {
- word = scriptText.charAt(i);
- nowMap = (Map) nowMap.get(word);
- if (nowMap != null) {
- matchFlag++;//找到相应的key,匹配标识+1
- if ("1".equals(nowMap.get("isEnd"))) {
- Integer deepCount = Integer.valueOf((String) nowMap.get("deepCount"));
- flag = isWord(scriptText, i, deepCount);
- if (1 == matchType || flag) {//1:最小匹配,2:全匹配
- break;
- }
- }
- } else {
- break;
- }
- }
- if (matchFlag < 2 || !flag) {
- matchFlag = 0;
- }
- return matchFlag;
- }
匹配是否是单词
- private static boolean isWord(String scriptText, int i, int deepCount) {
- boolean isWord = true;
- if (i - deepCount >= 0 && scriptText.charAt(i - deepCount) > 96 && scriptText.charAt(i - deepCount) < 123) {
- isWord = false;
- }
- return isWord;
- }
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。