当前位置:   article > 正文

敏感词检测算法_敏感字检测算法

敏感字检测算法

思路:DFA算法

DFA(Deterministic Finite Automaton,确定性有穷自动机)常用于正则表达式的匹配,遵循最长左子式匹配原则。

  1. /**
  2. * 检测敏感词
  3. *
  4. * @param scriptText
  5. * @param matchType
  6. * @return
  7. */
  8. public static Set<String> checkSensitiveWord(String scriptText, int matchType) {
  9. Set<String> sensitiveWordSet = new HashSet<>();
  10. for (int i = 0; i < scriptText.length(); i++) {
  11. int length = testSensitiveWord(scriptText, i, matchType, sensitiveWordMap);
  12. if (length > 0) {
  13. sensitiveWordSet.add(scriptText.substring(i, i + length));
  14. i = i + length - 1;
  15. }
  16. }
  17. return sensitiveWordSet;
  18. }

构建敏感词map

  1. public static void initSensitiveWordMap(List<WordSenstive> wordSenstives) {
  2. log.info("开始初始化敏感词map");
  3. List<String> collect = wordSenstives.stream().map(a -> a.getSenstiveWord()).collect(Collectors.toList());
  4. Set<String> keyWordSet = new HashSet<String>(collect);
  5. Map<String, String> newWorMap = null;
  6. String key = null;
  7. Map nowMap = null;
  8. sensitiveWordMap = new HashMap(keyWordSet.size());
  9. Iterator<String> iterator = keyWordSet.iterator();
  10. while (iterator.hasNext()) {
  11. key = iterator.next();
  12. if (key == null) {
  13. continue;
  14. }
  15. nowMap = sensitiveWordMap;
  16. for (int i = 0; i < key.length(); i++) {
  17. char keyChar = key.charAt(i);
  18. Object wordMap = nowMap.get(keyChar);
  19. if (wordMap != null) {
  20. nowMap = (Map) wordMap;
  21. } else {
  22. newWorMap = new HashMap<String, String>();
  23. newWorMap.put("isEnd", "0");
  24. nowMap.put(keyChar, newWorMap);
  25. nowMap = newWorMap;
  26. }
  27. if (i == key.length() - 1) {
  28. nowMap.put("deepCount", i + 1 + "");
  29. nowMap.put("isEnd", "1");
  30. }
  31. }
  32. }
  33. log.info("敏感词map构建完成");
  34. }

匹配敏感词

  1. private static int testSensitiveWord(String scriptText, int index, int matchType, Map sensitiveWordMap) {
  2. boolean flag = false;
  3. int matchFlag = 0;
  4. char word = 0;
  5. Map nowMap = sensitiveWordMap;
  6. for (int i = index; i < scriptText.length(); i++) {
  7. word = scriptText.charAt(i);
  8. nowMap = (Map) nowMap.get(word);
  9. if (nowMap != null) {
  10. matchFlag++;//找到相应的key,匹配标识+1
  11. if ("1".equals(nowMap.get("isEnd"))) {
  12. Integer deepCount = Integer.valueOf((String) nowMap.get("deepCount"));
  13. flag = isWord(scriptText, i, deepCount);
  14. if (1 == matchType || flag) {//1:最小匹配,2:全匹配
  15. break;
  16. }
  17. }
  18. } else {
  19. break;
  20. }
  21. }
  22. if (matchFlag < 2 || !flag) {
  23. matchFlag = 0;
  24. }
  25. return matchFlag;
  26. }

判断匹配结果是否为独立单词

  1. private static boolean isWord(String scriptText, int i, int deepCount) {
  2. boolean isWord = true;
  3. if (i - deepCount >= 0 && scriptText.charAt(i - deepCount) > 96 && scriptText.charAt(i - deepCount) < 123) {
  4. isWord = false;
  5. }
  6. return isWord;
  7. }

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/IT小白/article/detail/153844
推荐阅读
相关标签
  

闽ICP备14008679号