当前位置:   article > 正文

java过滤敏感词汇

filteredresult java

前言

现在几乎所有的网站在发布带有文字信息的内容时都会要求过滤掉反动的、不健康的、影响社会安定的等敏感词汇,这里为大家提供了可以实现这种功能的解决方案

第一种方式

  • 创建敏感词汇文件;首先需要准备一个txt格式的文件用于存放需要过滤的敏感词汇,这个文件放到resources资源文件的根目录

    代码如下

  1. package com.xxxx.service;
  2. import lombok.Data;
  3. import org.springframework.stereotype.Service;
  4. import java.io.BufferedReader;
  5. import java.io.IOException;
  6. import java.io.InputStreamReader;
  7. import java.io.UnsupportedEncodingException;
  8. import java.util.ArrayList;
  9. import java.util.Collection;
  10. import java.util.HashMap;
  11. import java.util.HashSet;
  12. import java.util.List;
  13. import java.util.Set;
  14. /**
  15. * 敏感词汇service
  16. *
  17. * @author
  18. * @date
  19. */
  20. @Data
  21. @Service
  22. public class SensitiveWordService {
  23. private StringBuilder replaceAll;
  24. /**
  25. * 编码
  26. * <P>
  27. * 在读敏感词汇文件时需要用到
  28. */
  29. private String encoding = "UTF-8";
  30. /**
  31. * 替换字符窜
  32. * <P>
  33. * 用于替换敏感词汇的字符窜
  34. */
  35. private String replceStr = "*";
  36. /**
  37. *单次替换的敏感词汇的长度
  38. */
  39. private int replceSize = 500;
  40. /**
  41. * 敏感词汇文件
  42. * <P>
  43. * 此文件放在资源文件的根目录下
  44. */
  45. private String fileName = "censorwords.txt";
  46. private List<String> arrayList;
  47. /**
  48. * 包含的敏感词列表,过滤掉重复项
  49. */
  50. public Set<String> sensitiveWordSet;
  51. /**
  52. * 包含的敏感词列表,包括重复项,统计次数
  53. */
  54. public List<String> sensitiveWordList;
  55. /**
  56. * 移除敏感词汇
  57. *
  58. * @param str 需要过滤的字符窜
  59. *
  60. * @return 过滤之后的字符窜
  61. */
  62. public String removeSensitiveWord(String str){
  63. SensitiveWordService sw = new SensitiveWordService("censorwords.txt");
  64. sw.InitializationWork();
  65. return sw.filterInfo(str);
  66. }
  67. /**
  68. * 拦截信息
  69. * <P>
  70. * 过滤掉敏感词汇的方法
  71. *
  72. * @param str 将要被过滤信息
  73. *
  74. * @return 过滤后的信息
  75. */
  76. public String filterInfo(String str) {
  77. sensitiveWordSet = new HashSet<String>();
  78. sensitiveWordList= new ArrayList<>();
  79. StringBuilder buffer = new StringBuilder(str);
  80. HashMap<Integer, Integer> hash = new HashMap<Integer, Integer>(arrayList.size());
  81. String temp;
  82. for(int x = 0; x < arrayList.size();x++) {
  83. temp = arrayList.get(x);
  84. int findIndexSize = 0;
  85. for(int start = -1;(start=buffer.indexOf(temp,findIndexSize)) > -1;){
  86. //System.out.println("###replace="+temp);
  87. findIndexSize = start+temp.length();//从已找到的后面开始找
  88. Integer mapStart = hash.get(start);//起始位置
  89. //满足1个,即可更新map
  90. if(mapStart == null || (mapStart != null && findIndexSize > mapStart)){
  91. hash.put(start, findIndexSize);
  92. //System.out.println("###敏感词:"+buffer.substring(start, findIndexSize));
  93. }
  94. }
  95. }
  96. Collection<Integer> values = hash.keySet();
  97. for(Integer startIndex : values){
  98. Integer endIndex = hash.get(startIndex);
  99. //获取敏感词,并加入列表,用来统计数量
  100. String sensitive = buffer.substring(startIndex, endIndex);
  101. //System.out.println("###敏感词:"+sensitive);
  102. if (!sensitive.contains("*")) {//添加敏感词到集合
  103. sensitiveWordSet.add(sensitive);
  104. sensitiveWordList.add(sensitive);
  105. }
  106. buffer.replace(startIndex, endIndex, replaceAll.substring(0,endIndex-startIndex));
  107. }
  108. hash.clear();
  109. return buffer.toString();
  110. }
  111. /**
  112. * 初始化敏感词库
  113. */
  114. private void InitializationWork() {
  115. replaceAll = new StringBuilder(replceSize);
  116. for(int x=0;x < replceSize;x++)
  117. {
  118. replaceAll.append(replceStr);
  119. }
  120. //加载词库
  121. arrayList = new ArrayList<String>();
  122. InputStreamReader read = null;
  123. BufferedReader bufferedReader = null;
  124. try {
  125. read = new InputStreamReader(SensitiveWordService.class.getClassLoader().getResourceAsStream(fileName),encoding);
  126. bufferedReader = new BufferedReader(read);
  127. for(String txt = null;(txt = bufferedReader.readLine()) != null;){
  128. if(!arrayList.contains(txt))
  129. arrayList.add(txt);
  130. }
  131. } catch (UnsupportedEncodingException e) {
  132. e.printStackTrace();
  133. } catch (IOException e) {
  134. e.printStackTrace();
  135. }finally{
  136. try {
  137. if(null != bufferedReader)
  138. bufferedReader.close();
  139. } catch (IOException e) {
  140. e.printStackTrace();
  141. }
  142. try {
  143. if(null != read)
  144. read.close();
  145. } catch (IOException e) {
  146. e.printStackTrace();
  147. }
  148. }
  149. }
  150. /**
  151. * 测试方法
  152. *
  153. * @param args 参数
  154. */
  155. public static void main(String[] args){
  156. long startNumer = System.currentTimeMillis();
  157. SensitiveWordService sw = new SensitiveWordService("censorwords.txt");
  158. sw.InitializationWork();
  159. //System.out.println("敏感词的数量:" + arrayList.size());
  160. String str = "你好呀,我这里有敏感词汇,来过滤我呀";
  161. System.out.println("被检测字符串长度:"+str.length());
  162. str = sw.filterInfo(str);
  163. long endNumber = System.currentTimeMillis();
  164. //System.out.println("语句中包含敏感词的个数为:" + sensitiveWordSet.size() + "。包含:" + sensitiveWordSet);
  165. //System.out.println("语句中包含敏感词的个数为:" + sensitiveWordList.size() + "。包含:" + sensitiveWordList);
  166. System.out.println("总共耗时:"+(endNumber-startNumer)+"ms");
  167. System.out.println("替换后的字符串为:\n"+str);
  168. System.out.println("替换后的字符串长度为:\n"+str.length());
  169. }
  170. /**
  171. * 有参构造
  172. * <P>
  173. * 文件要求路径在src或resource下,默认文件名为censorwords.txt
  174. * @param fileName 词库文件名(含后缀)
  175. */
  176. public SensitiveWordService(String fileName) {
  177. this.fileName = fileName;
  178. }
  179. /**
  180. * 有参构造
  181. *
  182. * @param replceStr 敏感词被转换的字符
  183. * @param replceSize 初始转义容量
  184. */
  185. public SensitiveWordService(String replceStr, int replceSize){
  186. this.replceStr = fileName;
  187. this.replceSize = replceSize;
  188. }
  189. /**
  190. * 无参构造
  191. */
  192. public SensitiveWordService(){
  193. }
  194. }

第二种方法

  1. package com.xxxx.filters;
  2. import java.io.IOException;
  3. import java.io.InputStream;
  4. import java.util.Enumeration;
  5. import java.util.Properties;
  6. import java.util.regex.Matcher;
  7. import java.util.regex.Pattern;
  /**
   * Regex-based keyword filter (good match quality, relatively slow).
   * <p>
   * Keywords are the property names found in {@code keywords.properties} on the
   * classpath; every occurrence of a keyword in the filtered text is replaced by
   * a single {@code *}.
   * <p>
   * Based on: http://blog.csdn.net/linfssay/article/details/7599262
   *
   * @author ShengDecheng
   */
  public class KeyWordFilter {
      /** Compiled alternation of all keywords, or {@code null} when none are loaded. */
      private static Pattern pattern = null;
      /** Number of keywords that went into {@link #pattern}. */
      private static int keywordsCount = 0;

      /**
       * Loads {@code keywords.properties} from the classpath and rebuilds the
       * keyword pattern and count from its property names.
       * <p>
       * A missing file or a read error is reported on stderr; filtering then
       * uses whatever keywords were loaded (possibly none).
       */
      private static void initPattern() {
          Properties property = new Properties();
          // try-with-resources tolerates a null resource and closes the stream otherwise.
          try (InputStream in = KeyWordFilter.class.getClassLoader().getResourceAsStream("keywords.properties")) {
              if (in == null) {
                  System.err.println("keywords.properties not found on the classpath; no keywords loaded");
              } else {
                  property.load(in);
              }
          } catch (IOException ioEx) {
              ioEx.printStackTrace();
          }
          pattern = buildPattern(property.propertyNames());
          // Assigned (not incremented) so repeated initialization does not
          // accumulate; the empty key is skipped by buildPattern and not counted.
          keywordsCount = property.containsKey("") ? property.size() - 1 : property.size();
      }

      /**
       * Builds a pattern matching any of the given keywords literally.
       *
       * @param keywords keywords to match; empty entries are ignored
       * @return the compiled pattern, or {@code null} when there is no usable keyword
       */
      static Pattern buildPattern(Enumeration<?> keywords) {
          StringBuilder alternation = new StringBuilder();
          while (keywords.hasMoreElements()) {
              String keyword = String.valueOf(keywords.nextElement());
              if (keyword.isEmpty()) {
                  continue; // an empty alternative would match at every position
              }
              if (alternation.length() > 0) {
                  alternation.append('|');
              }
              // Quote so regex metacharacters in a keyword are matched literally.
              alternation.append(Pattern.quote(keyword));
          }
          return alternation.length() == 0 ? null : Pattern.compile(alternation.toString());
      }

      /**
       * Replaces every keyword occurrence in {@code str} with {@code *}.
       *
       * @param str text to filter
       * @return the filtered text; the input itself when it is {@code null} or no keywords are loaded
       */
      private static String doFilter(String str) {
          if (pattern == null || str == null) {
              return str;
          }
          Matcher m = pattern.matcher(str);
          return m.replaceAll("*");
      }

      /**
       * Manual smoke test: loads the keywords, filters a sample sentence and
       * prints the keyword count, timing and result.
       *
       * @param args unused
       */
      public static void main(String[] args) {
          long startNumer = System.currentTimeMillis();
          initPattern();
          System.out.println("敏感词的数量:" + keywordsCount);
          String str = "你好呀,我这里有敏感词汇,来过滤我呀";
          System.out.println("被检测字符串长度:" + str.length());
          str = doFilter(str);
          long endNumber = System.currentTimeMillis();
          System.out.println("总共耗时:" + (endNumber - startNumer) + "ms");
          System.out.println("替换后的字符串为:\n" + str);
      }
  }

敏感词汇文件keywords.properties

868068-20180106161323815-722995588.png
868068-20180106161341503-314687336.png

转载于:https://www.cnblogs.com/nikeodong/p/8214523.html

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/AllinToyou/article/detail/153847
推荐阅读
相关标签
  

闽ICP备14008679号