当前位置:   article > 正文

apdplat.word.WordSegmenter 分词功能尝试使用自有词库实现过滤,但实际效果不尽如人意,最终只能改用 JDK 自带的过滤功能(org.apdplat.word)

org.apdplat.word

Word 的分词功能虽然支持自定义词库,但其自带的词库实际上无法删除,导致分词效果很差。

  1. import com.alibaba.fastjson.JSON;
  2. import org.apache.commons.lang3.StringUtils;
  3. import org.apdplat.word.WordSegmenter;
  4. import org.apdplat.word.dictionary.DictionaryFactory;
  5. import org.apdplat.word.segmentation.Word;
  6. import org.apdplat.word.util.WordConfTools;
  7. import java.util.ArrayList;
  8. import java.util.List;
  9. /********************************************
  10. * 模块名称: 主要功能是做标题分词的操作,工具类
  11. * 功能说明:
  12. * 开发人员:
  13. * 开发时间:2020/8/29 12:21
  14. * v1.0.0.0 2020/8/29-01
  15. *******************************************/
  16. public class WordPartitionUtils {
  17. public static void main(String[] args) {
  18. //分词效果加载词库
  19. DictionaryFactory.getDictionary().clear();
  20. List<String> parameterList = new ArrayList<>();
  21. parameterList.add("对决");
  22. DictionaryFactory.getDictionary().addAll(parameterList);
  23. //词典
  24. WordConfTools.set("dic.path", "classpath:word/custom.txt");
  25. //词性标注数据
  26. WordConfTools.set("part.of.speech.dic.path", "classpath:word/part_of_speech.txt");
  27. //词性说明数据
  28. WordConfTools.set("part.of.speech.des.path", "classpath:word/part_of_speech_des.txt");
  29. //二元模型
  30. WordConfTools.set("bigram.path", "classpath:word/bigram.txt");
  31. //三元模型
  32. WordConfTools.set("trigram.path", "classpath:word/trigram.txt");
  33. //停用词词典
  34. WordConfTools.set("stopwords.path", "classpath:word/stopwords.txt");
  35. //用于分割词的标点符号
  36. WordConfTools.set("punctuation.path", "classpath:word/punctuation.txt");
  37. //百家姓
  38. WordConfTools.set("surname.path", "classpath:word/surname.txt");
  39. //数量词
  40. WordConfTools.set("quantifier.path", "classpath:word/quantifier.txt");
  41. // WordConfTools.forceOverride("classpath:custom.txt");
  42. // WordConfTools.set("dic.path", "classpath:dic.txt,classpath:custom.txt");
  43. DictionaryFactory.reload();
  44. String title = "<刺猬索尼克>曝正片片段,音速小子上演高萌对决";
  45. List<Word> list = WordSegmenter.seg(title);
  46. String value = WordConfTools.get("dic.path");
  47. System.out.println(JSON.toJSONString(list));
  48. System.out.println("value =" + value);
  49. }
  50. /**
  51. * 针对【标题不含QYJC(企业简称) 且 标题不含负面关键词 且 标题不含重要关键词 且 dsCode为转化率低于50%的栏目】进行过滤
  52. *
  53. * @param title 入参 标题
  54. * @param dsCode 资讯的编码
  55. * @return false 不满足条件,true满足条件
  56. */
  57. public Boolean isContionWord(String title, String dsCode, List<String> parameterDsCodeList) {
  58. Boolean wordFlag = false;
  59. List<Word> list = WordSegmenter.seg(title);
  60. for (Word word : list) {
  61. if (word.getPartOfSpeech() != null && word.getPartOfSpeech().getPos().equals("i")) {
  62. if (StringUtils.isNotBlank(word.getText())) { //匹配上的关键字
  63. wordFlag = true;
  64. // log.error("【Word分词标题为】:{},【匹配上关键字】:{}", title, word.getText());
  65. } else {
  66. // log.error("【Word分词标题为】:{},【匹配关键字-无】", title);
  67. }
  68. break;
  69. }
  70. }
  71. if (wordFlag && parameterDsCodeList.contains(dsCode)) {
  72. return true;
  73. }
  74. return false;
  75. }

运行结果:

SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]
[{"acronymPinYin":"","antonym":[],"frequency":0,"fullPinYin":"","partOfSpeech":{"des":"未知","pos":"i"},"synonym":[],"text":"刺"},{"acronymPinYin":"","antonym":[],"frequency":0,"fullPinYin":"","partOfSpeech":{"$ref":"$[0].partOfSpeech"},"synonym":[],"text":"猬"},{"acronymPinYin":"","antonym":[],"frequency":0,"fullPinYin":"","partOfSpeech":{"des":"","pos":"nr"},"synonym":[],"text":"索尼克"},{"acronymPinYin":"","antonym":[],"frequency":0,"fullPinYin":"","partOfSpeech":{"$ref":"$[0].partOfSpeech"},"synonym":[],"text":"曝"},{"acronymPinYin":"","antonym":[],"frequency":0,"fullPinYin":"","partOfSpeech":{"$ref":"$[0].partOfSpeech"},"synonym":[],"text":"正"},{"acronymPinYin":"","antonym":[],"frequency":0,"fullPinYin":"","partOfSpeech":{"$ref":"$[0].partOfSpeech"},"synonym":[],"text":"片"},{"acronymPinYin":"","antonym":[],"frequency":0,"fullPinYin":"","partOfSpeech":{"$ref":"$[0].partOfSpeech"},"synonym":[],"text":"片段"},{"acronymPinYin":"","antonym":[],"frequency":0,"fullPinYin":"","partOfSpeech":{"$ref":"$[0].partOfSpeech"},"synonym":[],"text":"音"},{"acronymPinYin":"","antonym":[],"frequency":0,"fullPinYin":"","partOfSpeech":{"$ref":"$[0].partOfSpeech"},"synonym":[],"text":"速"},{"acronymPinYin":"","antonym":[],"frequency":0,"fullPinYin":"","partOfSpeech":{"$ref":"$[0].partOfSpeech"},"synonym":[],"text":"小"},{"acronymPinYin":"","antonym":[],"frequency":0,"fullPinYin":"","partOfSpeech":{"$ref":"$[0].partOfSpeech"},"synonym":[],"text":"子"},{"acronymPinYin":"","antonym":[],"frequency":0,"fullPinYin":"","partOfSpeech":{"$ref":"$[0].partOfSpeech"},"synonym":[],"text":"上演"},{"acronymPinYin":"","antonym":[],"frequency":0,"fullPinYin":"","partOfSpeech":{"des":"","pos":"nr"},"synonym":[],"text":"高萌对"},{"acronymPinYin":"","antonym":[],"frequency":0,"fullPinYin":"","partOfSpeech":{"$ref":"$[0].partOfSpeech"},"synonym":[],"text":"决"}]
value =classpath:word/custom.txt

使用 Word 分词来实现文本过滤,单条耗时为个位数(毫秒);

 

使用 JDK 的 Stream 流式过滤来实现文本过滤,单条耗时同样为个位数(毫秒),两者差异不大。

-- Latest 1000 filter-log rows produced by the Word-segmenter path,
-- used to compare its per-title latency against the JDK path below.
SELECT
    t.keyword AS '标题',
    t.tag_count AS '耗时(毫秒)',
    t.tags AS '过滤方式',
    t.remark AS '返回匹配结果',
    t.is_add AS '结果0 false 1 true',
    t.xwbt AS '返回结果',
    t.mtcc AS '数据编码',
    t.update_time AS '操作时间'
FROM
    tbm_news_log t where  t.tags='WORD'
ORDER BY
    t.id DESC   limit 1000;

-- Same columns for the JDK String.contains / Stream filter path,
-- so the two result sets can be compared side by side.
SELECT
    t.keyword AS '标题',
    t.tag_count AS '耗时(毫秒)',
    t.tags AS '过滤方式',
    t.remark AS '返回匹配结果',
    t.is_add AS '结果0 false 1 true',
    t.xwbt AS '返回结果',
    t.mtcc AS '数据编码',
    t.update_time AS '操作时间'
FROM
    tbm_news_log t where  t.tags='JDKCONTAINS'
ORDER BY
    t.id DESC  limit 1000;

 

综上:先用 Redis 缓存 8 万条数据,然后进行过滤。

测试1000条数据的标题过滤效果如截图,差异不明显。

 

 

依赖pom.xml

 

<!-- https://mvnrepository.com/artifact/com.janeluo/ikanalyzer -->
<!-- IK Analyzer Chinese segmenter; transitive lucene/slf4j artifacts are
     excluded — presumably to avoid version conflicts with the ones already
     on this project's classpath (TODO confirm). -->
<dependency>
    <groupId>com.janeluo</groupId>
    <artifactId>ikanalyzer</artifactId>
    <version>2012_u6</version>
    <exclusions>
        <exclusion>
            <artifactId>lucene-queryparser</artifactId>
            <groupId>org.apache.lucene</groupId>
        </exclusion>
        <exclusion>
            <artifactId>lucene-core</artifactId>
            <groupId>org.apache.lucene</groupId>
        </exclusion>
        <exclusion>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
        </exclusion>
    </exclusions>
</dependency>
<!-- apdplat word segmenter used by WordPartitionUtils; same lucene/slf4j
     exclusions, plus lucene-analyzers-common. -->
<dependency>
    <groupId>org.apdplat</groupId>
    <artifactId>word</artifactId>
    <version>${apdplat.word.version}</version>
    <exclusions>
        <exclusion>
            <artifactId>lucene-queryparser</artifactId>
            <groupId>org.apache.lucene</groupId>
        </exclusion>
        <exclusion>
            <artifactId>lucene-core</artifactId>
            <groupId>org.apache.lucene</groupId>
        </exclusion>
        <exclusion>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
        </exclusion>
        <exclusion>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-analyzers-common</artifactId>
        </exclusion>
    </exclusions>
</dependency>

 

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/花生_TL007/article/detail/610018
推荐阅读
相关标签
  

闽ICP备14008679号