赞
踩
Word 分词组件支持加载自定义词库,但其自带的内置词库实际上无法删除,导致分词效果很差。
-
- import com.alibaba.fastjson.JSON;
- import org.apache.commons.lang3.StringUtils;
- import org.apdplat.word.WordSegmenter;
- import org.apdplat.word.dictionary.DictionaryFactory;
- import org.apdplat.word.segmentation.Word;
- import org.apdplat.word.util.WordConfTools;
-
- import java.util.ArrayList;
- import java.util.List;
-
- /********************************************
- * 模块名称: 主要功能是做标题分词的操作,工具类
- * 功能说明:
- * 开发人员:
- * 开发时间:2020/8/29 12:21
- * v1.0.0.0 2020/8/29-01
- *******************************************/
-
- public class WordPartitionUtils {
-
-
- public static void main(String[] args) {
- //分词效果加载词库
- DictionaryFactory.getDictionary().clear();
- List<String> parameterList = new ArrayList<>();
- parameterList.add("对决");
- DictionaryFactory.getDictionary().addAll(parameterList);
- //词典
- WordConfTools.set("dic.path", "classpath:word/custom.txt");
- //词性标注数据
- WordConfTools.set("part.of.speech.dic.path", "classpath:word/part_of_speech.txt");
- //词性说明数据
- WordConfTools.set("part.of.speech.des.path", "classpath:word/part_of_speech_des.txt");
- //二元模型
- WordConfTools.set("bigram.path", "classpath:word/bigram.txt");
- //三元模型
- WordConfTools.set("trigram.path", "classpath:word/trigram.txt");
- //停用词词典
- WordConfTools.set("stopwords.path", "classpath:word/stopwords.txt");
- //用于分割词的标点符号
- WordConfTools.set("punctuation.path", "classpath:word/punctuation.txt");
- //百家姓
- WordConfTools.set("surname.path", "classpath:word/surname.txt");
- //数量词
- WordConfTools.set("quantifier.path", "classpath:word/quantifier.txt");
- // WordConfTools.forceOverride("classpath:custom.txt");
- // WordConfTools.set("dic.path", "classpath:dic.txt,classpath:custom.txt");
- DictionaryFactory.reload();
- String title = "<刺猬索尼克>曝正片片段,音速小子上演高萌对决";
- List<Word> list = WordSegmenter.seg(title);
- String value = WordConfTools.get("dic.path");
-
- System.out.println(JSON.toJSONString(list));
- System.out.println("value =" + value);
-
- }
-
-
- /**
- * 针对【标题不含QYJC(企业简称) 且 标题不含负面关键词 且 标题不含重要关键词 且 dsCode为转化率低于50%的栏目】进行过滤
- *
- * @param title 入参 标题
- * @param dsCode 资讯的编码
- * @return false 不满足条件,true满足条件
- */
- public Boolean isContionWord(String title, String dsCode, List<String> parameterDsCodeList) {
-
- Boolean wordFlag = false;
- List<Word> list = WordSegmenter.seg(title);
- for (Word word : list) {
- if (word.getPartOfSpeech() != null && word.getPartOfSpeech().getPos().equals("i")) {
- if (StringUtils.isNotBlank(word.getText())) { //匹配上的关键字
- wordFlag = true;
- // log.error("【Word分词标题为】:{},【匹配上关键字】:{}", title, word.getText());
- } else {
- // log.error("【Word分词标题为】:{},【匹配关键字-无】", title);
- }
- break;
- }
- }
- if (wordFlag && parameterDsCodeList.contains(dsCode)) {
- return true;
- }
- return false;
- }
运行结果:
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]
[{"acronymPinYin":"","antonym":[],"frequency":0,"fullPinYin":"","partOfSpeech":{"des":"未知","pos":"i"},"synonym":[],"text":"刺"},{"acronymPinYin":"","antonym":[],"frequency":0,"fullPinYin":"","partOfSpeech":{"$ref":"$[0].partOfSpeech"},"synonym":[],"text":"猬"},{"acronymPinYin":"","antonym":[],"frequency":0,"fullPinYin":"","partOfSpeech":{"des":"","pos":"nr"},"synonym":[],"text":"索尼克"},{"acronymPinYin":"","antonym":[],"frequency":0,"fullPinYin":"","partOfSpeech":{"$ref":"$[0].partOfSpeech"},"synonym":[],"text":"曝"},{"acronymPinYin":"","antonym":[],"frequency":0,"fullPinYin":"","partOfSpeech":{"$ref":"$[0].partOfSpeech"},"synonym":[],"text":"正"},{"acronymPinYin":"","antonym":[],"frequency":0,"fullPinYin":"","partOfSpeech":{"$ref":"$[0].partOfSpeech"},"synonym":[],"text":"片"},{"acronymPinYin":"","antonym":[],"frequency":0,"fullPinYin":"","partOfSpeech":{"$ref":"$[0].partOfSpeech"},"synonym":[],"text":"片段"},{"acronymPinYin":"","antonym":[],"frequency":0,"fullPinYin":"","partOfSpeech":{"$ref":"$[0].partOfSpeech"},"synonym":[],"text":"音"},{"acronymPinYin":"","antonym":[],"frequency":0,"fullPinYin":"","partOfSpeech":{"$ref":"$[0].partOfSpeech"},"synonym":[],"text":"速"},{"acronymPinYin":"","antonym":[],"frequency":0,"fullPinYin":"","partOfSpeech":{"$ref":"$[0].partOfSpeech"},"synonym":[],"text":"小"},{"acronymPinYin":"","antonym":[],"frequency":0,"fullPinYin":"","partOfSpeech":{"$ref":"$[0].partOfSpeech"},"synonym":[],"text":"子"},{"acronymPinYin":"","antonym":[],"frequency":0,"fullPinYin":"","partOfSpeech":{"$ref":"$[0].partOfSpeech"},"synonym":[],"text":"上演"},{"acronymPinYin":"","antonym":[],"frequency":0,"fullPinYin":"","partOfSpeech":{"des":"","pos":"nr"},"synonym":[],"text":"高萌对"},{"acronymPinYin":"","antonym":[],"frequency":0,"fullPinYin":"","partOfSpeech":{"$ref":"$[0].partOfSpeech"},"synonym":[],"text":"决"}]
value =classpath:word/custom.txt
使用 Word 分词实现文本过滤,耗时为个位数毫秒;
使用 JDK 的 Stream 流式 contains 实现文本过滤,耗时同样为个位数毫秒;两者差异不大。
-- Latest 1000 filter-log rows produced by the Word-segmentation path.
SELECT
    t.keyword     AS '标题',
    t.tag_count   AS '耗时(毫秒)',
    t.tags        AS '过滤方式',
    t.remark      AS '返回匹配结果',
    t.is_add      AS '结果0 false 1 true',
    t.xwbt        AS '返回结果',
    t.mtcc        AS '数据编码',
    t.update_time AS '操作时间'
FROM tbm_news_log t
WHERE t.tags = 'WORD'
ORDER BY t.id DESC
LIMIT 1000;
-- Latest 1000 filter-log rows produced by the JDK String#contains path.
SELECT
    t.keyword     AS '标题',
    t.tag_count   AS '耗时(毫秒)',
    t.tags        AS '过滤方式',
    t.remark      AS '返回匹配结果',
    t.is_add      AS '结果0 false 1 true',
    t.xwbt        AS '返回结果',
    t.mtcc        AS '数据编码',
    t.update_time AS '操作时间'
FROM tbm_news_log t
WHERE t.tags = 'JDKCONTAINS'
ORDER BY t.id DESC
LIMIT 1000;
测试方法:先在 Redis 中缓存 8 万条数据,再对其进行过滤;
对 1000 条标题的过滤耗时如上表所示,两种方式差异不明显。
pom.xml 依赖配置如下:
<!-- https://mvnrepository.com/artifact/com.janeluo/ikanalyzer -->
<!-- IK Analyzer Chinese segmenter. Lucene and slf4j-api are excluded to avoid
     version clashes with the copies pulled in elsewhere in this project. -->
<dependency>
    <groupId>com.janeluo</groupId>
    <artifactId>ikanalyzer</artifactId>
    <version>2012_u6</version>
    <exclusions>
        <exclusion>
            <artifactId>lucene-queryparser</artifactId>
            <groupId>org.apache.lucene</groupId>
        </exclusion>
        <exclusion>
            <artifactId>lucene-core</artifactId>
            <groupId>org.apache.lucene</groupId>
        </exclusion>
        <exclusion>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
        </exclusion>
    </exclusions>
</dependency>


<!-- apdplat "word" segmenter (WordSegmenter / WordConfTools / DictionaryFactory
     used above). Same Lucene/slf4j exclusions, plus lucene-analyzers-common. -->
<dependency>
    <groupId>org.apdplat</groupId>
    <artifactId>word</artifactId>
    <version>${apdplat.word.version}</version>
    <exclusions>
        <exclusion>
            <artifactId>lucene-queryparser</artifactId>
            <groupId>org.apache.lucene</groupId>
        </exclusion>
        <exclusion>
            <artifactId>lucene-core</artifactId>
            <groupId>org.apache.lucene</groupId>
        </exclusion>
        <exclusion>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
        </exclusion>
        <exclusion>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-analyzers-common</artifactId>
        </exclusion>
    </exclusions>
</dependency>
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。