赞
踩
首先在pom文件中加入分词处理的JAR包
<dependency>
    <groupId>com.hankcs</groupId>
    <artifactId>hanlp</artifactId>
    <version>portable-1.6.1</version>
</dependency>
1、创建封装分词数据的Bean
-
- package com.qlys.frame.model.impl;
-
- import java.io.Serializable;
- import java.util.concurrent.atomic.AtomicInteger;
-
- /**
- * 分词基本信息
- */
/**
 * Value object describing one segmented word: its surface text, its
 * part-of-speech tag, and a thread-safe occurrence counter.
 */
public class SegmentWord implements Serializable {

    private static final long serialVersionUID = 5662341029767237202L;

    /** Surface form of the word. */
    private String name;

    /** Part-of-speech tag assigned by the tokenizer. */
    private String pos;

    /** Occurrence count; atomic so entries in a concurrent map can be bumped safely. */
    private AtomicInteger frequency = new AtomicInteger();

    /**
     * Creates an entry with an initial frequency of zero.
     *
     * @param name the word text
     * @param pos  the part-of-speech tag
     */
    public SegmentWord(String name, String pos) {
        this.name = name;
        this.pos = pos;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getPos() {
        return pos;
    }

    public void setPos(String pos) {
        this.pos = pos;
    }

    public AtomicInteger getFrequency() {
        return frequency;
    }

    public void setFrequency(AtomicInteger frequency) {
        this.frequency = frequency;
    }
}
2、进行分词并形成词频信息
-
- package com.qlys.frame.util;
-
-
- import java.util.ArrayList;
- import java.util.List;
- import java.util.Map;
- import java.util.Optional;
- import java.util.concurrent.ConcurrentHashMap;
-
- import org.slf4j.Logger;
- import org.slf4j.LoggerFactory;
-
- import com.hankcs.hanlp.corpus.tag.Nature;
- import com.hankcs.hanlp.seg.common.Term;
- import com.hankcs.hanlp.tokenizer.NLPTokenizer;
- import com.qlys.frame.model.impl.SegmentWord;
- /**
- * 分词处理工具类
- */
- public class TokenizerUtil {
-
-
-
- private static final Logger log = LoggerFactory.getLogger(TokenizerUtil.class);
-
- /**
- * 分词策略
- * @return 分词信息
- */
- public List<Term> segmentCategory(String content){
- return NLPTokenizer.segment(content);
- }
-
- /**
- * 分词并计算词频
- * @param content 需要进行分词的文本
- * @return 分词后的词组信息及词频
- */
- public Map<String, SegmentWord> segment(String content) {
- log.debug("开始执行分词");
- List<Term> termList = this.segmentCategory(content);
- Map<String, SegmentWord> map = new ConcurrentHashMap<String, SegmentWord>();
- termList.forEach(i -> Optional.ofNullable(i.nature == Nature.w ? null : i.nature).ifPresent(m -> map
- .computeIfAbsent(i.word, k -> new SegmentWord(i.word, i.nature.toString())).getFrequency().incrementAndGet()));
- return map;
- }
- /**
- * 相似度运算
- *
- * @param s 分词1
- * @param o 分词2
- * @return 分词1和分词2的相似度
- */
- public double similarity(Map<String, SegmentWord> s,Map<String, SegmentWord> o) {
- List<String> keys = new ArrayList<String>();
- keys.addAll(s.keySet());
- keys.retainAll(o.keySet());
- //运算分子数据
- return keys.stream().map(val->s.get(val).getFrequency().intValue()*o.get(val).getFrequency().intValue()).reduce((a,b)->a+b).get()
- /
- Math.sqrt(s.values().stream().map(val->Math.pow(val.getFrequency().intValue(),2)).reduce((a,b)->a+b).get())
- / Math.sqrt(o.values().stream().map(val->Math.pow(val.getFrequency().intValue(),2)).reduce((a,b)->a+b).get());
- }
-
-
- public static void main(String[] args) {
- TokenizerUtil util = new TokenizerUtil();
- Map<String, SegmentWord> map = util.segment("我们把香蕉给猴子因为它们饿了");
- Map<String, SegmentWord> map1 = util.segment("我们不能把香蕉给猴子因为它们还没有成熟");
-
- System.out.println(util.similarity(map, map1));
- System.out.println(util.similarity(util.segment("abc123"), util.segment("abc")));
- }
-
- }
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。