赞
踩
场景:由于早期的历史原因,库中分别创建了两套部门表。登录人分为 A、B 两种类型,各自可选的部门范围不同。
但是后来发现B类人员可选A类中部门,故对于B来说 部门取并集!
问题:两套表中名称相同或相似的部门怎么办?1. 名称重复的;2. 名称相似的。这两种情况都只保留一个,然后据此修改原有数据。
思路:1.先分词 2.然后比较看相似度/匹配度
直接代码:
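注意:计算相似度的方法 IKUtils.getSimilarity 借用了其他算法,它的两个参数类型都是 Vector<String>;而 getSignaleWord 返回的是 List<String>,直接传入会因参数类型不同而编译报错,测试时需要先转换成 Vector(例如 new Vector<String>(list))。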
- <!-- https://mvnrepository.com/artifact/com.huaban/jieba-analysis -->
- <dependency>
- <groupId>com.huaban</groupId>
- <artifactId>jieba-analysis</artifactId>
- <version>1.0.2</version>
- </dependency>
- <!-- https://mvnrepository.com/artifact/com.janeluo/ikanalyzer -->
- <dependency>
- <groupId>com.janeluo</groupId>
- <artifactId>ikanalyzer</artifactId>
- <version>2012_u6</version>
- </dependency>
- <mirror>
- <id>aliyunmaven</id>
- <mirrorOf>*</mirrorOf>
- <name>阿里云公共仓库</name>
- <url>https://maven.aliyun.com/repository/public</url>
- </mirror>
- public class JieBaUtils {
- private static JiebaSegmenter segmenter = new JiebaSegmenter();
- /**
- * 单词 参考的他人例子
- **/
- public static List<String> getSignaleWord(String words) {
- //segmenter.process(text, JiebaSegmenter.SegMode.SEARCH) 两者效果一致
- List<String> resultList = segmenter.sentenceProcess(words);
- return resultList;
- }
- /**
- * 结巴分词 process(str,SegMode.INDEX)
- * @param text
- * @return
- */
- public static Vector<String> participleJieBa(String text) {
- List<SegToken> process = segmenter.process(text, JiebaSegmenter.SegMode.INDEX);
- List<String> collect = process.stream().map(item -> item.word).collect(Collectors.toList());
- return new Vector<>(collect);
- }
- public static void main(String[] args) {
-
- System.out.println(getSignaleWord("数学形态学的表面原子熔融相的STM图像识别算法"));
- System.out.println(participleJieBa("数学形态学的表面原子熔融相的STM图像识别算法"));
- //先分词为集合,然后集合中字段比较
- System.out.println(IKUtils.getSimilarity( participleJieBa("数学形态学的表面原子熔融相的STM图像识别算法") , getSignaleWord("数学形态学的表面原子熔融相的STM图像识别算法") ));
- }
- }
计算匹配度的代码
/**
 * Similarity scoring between two token lists.
 */
public class IKUtils {
    /**
     * Score given to a token on the side where it does not occur
     * (a token that does occur on a side scores 1 there).
     */
    public static double YUZHI = 0.2;

    /**
     * Computes the similarity of two token lists.
     *
     * <p>Every distinct non-null token of either list gets a pair of scores: 1 for each
     * list that contains it and {@link #YUZHI} for each list that does not. The result is
     * the cosine of the two score vectors, i.e.
     * {@code sum(a*b) / sqrt(sum(a*a) * sum(b*b))}; it is 1.0 when both lists hold the
     * same set of tokens.
     *
     * @param T1 tokens of the first text; {@code null} entries are skipped
     * @param T2 tokens of the second text; {@code null} entries are skipped
     * @return the similarity ratio
     * @throws IllegalArgumentException if either list is {@code null} or empty, or if
     *         neither list contains a non-null token (the ratio would otherwise be 0/0)
     * @throws Exception kept on the signature so existing callers keep compiling
     */
    public static double getSimilarity(Vector<String> T1, Vector<String> T2) throws Exception {
        if (T1 == null || T1.isEmpty() || T2 == null || T2.isEmpty()) {
            throw new IllegalArgumentException("传入参数有问题!");
        }

        // Union of both token sets: token -> {score in T1, score in T2}.
        Map<String, double[]> scores = new HashMap<String, double[]>();

        for (String token : T1) {
            if (token != null) {
                // Present in T1; assume absent from T2 until the second pass says otherwise.
                scores.put(token, new double[] {1, YUZHI});
            }
        }

        for (String token : T2) {
            if (token == null) {
                continue;
            }
            double[] pair = scores.get(token);
            if (pair != null) {
                pair[1] = 1; // present in both lists
            } else {
                scores.put(token, new double[] {YUZHI, 1}); // present in T2 only
            }
        }

        // Only null entries on both sides: nothing to compare, and the formula below
        // would silently yield NaN.
        if (scores.isEmpty()) {
            throw new IllegalArgumentException("传入参数有问题!");
        }

        double dot = 0;
        double norm1 = 0;
        double norm2 = 0;
        for (double[] pair : scores.values()) {
            dot += pair[0] * pair[1];
            norm1 += pair[0] * pair[0];
            norm2 += pair[1] * pair[1];
        }
        return dot / Math.sqrt(norm1 * norm2);
    }

}
2.分词并测试
- package com.controller.util;
-
- import org.wltea.analyzer.core.IKSegmenter;
- import org.wltea.analyzer.core.Lexeme;
-
- import java.io.IOException;
- import java.io.StringReader;
- import java.util.Vector;
-
- public class CheckTheSame {
- //大同小异 分词
- public static Vector<String> participle(String str) {
-
- Vector<String> str1 = new Vector<String>();//对输入进行分词
-
- try {
-
- StringReader reader = new StringReader(str);
- IKSegmenter ik = new IKSegmenter(reader, false);//当为true时,分词器进行智能切分
- Lexeme lexeme = null;
-
- while ((lexeme = ik.next()) != null) {
- str1.add(lexeme.getLexemeText());
- }
-
- if (str1.size() == 0) {
- return null;
- }
-
- //分词后
- // System.out.println( "str分词后:" + str1 );
-
- } catch (IOException e1) {
- //System.out.println();
- }
- return str1;
-
- }
-
- /**
- * 返回比较的两个字符串的相似度
- *
- * @param strone
- * @param strtwo
- * @return
- */
- public String getSemblance(String strone, String strtwo) {
- String semblanceString = "0.0000";
- //分词
- Vector<String> strs1 = participle(strone);
- Vector<String> strs2 = participle(strtwo);
- //根据分词返回相似度
- double same = 0;
- try {
- same = IKUtils.getSimilarity(strs1, strs2);
- } catch (Exception e) {
- //System.out.println( e.getMessage() );
- }
- semblanceString = String.valueOf(same);
- //System.out.println( "相似度:" + same );
- return semblanceString;
- }
-
- public static void main(String[] args) {
-
- //分词
- Vector<String> strs1 = participle("蚂蚁金服");
- Vector<String> strs2 = participle("蚂蚁");
-
- //根据分词返回相似度
- double same = 0;
- try {
- same = IKUtils.getSimilarity(strs1, strs2);
- } catch (Exception e) {
- System.out.println(e.getMessage());
- }
-
- System.out.println("相似度:" + same);
- }
- }
感谢大家分享
参考如下:java分词器_11大Java开源中文分词器的使用方法和分词效果对比_好奇博士的博客-CSDN博客
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。