赞
踩
该方法基于 HanLP 分词,采用 SimHash 算法思想计算文本相似度。
参考了以下文章中的代码与原理(部分):
simhash算法及原理简介
海量文本用 Simhash, 2小时变4秒! | 文本分析:大规模文本处理(2)
package com.siboo.util; import java.io.BufferedReader; import java.io.IOException; import java.io.Reader; import java.math.BigInteger; import java.sql.Clob; import java.sql.SQLException; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.StringTokenizer; import java.util.regex.Pattern; import com.hankcs.hanlp.HanLP; import com.hankcs.hanlp.corpus.tag.Nature; import com.hankcs.hanlp.seg.common.Term; import com.siboo.utils.StringUtils; /** * * @Title: SimHashUtils.java * @Package com.newtec.knowGraph.data.params * @author 陈笑璞 * @date 2020年6月22日 下午5:35:32 * @Description: 计算文本相似度工具类(采用SimHash算法思想) * */ public class SimHashUtils { private static int hashbits = 64;// 默认64位,即将一个文本转换为64bit数据 private static Pattern CHINES_PATTERIN = Pattern.compile("^[\u4e80-\u9fa5]+$" ); private static final int DWEIGHT = 1;// 默认权重 public SimHashUtils() { super(); } public SimHashUtils(int hashbits) { super(); this.hashbits = hashbits; } /** * * @Title: splitFourEqual * @Description: 将Simhash签名值平均分割为4等份 * @param signature 字符串 * @return List<String> 返回分割的4等份字符串 */ public static List<String> splitFourEqual(String signature) { int length = signature.length(); int m = 4; int num = length / m; List<String> list = new ArrayList<String>(); for (int i = 0; i < m; i++) { list.add(signature.substring(i*num, (i+1)*num)); } return list; } /** * * @Title: ClobToString * @Description: Java读取Oracle的CLOB字段转换为String数据 * @param clob CLOB字段类型数据 * @throws SQLException * @throws IOException * @return String 返回转换为String数据 */ public static String ClobToString(Clob clob) throws SQLException, IOException { Reader is = clob.getCharacterStream();// 得到流 BufferedReader br = new BufferedReader(is); String s = br.readLine(); StringBuffer sb = new StringBuffer(); while (s != null) { // 
执行循环将字符串全部取出付值给StringBuffer由StringBuffer转成String sb.append(s); s = br.readLine(); } if(br!=null){ br
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。