import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.tokenizer.StandardTokenizer;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.safety.Whitelist;

import java.math.BigInteger;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class MySimHash {

    private String tokens;          // the original text
    private BigInteger strSimHash;  // the SimHash fingerprint of the text
    private int hashbits = 64;      // number of bits in the fingerprint

    public MySimHash() {
    }

    public MySimHash(String tokens) {
        this.tokens = tokens;
        this.strSimHash = this.simHash();
    }

    private MySimHash(String tokens, int hashbits) {
        this.tokens = tokens;
        this.hashbits = hashbits;
        this.strSimHash = this.simHash();
    }

    /**
     * Strip HTML tags and whitespace from the input.
     *
     * @param content raw text, possibly containing HTML
     * @return cleaned, lower-cased text
     */
    private String cleanResume(String content) {
        // If the input is HTML, this removes every HTML tag.
        content = Jsoup.clean(content, Whitelist.none());
        content = StringUtils.lowerCase(content);
        String[] strings = {" ", "\n", "\r", "\t", "\\r", "\\n", "\\t", " "};
        for (String s : strings) {
            content = content.replaceAll(s, "");
        }
        return content;
    }

    /**
     * Compute the SimHash fingerprint of the whole string.
     */
    private BigInteger simHash() {
        tokens = cleanResume(tokens); // cleanResume removes special characters
        int[] v = new int[this.hashbits];
        // 1. Segment the string into terms.
        List<Term> termList = StandardTokenizer.segment(this.tokens);
        // Extra handling of the terms: weight by part of speech, drop punctuation,
        // and cap over-frequent words.
        Map<String, Integer> weightOfNature = new HashMap<String, Integer>(); // weight per part of speech
        weightOfNature.put("n", 2);  // nouns get a weight of 2
        Map<String, String> stopNatures = new HashMap<String, String>();      // parts of speech to drop, e.g. punctuation
        stopNatures.put("w", "");
        int overCount = 5; // threshold for over-frequent words
        Map<String, Integer> wordCount = new HashMap<String, Integer>();
        for (Term term : termList) {
            String word = term.word;                // the term text
            String nature = term.nature.toString(); // its part of speech
            // Skip words that occur too often.
            if (wordCount.containsKey(word)) {
                int count = wordCount.get(word);
                if (count > overCount) {
                    continue;
                }
                wordCount.put(word, count + 1);
            } else {
                wordCount.put(word, 1);
            }
            // Skip stop parts of speech.
            if (stopNatures.containsKey(nature)) {
                continue;
            }
            // 2. Hash every term to a fixed-length value, e.g. a 64-bit integer.
            BigInteger t = this.hash(word);
            for (int i = 0; i < this.hashbits; i++) {
                BigInteger bitmask = new BigInteger("1").shiftLeft(i);
                // 3. Keep an integer array of length 64 (or whatever fingerprint size is used).
                //    For each term hash, walk its bits: where the bit is 1 add the weight,
                //    where it is 0 subtract the weight, until every term hash has been processed.
                int weight = 1; // apply the part-of-speech weight
                if (weightOfNature.containsKey(nature)) {
                    weight = weightOfNature.get(nature);
                }
                if (t.and(bitmask).signum() != 0) {
                    // Accumulate the weighted feature vector of the whole document.
                    v[i] += weight;
                } else {
                    v[i] -= weight;
                }
            }
        }
        // 4. Collapse the vector back into bits: positions with a non-negative sum become 1.
        BigInteger fingerprint = new BigInteger("0");
        for (int i = 0; i < this.hashbits; i++) {
            if (v[i] >= 0) {
                fingerprint = fingerprint.add(new BigInteger("1").shiftLeft(i));
            }
        }
        return fingerprint;
    }

    /**
     * Hash a single term.
     */
    private BigInteger hash(String source) {
        if (source == null || source.length() == 0) {
            return new BigInteger("0");
        } else {
            // Very short words weaken the hash, so pad them first.
            while (source.length() < 3) {
                source = source + source.charAt(0);
            }
            char[] sourceArray = source.toCharArray();
            BigInteger x = BigInteger.valueOf(((long) sourceArray[0]) << 7);
            BigInteger m = new BigInteger("1000003");
            BigInteger mask = new BigInteger("2").pow(this.hashbits).subtract(new BigInteger("1"));
            for (char item : sourceArray) {
                BigInteger temp = BigInteger.valueOf((long) item);
                x = x.multiply(m).xor(temp).and(mask);
            }
            x = x.xor(new BigInteger(String.valueOf(source.length())));
            if (x.equals(new BigInteger("-1"))) {
                x = new BigInteger("-2");
            }
            return x;
        }
    }

    /**
     * Hamming distance between two fingerprints; the smaller it is, the more similar the texts.
     */
    private int hammingDistance(MySimHash other) {
        BigInteger m = new BigInteger("1").shiftLeft(this.hashbits).subtract(new BigInteger("1"));
        BigInteger x = this.strSimHash.xor(other.strSimHash).and(m);
        int tot = 0;
        while (x.signum() != 0) {
            tot += 1;
            x = x.and(x.subtract(new BigInteger("1")));
        }
        return tot;
    }

    /**
     * Similarity derived from the Hamming distance: 1 - distance / hashbits.
     */
    public double getSemblance(MySimHash s2) {
        double i = (double) this.hammingDistance(s2);
        return 1 - i / this.hashbits;
    }

    public List<Term> getSegList(String sen) {
        return StandardTokenizer.segment(sen);
    }

    /**
     * Word-overlap similarity: the share of terms of s1 that also appear in s2.
     */
    public double getSimiliarity(String s1, String s2) {
        // Optionally keep only the first clause of each string, e.g. s1.split(",")[0] and s2.split("、")[0].
        System.out.println(s1);
        System.out.println(s2);
        List<Term> ls1 = getSegList(s1);
        List<Term> ls2 = getSegList(s2);
        double count = 0;
        double similarity;
        // Compare against at most the first ls1.size() terms of s2;
        // Math.min guards against an IndexOutOfBoundsException when s2 is shorter.
        List<Term> ls3 = ls2.subList(0, Math.min(ls1.size(), ls2.size()));
        for (int i = 0; i < ls1.size(); i++) {
            if (ls3.contains(ls1.get(i))) {
                count += 1;
            }
        }
        System.out.println("count:" + count);
        System.out.println("length:" + ls1.size() + "+" + ls2.size());
        // similarity = count / (Math.log(ls1.size()) + Math.log(ls2.size()));
        similarity = count / ls1.size();
        return similarity;
    }

    public static void main(String[] args) {
        String s1 = "风电装机大幅下降";
        String s2 = "产品价格竞争加剧";
        String s3 = "下游需求不及预期,产能进度不及预期";
        String s4 = "下游需求低于预期,价格战恶化,核心器件自产率提升进度不及预期";
        long l1 = System.currentTimeMillis();
        MySimHash test = new MySimHash();
        System.out.println(test.getSimiliarity(s3, s4));
        long l2 = System.currentTimeMillis();
        System.out.println(l2 - l1); // elapsed milliseconds
        System.out.println("======================================");
    }
}
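Note that the main method above only exercises the word-overlap path (getSimiliarity); the SimHash fingerprint itself is built by the MySimHash(String) constructor and compared with getSemblance, which maps a Hamming distance d onto 1 - d / 64. A minimal sketch of that path, reusing the two risk-disclosure strings from main (the exact score depends on the HanLP dictionaries in use, and the MySimHashDemo class name is just for illustration):

public class MySimHashDemo {
    public static void main(String[] args) {
        String s3 = "下游需求不及预期,产能进度不及预期";
        String s4 = "下游需求低于预期,价格战恶化,核心器件自产率提升进度不及预期";
        // Each constructor call segments the text and builds a 64-bit fingerprint.
        MySimHash h1 = new MySimHash(s3);
        MySimHash h2 = new MySimHash(s4);
        // getSemblance returns 1 - hammingDistance / 64, so identical texts score 1.0.
        System.out.println(h1.getSemblance(h2));
    }
}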
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.dictionary.CustomDictionary;
import org.jsoup.Jsoup;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;

public class SimilarityUtil {

    static {
        // Register domain terms so the segmenter keeps them whole.
        CustomDictionary.add("子类");
        CustomDictionary.add("父类");
    }

    private SimilarityUtil() {
    }

    /**
     * Similarity of two sentences, computed as the cosine of their word-frequency vectors.
     *
     * @param sentence1
     * @param sentence2
     * @return similarity in [0, 1]
     */
    public static double getSimilarity(String sentence1, String sentence2) {
        List<String> sent1Words = getSplitWords(sentence1);
        System.out.println(sent1Words);
        List<String> sent2Words = getSplitWords(sentence2);
        System.out.println(sent2Words);
        List<String> allWords = mergeList(sent1Words, sent2Words);
        int[] statistic1 = statistic(allWords, sent1Words);
        int[] statistic2 = statistic(allWords, sent2Words);
        double dividend = 0;
        double divisor1 = 0;
        double divisor2 = 0;
        for (int i = 0; i < statistic1.length; i++) {
            dividend += statistic1[i] * statistic2[i];
            divisor1 += Math.pow(statistic1[i], 2);
            divisor2 += Math.pow(statistic2[i], 2);
        }
        return dividend / (Math.sqrt(divisor1) * Math.sqrt(divisor2));
    }

    /**
     * Count how often each word of the merged vocabulary occurs in one sentence.
     */
    private static int[] statistic(List<String> allWords, List<String> sentWords) {
        int[] result = new int[allWords.size()];
        for (int i = 0; i < allWords.size(); i++) {
            result[i] = Collections.frequency(sentWords, allWords.get(i));
        }
        return result;
    }

    /**
     * Merge two word lists into a deduplicated vocabulary.
     */
    private static List<String> mergeList(List<String> list1, List<String> list2) {
        List<String> result = new ArrayList<>();
        result.addAll(list1);
        result.addAll(list2);
        return result.stream().distinct().collect(Collectors.toList());
    }

    private static List<String> getSplitWords(String sentence) {
        // Strip HTML tags.
        sentence = Jsoup.parse(sentence.replace(" ", "")).body().text();
        // Punctuation marks come back as separate terms; filter them out.
        return HanLP.segment(sentence).stream()
                .map(a -> a.word)
                .filter(s -> !"`~!@#$^&*()=|{}':;',\\[\\].<>/?~!@#¥……&*()——|{}【】‘;:”“'。,、? ".contains(s))
                .collect(Collectors.toList());
    }
}
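SimilarityUtil takes a different route from SimHash: it builds a merged vocabulary, turns each sentence into a word-frequency vector over that vocabulary, and returns dot(v1, v2) / (|v1| * |v2|), i.e. the cosine of the angle between the two vectors. A small usage sketch (the printed value depends on how HanLP segments the sentences; the SimilarityUtilDemo class name is just for illustration):

public class SimilarityUtilDemo {
    public static void main(String[] args) {
        String a = "下游需求不及预期,产能进度不及预期";
        String b = "下游需求低于预期,价格战恶化,核心器件自产率提升进度不及预期";
        // Segments both sentences, builds frequency vectors over the merged
        // vocabulary, and prints their cosine similarity in [0, 1].
        System.out.println(SimilarityUtil.getSimilarity(a, b));
    }
}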