当前位置:   article > 正文

基于字的文本相似度算法——Jacard算法

jacard

一、算法原理

基于字的文本相似度Jacard 算法的原理是:
(1)计算两个文本中字的交集
(2)计算两个文本中字的并集
(3)交集内的字的个数除以并集内的字的个数即为文本相似度值
(4)根据设置的阈值判断是否相似

二、算法的C++实现

这里引用的StringUtil.hpp文件引自:

https://github.com/yanyiwu/cppjieba/blob/master/deps/limonp/StringUtil.hpp

  1. /*
  2. * JaccardSimilarity.hpp
  3. *
  4. * Created: 2016年10月2日
  5. * Author: tang
  6. */
  7. #ifndef SRC_JACCARD_SIMILARITY_HPP_
  8. #define SRC_JACCARD_SIMILARITY_HPP_
  9. #include <algorithm>
  10. #include <iostream>
  11. #include <vector>
  12. #include <set>
  13. #include "StringUtil.hpp"
  14. using namespace std;
  15. class JaccardSimilarity
  16. {
  17. public:
  18. JaccardSimilarity()
  19. {
  20. }
  21. double CalculateTextSimilarity(string &str1,string &str2)
  22. {
  23. vector<uint16_t> words_for_str1;
  24. vector<uint16_t> words_for_str2;
  25. vector<uint16_t>::iterator it;
  26. if(!utf8ToUnicode< vector<uint16_t> >(str1,words_for_str1) ||
  27. !utf8ToUnicode< vector<uint16_t> >(str2,words_for_str2 ) )
  28. {
  29. cout<<"TransCode Error"<<endl;
  30. return 0.;
  31. }
  32. for(it=words_for_str1.begin();it!=words_for_str1.end();)
  33. {
  34. if(codeFilter(*it))
  35. {
  36. ++it;
  37. }
  38. else
  39. {
  40. it=words_for_str1.erase(it);
  41. }
  42. }
  43. for(it=words_for_str2.begin();it!=words_for_str2.end();)
  44. {
  45. if(codeFilter(*it))
  46. {
  47. ++it;
  48. }
  49. else
  50. {
  51. it=words_for_str2.erase(it);
  52. }
  53. }
  54. if(words_for_str1.size()+words_for_str2.size()<1)
  55. return 1.;
  56. vector<uint16_t> words_intersection;
  57. vector<uint16_t> words_union;
  58. std::sort(words_for_str1.begin(),words_for_str1.end());
  59. std::sort(words_for_str2.begin(),words_for_str2.end());
  60. std::set_intersection(words_for_str1.begin(),words_for_str1.end(),
  61. words_for_str2.begin(),words_for_str2.end(),
  62. std::inserter(words_intersection,words_intersection.begin()));
  63. std::set_union(words_for_str1.begin(),words_for_str1.end(),
  64. words_for_str2.begin(),words_for_str2.end(),
  65. std::inserter(words_union,words_union.begin()));
  66. double inter=words_intersection.size();
  67. double wunion=words_union.size();
  68. return inter/wunion;
  69. }
  70. bool codeFilter(int code)
  71. {
  72. if ((code < 0x4e00 || code > 0x9fa5) &&
  73. !(code >= '0' && code <= '9') &&
  74. !(code >= 'a' && code <= 'z') &&
  75. !(code >= 'A' && code <= 'Z'))
  76. return false;
  77. return true;
  78. }
  79. };
  80. #endif /* SRC_JACCARD_SIMILARITY_HPP_ */

三、算法的java实现

  1. import java.util.HashMap;
  2. import java.util.HashSet;
  3. import java.util.Map;
  4. import java.util.Set;
  5. public class JaccardSimilarity{
  6. public JaccardSimilarity() {
  7. }
  8. public boolean codeFilter(int code) {
  9. if ((code < 19968 || code > 40869)
  10. && !(code >= '0' && code <= '9')
  11. && !(code >= 'a' && code <= 'z')
  12. && !(code >= 'A' && code <= 'Z')) {
  13. return false;
  14. }
  15. return true;
  16. }
  17. public double CalculateTextSim(String content, String compareContent) {
  18. if(null == content || null == compareContent)
  19. return 0.0;
  20. Map<String, Integer> cntMap = new HashMap<String, Integer>();
  21. Set<String> cntSet = new HashSet<String>();
  22. Map<String, Integer> cmpCntMap = new HashMap<String, Integer>();
  23. Set<String> cmpCntSet = new HashSet<String>();
  24. for (int i = 0; i != content.length(); i++) {
  25. int k = 0;
  26. if (codeFilter(content.codePointAt(i))) {
  27. if (cntMap.containsKey("" + content.charAt(i))) {
  28. Integer count = cntMap.get("" + content.charAt(i));
  29. count = count + 1;
  30. cntMap.put("" + content.charAt(i), count);
  31. k = count;
  32. } else {
  33. cntMap.put("" + content.charAt(i), new Integer(1));
  34. k = 1;
  35. }
  36. String tmpString = content.charAt(i) + "" + k;
  37. cntSet.add(tmpString);
  38. }
  39. }
  40. for (int i = 0; i != compareContent.length(); i++) {
  41. int k = 0;
  42. if (codeFilter(compareContent.codePointAt(i))) {
  43. if (cmpCntMap.containsKey("" + compareContent.charAt(i))) {
  44. Integer count = cmpCntMap.get("" + compareContent.charAt(i));
  45. count = count + 1;
  46. cmpCntMap.put("" + compareContent.charAt(i), count);
  47. k = count;
  48. } else {
  49. cmpCntMap.put("" + compareContent.charAt(i), new Integer(1));
  50. k = 1;
  51. }
  52. String tmpString = compareContent.charAt(i) + "" + k;
  53. cmpCntSet.add(tmpString);
  54. }
  55. }
  56. Set<String> tmpSet = new HashSet<String>();
  57. tmpSet.addAll(cntSet);
  58. cntSet.retainAll(cmpCntSet);
  59. double intCount = cntSet.size();
  60. tmpSet.addAll(cmpCntSet);
  61. if (tmpSet.size() == 0)
  62. return 0;
  63. double uniCount = tmpSet.size();
  64. return intCount / uniCount;
  65. }
  66. }


声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/Gausst松鼠会/article/detail/241512
推荐阅读
相关标签
  

闽ICP备14008679号