当前位置:   article > 正文

2020秋数据结构实验第三题:文本相似度比较 C++实现_如何用c++实现文本差别

如何用c++实现文本差别

实验作者:

本次实验代码由其他两位助教编写,如有疑问请咨询周恒森助教和夏良伟助教。

实验描述

实验代码

  1. /*相似度定义为段落2中每一句与段落1中所有句子对比,取最长公共子串长度,求和后除以段落2总长度*/
  2. #include <stdio.h>
  3. #include<string.h>
  4. #include<fstream>
  5. #include <iostream>
  6. #include<vector>
  7. #include <string>
  8. using namespace std;
  9. #define MAX 100
  10. struct result {
  11. string file[MAX];
  12. int length = 0;
  13. int a[MAX] = { 0 };
  14. };
  15. string getLCS(string str1, string str2) {
  16. vector<vector<int> > record(str1.length(), vector<int>(str2.length()));
  17. int maxLen = 0, maxEnd = 0;
  18. for (int i = 0; i < static_cast<int>(str1.length()); ++i)
  19. for (int j = 0; j < static_cast<int>(str2.length()); ++j) {
  20. if (str1[i] == str2[j]) {
  21. if (i == 0 || j == 0) {
  22. record[i][j] = 1;
  23. }
  24. else {
  25. record[i][j] = record[i - 1][j - 1] + 1;
  26. }
  27. }
  28. else {
  29. record[i][j] = 0;
  30. }
  31. if (record[i][j] > maxLen) {
  32. maxLen = record[i][j];
  33. maxEnd = i; //若记录i,则最后获取LCS时是取str1的子串
  34. }
  35. }
  36. return str1.substr(maxEnd - maxLen + 1, maxLen);
  37. }
  38. result * read(string filename)
  39. {
  40. result *test = new result;
  41. fstream f(filename);
  42. cin.unsetf(ios::skipws);
  43. char c;
  44. vector<char>character;
  45. vector<string> words;
  46. vector<vector<string>>paragraph;
  47. int begin = 0, end = 0, ai = 0;
  48. string word;
  49. while (!f.eof())
  50. {
  51. f.get(c);
  52. if (f.eof()) {
  53. words.push_back(word);
  54. word = "";
  55. break;
  56. }
  57. character.push_back(c);
  58. word += c;
  59. if (c == ' ') {
  60. words.push_back(word);
  61. word = "";
  62. test->a[ai]++;
  63. }
  64. if (c == '.')
  65. {
  66. test->a[ai]++;
  67. ai++;
  68. test->a[ai]--;
  69. }
  70. if (c == '\n') {
  71. words.push_back(word);
  72. word = "";
  73. end = words.size();
  74. vector<string>cmp;
  75. for (int i = begin; i < end; i++)
  76. cmp.push_back(words[i]);
  77. paragraph.push_back(cmp);
  78. begin = end;
  79. }
  80. }
  81. end = words.size();
  82. vector<string>cmp;
  83. for (int i = begin; i < end; i++)
  84. cmp.push_back(words[i]);
  85. paragraph.push_back(cmp);
  86. //for (int i = 0; i < words.size(); i++)
  87. //{
  88. // cout << words[i] << endl;
  89. //}
  90. //for (int i = 0; i < paragraph.size(); i++)
  91. // for (int j = 0; j < paragraph[i].size(); j++)
  92. // cout << paragraph[i][j];
  93. //cout << endl;
  94. //cout << "words:" << words.size() << endl;
  95. //cout << "paragraphs:" << paragraph.size() << endl;
  96. int i = 0, j = 0, sum = 0;
  97. for (j = 0; sum < words.size(); j++)
  98. {
  99. for (i = 0; i < test->a[j]; i++) {
  100. test->file[j] = test->file[j] + words[i + sum];
  101. }
  102. sum = sum + test->a[j];
  103. }
  104. test->length = j;
  105. // cout << j << endl;
  106. return test;
  107. }
  108. int main()
  109. {
  110. result * t1;
  111. result * t2;
  112. string frist = "file1.txt";
  113. string second = "file2.txt";
  114. t1 = new result;
  115. t1 = read(frist);
  116. t2 = new result;
  117. t2 = read(second);
  118. int sum = 0, temp = 0, sum1 = 0;
  119. string tt;
  120. for (int i = 0; i < t2->length; i++)
  121. {
  122. for (int j = 0; j < t1->length; j++)
  123. {
  124. tt = getLCS(t2->file[i], t1->file[j]);
  125. if (temp < tt.length())
  126. temp = tt.length();
  127. }
  128. //cout << tt << tt.length() << temp << endl;
  129. sum = sum + t2->file[i].length();
  130. sum1 = sum1 + temp;
  131. temp = 0;
  132. }
  133. float sim;
  134. sim = (float)sum1 / sum;
  135. cout << "文本总长度" << sum << endl;
  136. cout << "重复文本总长度" << sum1 << endl;
  137. cout << "相似度:" << sim << endl;
  138. getchar();
  139. return 0;
  140. }

实验结果

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/煮酒与君饮/article/detail/941516
推荐阅读
相关标签
  

闽ICP备14008679号