赞
踩
本次实验代码由其他两位助教编写,如有疑问请咨询周恒森助教和夏良伟助教。
- /*相似度定义为段落2中每一句与段落1中所有句子对比,取最长公共子串长度,求和后除以段落2总长度*/
- #include <stdio.h>
- #include<string.h>
- #include<fstream>
- #include <iostream>
- #include<vector>
- #include <string>
- using namespace std;
-
// Number of slots in each fixed-size array of `result`.
#define MAX 100

// Per-file table filled in by read().
struct result {
    std::string file[MAX];  // file[k]: text assembled for entry k
    int length{0};          // number of entries of `file` that read() filled in
    int a[MAX]{};           // a[k]: per-entry word counter maintained by read()
};
-
// Returns the longest common substring (a contiguous run of characters) that
// occurs in both str1 and str2.
//
// When several common substrings share the maximum length, the one that ends
// earliest in str1 is returned. If the two strings have no character in
// common -- which includes either of them being empty -- the result is the
// empty string; no exception is thrown.
//
// Time is O(|str1| * |str2|); memory is O(|str2|) because each row of the
// table depends only on the row before it.
std::string getLCS(const std::string &str1, const std::string &str2) {
    const std::size_t rows = str1.length();
    const std::size_t cols = str2.length();

    // previous[j + 1] / current[j + 1] hold the length of the common run that
    // ends at str1[i - 1] / str1[i] and str2[j]. Index 0 is a permanent zero
    // so the first row and column need no special case.
    std::vector<std::size_t> previous(cols + 1, 0);
    std::vector<std::size_t> current(cols + 1, 0);

    std::size_t bestLen = 0;  // length of the best run found so far
    std::size_t bestEnd = 0;  // one past the index in str1 where that run ends

    for (std::size_t i = 0; i < rows; ++i) {
        for (std::size_t j = 0; j < cols; ++j) {
            current[j + 1] = (str1[i] == str2[j]) ? previous[j] + 1 : 0;
            // Strict '>' keeps the first maximum encountered.
            if (current[j + 1] > bestLen) {
                bestLen = current[j + 1];
                bestEnd = i + 1;
            }
        }
        previous.swap(current);
    }

    // With no match this is substr(0, 0), which is valid for any string.
    return str1.substr(bestEnd - bestLen, bestLen);
}
-
- result * read(string filename)
- {
- result *test = new result;
- fstream f(filename);
- cin.unsetf(ios::skipws);
- char c;
- vector<char>character;
- vector<string> words;
- vector<vector<string>>paragraph;
- int begin = 0, end = 0, ai = 0;
- string word;
- while (!f.eof())
- {
- f.get(c);
- if (f.eof()) {
- words.push_back(word);
- word = "";
- break;
- }
- character.push_back(c);
- word += c;
- if (c == ' ') {
- words.push_back(word);
- word = "";
- test->a[ai]++;
- }
- if (c == '.')
- {
- test->a[ai]++;
- ai++;
- test->a[ai]--;
- }
- if (c == '\n') {
- words.push_back(word);
- word = "";
- end = words.size();
- vector<string>cmp;
- for (int i = begin; i < end; i++)
- cmp.push_back(words[i]);
- paragraph.push_back(cmp);
- begin = end;
- }
- }
- end = words.size();
- vector<string>cmp;
- for (int i = begin; i < end; i++)
- cmp.push_back(words[i]);
- paragraph.push_back(cmp);
- //for (int i = 0; i < words.size(); i++)
- //{
- // cout << words[i] << endl;
- //}
- //for (int i = 0; i < paragraph.size(); i++)
- // for (int j = 0; j < paragraph[i].size(); j++)
- // cout << paragraph[i][j];
- //cout << endl;
- //cout << "words:" << words.size() << endl;
- //cout << "paragraphs:" << paragraph.size() << endl;
- int i = 0, j = 0, sum = 0;
- for (j = 0; sum < words.size(); j++)
- {
- for (i = 0; i < test->a[j]; i++) {
- test->file[j] = test->file[j] + words[i + sum];
- }
- sum = sum + test->a[j];
- }
- test->length = j;
- // cout << j << endl;
- return test;
- }
-
- int main()
- {
- result * t1;
- result * t2;
- string frist = "file1.txt";
- string second = "file2.txt";
- t1 = new result;
- t1 = read(frist);
- t2 = new result;
- t2 = read(second);
- int sum = 0, temp = 0, sum1 = 0;
- string tt;
- for (int i = 0; i < t2->length; i++)
- {
- for (int j = 0; j < t1->length; j++)
- {
- tt = getLCS(t2->file[i], t1->file[j]);
- if (temp < tt.length())
- temp = tt.length();
- }
- //cout << tt << tt.length() << temp << endl;
- sum = sum + t2->file[i].length();
- sum1 = sum1 + temp;
- temp = 0;
- }
- float sim;
- sim = (float)sum1 / sum;
- cout << "文本总长度" << sum << endl;
- cout << "重复文本总长度" << sum1 << endl;
- cout << "相似度:" << sim << endl;
- getchar();
- return 0;
- }
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。