当前位置:   article > 正文

数据挖掘实验(Apriori,fpgrowth)

数据挖掘实验(Apriori,fpgrowth)

Apriori:这里做了个小优化,比如abcdeadcef自连接出的新项集abcdef,可以用abcde的位置和f的位置取交集,这样第n项集的计算可以用n-1项集的信息和数字本身的位置信息计算出来,只需要保存第n-1项集的位置信息就可以提速

  1. #include<iostream>
  2. #include<bits/stdc++.h>
  3. #include<cstring>
  4. #include<set>
  5. #include<map>
  6. #include <unordered_set>
  7. #include<string>
  8. #include<vector>
  9. #include<windows.h>
  10. #include<time.h>
  11. #define DATA_NAME R"(D:\Cpan\Download\.vscode\retail.dat)"
  12. #define N 100000
  13. #define MAX_RECORD_NUM 88162 + 1
  14. #define MAX_ITEMID_NUM 16470 + 1
  15. using namespace std;
  16. int recordNum = 0;
  17. int mp[MAX_ITEMID_NUM];
  18. vector<int> items[N];
  19. double support;
  20. void fastRead(){
  21. char c;
  22. bool lastIsNum = false;
  23. uint16_t num;
  24. FILE* fp = fopen(DATA_NAME, "r");
  25. while (~(c = fgetc(fp))) {
  26. if (c >= '0' && c <= '9') {
  27. if(lastIsNum){
  28. num *= 10;
  29. num += c - '0';
  30. }
  31. else{
  32. num = c - '0';
  33. }
  34. lastIsNum = true;
  35. }
  36. else {
  37. if (lastIsNum){
  38. items[num].push_back(recordNum);
  39. mp[num]++;
  40. }
  41. if (c == '\n'){
  42. recordNum++;
  43. }
  44. lastIsNum = false;
  45. }
  46. }
  47. if (lastIsNum) {
  48. items[num].push_back(recordNum);
  49. mp[num]++;
  50. }
  51. if (c != '\n') {
  52. recordNum++;
  53. }
  54. fclose(fp);
  55. }
  56. vector<int> same_number(vector<int> &tmp1,vector<int> &tmp2){
  57. int ite=0;
  58. vector<int> res={};
  59. for(int i=0;i<tmp1.size();++i){
  60. while(tmp2[ite]<tmp1[i] && ite<tmp2.size()-1) ite++;
  61. if(tmp2[ite]==tmp1[i]){
  62. //cout<<tmp2[ite]<<"*"<<tmp1[i]<<" "<<ite<<" "<<i<<endl;
  63. res.push_back(tmp1[i]);
  64. }
  65. }
  66. return res;
  67. }
  68. vector<vector<int>> v;
  69. vector<vector<int>> v2;
  70. vector<string> s,s2;
  71. int total;
  72. void cal(){
  73. total=0;
  74. s.clear();
  75. v.clear();
  76. for(int i=0;i<=MAX_ITEMID_NUM;++i){
  77. if(mp[i]>=recordNum*support*0.01){
  78. string tri=to_string(i);
  79. int tmp_len=tri.size();
  80. for(int j=1;j<=5-tmp_len;j++){
  81. tri="0"+tri;
  82. }
  83. s.push_back(tri);
  84. v.push_back(items[i]);
  85. }
  86. }
  87. int now_set_level=1;
  88. while(1){
  89. total+=s.size();
  90. cout<<"共有"<<s.size()<<"个频繁"<<now_set_level<<"项集\n";
  91. // for(auto t:s){cout<<t<<" ";}
  92. // cout<<endl;
  93. v2.clear();
  94. s2.clear();
  95. for(int i=0;i<s.size();++i){
  96. for(int j=i+1;j<s.size();++j){
  97. if(s[i].substr(0,s[i].size()-5)==s[j].substr(0,s[i].size()-5)){
  98. int num_end=stol(s[j].substr(s[i].size()-5,5));
  99. vector<int> tmp1=v[i];
  100. vector<int> tmp2=items[num_end];
  101. vector<int> same_vector=same_number(tmp1,tmp2);
  102. if(same_vector.size()>=recordNum*support*0.01){
  103. v2.push_back(same_vector);
  104. string new_tmp_string=s[i]+s[j].substr(s[i].size()-5,5);
  105. s2.push_back(new_tmp_string);
  106. }
  107. }
  108. //cout<<(s[i].substr(s[i].size()-5,5))<<endl;
  109. }
  110. }
  111. v=v2;
  112. s=s2;
  113. if(v.size()==0){
  114. break;
  115. }
  116. now_set_level+=1;
  117. }
  118. cout<<"共有"<<total<<"个频繁项集\n";
  119. }
  120. signed main(){
  121. cout<<"请输入置信度(单位%)\n";
  122. cin>>support;
  123. fastRead();
  124. long starttime = GetTickCount();
  125. cal();
  126. long endtime = GetTickCount();
  127. long time_cost = endtime - starttime;
  128. cout << "timecost " << time_cost << " ms" << endl;
  129. cout<<"请输入操作\n";
  130. cout<<" 1:更改置信度\n";
  131. int oper;
  132. cin>>oper;
  133. if(oper==1){
  134. cout<<"请输入置信度\n";
  135. cin>>support;
  136. cal();
  137. }
  138. }

Fpgrowth的算法,我没有递归建树,只建了一次树,所以速度比完整的fpgrowth要慢(当知道还要建其他树的时候实在不知道我这屎山代码怎么在继续写下去,所以直接后面直接暴力了),建了一次树后直接拿链过去爆搜子集计数了,速度主要慢在我的链最长有10左右,fpgrowth最后剪完只有3-4,通过链获取子集的复杂度是2^{len}链的长度,所以会慢,如果有一些方法能把无用节点去掉,这种做法也会快,(以后有缘再回来改吧

  1. #include<iostream>
  2. #include<bits/stdc++.h>
  3. #include<cstring>
  4. #include<set>
  5. #include<map>
  6. #include <unordered_set>
  7. #include<string>
  8. #include<vector>
  9. #include<windows.h>
  10. #include<time.h>
  11. #define DATA_NAME R"(D:\Cpan\Download\.vscode\retail.dat)"
  12. #define N 100000
  13. #define MAX_RECORD_NUM 88162 + 1
  14. #define MAX_ITEMID_NUM 16470 + 1
  15. using namespace std;
  16. int recordNum = 0;
  17. int mp[MAX_ITEMID_NUM];
  18. vector<int> items[N];
  19. double support;
  20. void fastRead(){
  21. char c;
  22. bool lastIsNum = false;
  23. uint16_t num;
  24. FILE* fp = fopen(DATA_NAME, "r");
  25. while (~(c = fgetc(fp))) {
  26. if (c >= '0' && c <= '9') {
  27. if(lastIsNum){
  28. num *= 10;
  29. num += c - '0';
  30. }
  31. else{
  32. num = c - '0';
  33. }
  34. lastIsNum = true;
  35. }
  36. else {
  37. if (lastIsNum){
  38. items[recordNum].push_back(num);
  39. mp[num]++;
  40. }
  41. if (c == '\n'){
  42. recordNum++;
  43. }
  44. lastIsNum = false;
  45. }
  46. }
  47. if (lastIsNum) {
  48. items[recordNum].push_back(num);
  49. mp[num]++;
  50. }
  51. if (c != '\n') {
  52. recordNum++;
  53. }
  54. fclose(fp);
  55. }
  56. int node_number;
  57. vector<vector<int>> v;
  58. vector<int> head_table[MAX_ITEMID_NUM];
  59. int head_table_back[10*MAX_ITEMID_NUM];
  60. vector<pair<int,int>> fp_tree[10*MAX_ITEMID_NUM];
  61. pair<int,int> fp_tree_value[10*MAX_ITEMID_NUM];
  62. bool cmp(int &a,int &b){
  63. return mp[a]>mp[b];
  64. }
  65. void build(int son,int fa,vector<int> &value,int index){
  66. if(index==value.size()) return;
  67. bool exi=0;
  68. for(auto t:fp_tree[son]){
  69. if(t.first!=fa && fp_tree_value[t.first].first==value[index]){
  70. fp_tree_value[t.first].second+=1;
  71. exi=1;
  72. build(t.first,son,value,index+1);
  73. }
  74. }
  75. if(exi==0){
  76. node_number+=1;
  77. head_table[value[index]].push_back(node_number);
  78. head_table_back[node_number]=value[index];
  79. fp_tree_value[node_number]={value[index],1};
  80. fp_tree[son].push_back({node_number,1});
  81. fp_tree[node_number].push_back({son,-1});
  82. build(node_number,son,value,index+1);
  83. }
  84. }
  85. int tmp_dp[10*MAX_ITEMID_NUM];
  86. void back(int number,vector<int> &res_chain){
  87. if(number==0 && fp_tree_value[number].second==0) return ;
  88. for(auto t:fp_tree[number]){
  89. if(t.second==-1 && fp_tree_value[t.first].second!=0){
  90. // cout<<"节点 "<<t.first<<" ";
  91. // cout<<"以前是"<<tmp_dp[t.first]<<" ";
  92. res_chain.push_back({head_table_back[t.first]});
  93. //cout<<"现在是"<<tmp_dp[t.first]<<"\n";
  94. back(t.first,res_chain);
  95. }
  96. }
  97. }
  98. int res[MAX_ITEMID_NUM];
  99. vector<int> number_item;
  100. void dfs_son(vector<vector<int>> &res_son,vector<int> &value,vector<int> tmp,int index,int now_number){
  101. if(index==value.size()){
  102. if(tmp.size()!=0){
  103. res_son.push_back(tmp);
  104. }
  105. return ;
  106. }
  107. if(now_number<=3){
  108. tmp.push_back(value[index]);
  109. dfs_son(res_son,value,tmp,index+1,now_number+1);
  110. tmp.pop_back();
  111. dfs_son(res_son,value,tmp,index+1,now_number);
  112. }
  113. else{
  114. dfs_son(res_son,value,tmp,index+1,now_number);
  115. }
  116. }
  117. signed main(){
  118. cout<<"请输入置信度(单位%)\n";
  119. cin>>support;
  120. fastRead();
  121. for(int i=0;i<recordNum;++i){
  122. vector<int> tmp;
  123. for(auto t:items[i]){
  124. if(mp[t]>=recordNum*support*0.01){
  125. tmp.push_back(t);
  126. }
  127. }
  128. if(tmp.size()==0) continue;
  129. else{
  130. sort(tmp.begin(),tmp.end(),cmp);
  131. // for(auto t:tmp){cout<<t<<" ";}
  132. // cout<<endl<<"**************************\n";
  133. v.push_back(tmp);
  134. }
  135. }
  136. // cout<<v.size()<<endl;
  137. long starttime = GetTickCount();
  138. for(auto t:v){
  139. build(0,-1,t,0);
  140. }
  141. for(int i=0;i<MAX_ITEMID_NUM;++i){
  142. if(mp[i]>=recordNum*support*0.01){
  143. res[1]+=1;
  144. number_item.push_back(i);
  145. //cout<<i<<"*\n";
  146. }
  147. }
  148. for(auto t:number_item){
  149. for(int i=0;i<MAX_ITEMID_NUM;++i){
  150. tmp_dp[i]=0;
  151. }
  152. map<vector<int>,int> map_vec;
  153. for(auto j:head_table[t]){
  154. //cout<<j<<"*";
  155. vector<int> vt={};
  156. int value=fp_tree_value[j].second;
  157. back(j,vt);
  158. if(!vt.size()) continue;
  159. // vector<vector<int>> vs;
  160. // for(int k=0;k<(1<<(vt.size()));k++){
  161. // vector<int> resson;
  162. // for(int o=0;o<=vt.size()-1;o++){
  163. // if((k>>o)&1){
  164. // resson.push_back(vt[o]);
  165. // }
  166. // }
  167. // if(resson.size()){
  168. // vs.push_back(resson);
  169. // }
  170. // }
  171. // for(auto t:vs){
  172. // map_vec[t]+=value;
  173. // }
  174. vector<vector<int> > res_son={};
  175. vector<int> tmp2={};
  176. dfs_son(res_son,vt,tmp2,0,1);
  177. for(auto t:res_son){
  178. map_vec[t]+=value;
  179. }
  180. }
  181. for(auto t:map_vec){
  182. if(t.second>=recordNum*support*0.01){
  183. res[t.first.size()+1]+=1;
  184. }
  185. }
  186. }
  187. long endtime = GetTickCount();
  188. long time_cost = endtime - starttime;
  189. cout << "timecost " << time_cost << " ms" << endl;
  190. int total=0;
  191. for(int i=1;i<=MAX_ITEMID_NUM;++i){
  192. if(res[i]!=0){
  193. total+=res[i];
  194. cout<<"共有"<<res[i]<<"个频繁"<<i<<"项集\n";
  195. }
  196. else{
  197. cout<<"共有"<<total<<"个频繁项集\n";
  198. break;
  199. }
  200. }
  201. }

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/知新_RL/article/detail/474502
推荐阅读
相关标签
  

闽ICP备14008679号