- 0.0 1.1 A
- 1.0 1.0 A
- 2.0 1.0 B
- 0.5 0.5 A
- 2.5 0.5 B
- 0.0 0.0 A
- 1.0 0.0 A
- 2.0 0.0 B
- 3.0 0.0 B
- 0.0 -1.0 A
- 1.0 -1.0 A
- 2.0 -1.0 B
- #include<iostream>
- #include<map>
- #include<vector>
- #include<stdio.h>
- #include<cmath>
- #include<cstdlib>
- #include<algorithm>
- #include<fstream>
- using namespace std;
// Type of a class label; the constructor reads one per training row.
- typedef char tLabel;
// Numeric type of a single feature value.
- typedef double tData;
// (training-row index, distance) pair, as stored in KNN::map_index_dis.
- typedef pair<int,double> PAIR;
// Number of feature columns per sample.
- const int colLen = 2;
// Number of training rows the KNN constructor reads from data.txt.
- const int rowLen = 12;
// Input stream the KNN constructor opens on data.txt.
- ifstream fin;
// Output stream; declared here but not referenced by the code shown.
- ofstream fout;
- class KNN
- {
- private:
- tData dataSet[rowLen][colLen];
- tLabel labels[rowLen];
- tData testData[colLen];
- int k;
- map<int,double> map_index_dis;
- map<tLabel,int> map_label_freq;
- double get_distance(tData *d1,tData *d2);
- public:
- KNN(int k);
- void get_all_distance();
- void get_max_freq_label();
- struct CmpByValue
- {
- bool operator() (const PAIR& lhs,const PAIR& rhs)
- {
- return lhs.second < rhs.second;
- }
- };
- };
- KNN::KNN(int k)
- {
- this->k = k;
- fin.open("data.txt");
- if(!fin)
- {
- cout<<"can not open the file data.txt"<<endl;
- exit(1);
- }
- /* input the dataSet */
- for(int i=0;i<rowLen;i++)
- {
- for(int j=0;j<colLen;j++)
- {
- fin>>dataSet[i][j];
- }
- fin>>labels[i];
- }
- cout<<"please input the test data :"<<endl;
- /* inuput the test data */
- for(int i=0;i<colLen;i++)
- cin>>testData[i];
- }
- /*
- * calculate the distance between test data and dataSet[i]
- */
- double KNN:: get_distance(tData *d1,tData *d2)
- {
- double sum = 0;
- for(int i=0;i<colLen;i++)
- {
- sum += pow( (d1[i]-d2[i]) , 2 );
- }
- // cout<<"the sum is = "<<sum<<endl;
- return sqrt(sum);
- }
- /*
- * calculate all the distance between test data and each training data
- */
- void KNN:: get_all_distance()
- {
- double distance;
- int i;
- for(i=0;i<rowLen;i++)
- {
- distance = get_distance(dataSet[i],testData);
- //<key,value> => <i,distance>
- map_index_dis[i] = distance;
- }
- //traverse the map to print the index and distance
- map<int,double>::const_iterator it = map_index_dis.begin();
- while(it!=map_index_dis.end())
- {
- cout<<"index = "<<it->first<<" distance = "<<it->second<<endl;
- it++;
- }
- }
- /*
- * check which label the test data belongs to to classify the test data
- */
- void KNN:: get_max_freq_label()
- {
- //transform the map_index_dis to vec_index_dis
- vector<PAIR> vec_index_dis( map_index_dis.begin(),map_index_dis.end() );
- //sort the vec_index_dis by distance from low to high to get the nearest data
- sort(vec_index_dis.begin(),vec_index_dis.end(),CmpByValue());
- for(int i=0;i<k;i++)
- {
- cout<<"the index = "<<vec_index_dis[i].first<<" the distance = "<<vec_index_dis[i].second<<" the label = "<<labels[vec_index_dis[i].first]<<" the coordinate ( "<<dataSet[ vec_index_dis[i].first ][0]<<","<<dataSet[ vec_index_dis[i].first ][1]<<" )"<<endl;
- //calculate the count of each label
- map_label_freq[ labels[ vec_index_dis[i].first ] ]++;
- }
- map<tLabel,int>::const_iterator map_it = map_label_freq.begin();
- tLabel label;
- int max_freq = 0;
- //find the most frequent label
- while( map_it != map_label_freq.end() )
- {
- if( map_it->second > max_freq )
- {
- max_freq = map_it->second;
- label = map_it->first;
- }
- map_it++;
- }
- cout<<"The test data belongs to the "<<label<<" label"<<endl;
- }
- int main()
- {
- int k ;
- cout<<"please input the k value : "<<endl;
- cin>>k;
- KNN knn(k);
- knn.get_all_distance();
- knn.get_max_freq_label();
- system("pause");
- return 0;
- }

公式为:newValue = (oldValue - min) / (max - min)
- /* add the get_error_rate function */
- #include<iostream>
- #include<map>
- #include<vector>
- #include<stdio.h>
- #include<cmath>
- #include<cstdlib>
- #include<algorithm>
- #include<fstream>
- using namespace std;
// Type of a class label; the constructor reads one per training row.
- typedef string tLabel;
// Numeric type of a single feature value.
- typedef double tData;
// (training-row index, distance) pair, as stored in KNN::map_index_dis.
- typedef pair<int,double> PAIR;
// Capacity of the feature dimension of KNN::dataSet and KNN::testData.
- const int MaxColLen = 10;
// Capacity of the row dimension of KNN::dataSet and KNN::labels.
- const int MaxRowLen = 10000;
// Input stream the KNN constructor opens on the training file.
- ifstream fin;
// Output stream the KNN constructor opens on result.txt.
- ofstream fout;
- class KNN
- {
- private:
- tData dataSet[MaxRowLen][MaxColLen];
- tLabel labels[MaxRowLen];
- tData testData[MaxColLen];
- int rowLen;
- int colLen;
- int k;
- int test_data_num;
- map<int,double> map_index_dis;
- map<tLabel,int> map_label_freq;
- double get_distance(tData *d1,tData *d2);
- public:
- KNN(int k , int rowLen , int colLen , char *filename);
- void get_all_distance();
- tLabel get_max_freq_label();
- void auto_norm_data();
- void get_error_rate();
- struct CmpByValue
- {
- bool operator() (const PAIR& lhs,const PAIR& rhs)
- {
- return lhs.second < rhs.second;
- }
- };
- ~KNN();
- };
- KNN::~KNN()
- {
- fin.close();
- fout.close();
- map_index_dis.clear();
- map_label_freq.clear();
- }
- KNN::KNN(int k , int row ,int col , char *filename)
- {
- this->rowLen = row;
- this->colLen = col;
- this->k = k;
- test_data_num = 0;
- fin.open(filename);
- fout.open("result.txt");
- if( !fin || !fout )
- {
- cout<<"can not open the file"<<endl;
- exit(0);
- }
- for(int i=0;i<rowLen;i++)
- {
- for(int j=0;j<colLen;j++)
- {
- fin>>dataSet[i][j];
- fout<<dataSet[i][j]<<" ";
- }
- fin>>labels[i];
- fout<<labels[i]<<endl;
- }
- }
- void KNN:: get_error_rate()
- {
- int i,j,count = 0;
- tLabel label;
- cout<<"please input the number of test data : "<<endl;
- cin>>test_data_num;
- for(i=0;i<test_data_num;i++)
- {
- for(j=0;j<colLen;j++)
- {
- testData[j] = dataSet[i][j];
- }
- get_all_distance();
- label = get_max_freq_label();
- if( label!=labels[i] )
- count++;
- map_index_dis.clear();
- map_label_freq.clear();
- }
- cout<<"the error rate is = "<<(double)count/(double)test_data_num<<endl;
- }
- double KNN:: get_distance(tData *d1,tData *d2)
- {
- double sum = 0;
- for(int i=0;i<colLen;i++)
- {
- sum += pow( (d1[i]-d2[i]) , 2 );
- }
- //cout<<"the sum is = "<<sum<<endl;
- return sqrt(sum);
- }
- void KNN:: get_all_distance()
- {
- double distance;
- int i;
- for(i=test_data_num;i<rowLen;i++)
- {
- distance = get_distance(dataSet[i],testData);
- map_index_dis[i] = distance;
- }
- // map<int,double>::const_iterator it = map_index_dis.begin();
- // while(it!=map_index_dis.end())
- // {
- // cout<<"index = "<<it->first<<" distance = "<<it->second<<endl;
- // it++;
- // }
- }
- tLabel KNN:: get_max_freq_label()
- {
- vector<PAIR> vec_index_dis( map_index_dis.begin(),map_index_dis.end() );
- sort(vec_index_dis.begin(),vec_index_dis.end(),CmpByValue());
- for(int i=0;i<k;i++)
- {
- cout<<"the index = "<<vec_index_dis[i].first<<" the distance = "<<vec_index_dis[i].second<<" the label = "<<labels[ vec_index_dis[i].first ]<<" the coordinate ( ";
- int j;
- for(j=0;j<colLen-1;j++)
- {
- cout<<dataSet[ vec_index_dis[i].first ][j]<<",";
- }
- cout<<dataSet[ vec_index_dis[i].first ][j]<<" )"<<endl;
- map_label_freq[ labels[ vec_index_dis[i].first ] ]++;
- }
- map<tLabel,int>::const_iterator map_it = map_label_freq.begin();
- tLabel label;
- int max_freq = 0;
- while( map_it != map_label_freq.end() )
- {
- if( map_it->second > max_freq )
- {
- max_freq = map_it->second;
- label = map_it->first;
- }
- map_it++;
- }
- cout<<"The test data belongs to the "<<label<<" label"<<endl;
- return label;
- }
- void KNN::auto_norm_data()
- {
- tData maxa[colLen] ;
- tData mina[colLen] ;
- tData range[colLen] ;
- int i,j;
- for(i=0;i<colLen;i++)
- {
- maxa[i] = max(dataSet[0][i],dataSet[1][i]);
- mina[i] = min(dataSet[0][i],dataSet[1][i]);
- }
- for(i=2;i<rowLen;i++)
- {
- for(j=0;j<colLen;j++)
- {
- if( dataSet[i][j]>maxa[j] )
- {
- maxa[j] = dataSet[i][j];
- }
- else if( dataSet[i][j]<mina[j] )
- {
- mina[j] = dataSet[i][j];
- }
- }
- }
- for(i=0;i<colLen;i++)
- {
- range[i] = maxa[i] - mina[i] ;
- //normalize the test data set
- testData[i] = ( testData[i] - mina[i] )/range[i] ;
- }
- //normalize the training data set
- for(i=0;i<rowLen;i++)
- {
- for(j=0;j<colLen;j++)
- {
- dataSet[i][j] = ( dataSet[i][j] - mina[j] )/range[j];
- }
- }
- }
- int main(int argc , char** argv)
- {
- int k,row,col;
- char *filename;
- if( argc!=5 )
- {
- cout<<"The input should be like this : ./a.out k row col filename"<<endl;
- exit(1);
- }
- k = atoi(argv[1]);
- row = atoi(argv[2]);
- col = atoi(argv[3]);
- filename = argv[4];
- KNN knn(k,row,col,filename);
- knn.auto_norm_data();
- knn.get_error_rate();
- // knn.get_all_distance();
- // knn.get_max_freq_label();
- return 0;
- }

- target:
- g++ KNN_2.cc
- ./a.out 7 1000 3 datingTestSet.txt
- /* add the auto_norm_data */
- #include<iostream>
- #include<map>
- #include<vector>
- #include<stdio.h>
- #include<cmath>
- #include<cstdlib>
- #include<algorithm>
- #include<fstream>
- using namespace std;
// Type of a class label; the constructor reads one per training row.
- typedef string tLabel;
// Numeric type of a single feature value.
- typedef double tData;
// (training-row index, distance) pair, as stored in KNN::map_index_dis.
- typedef pair<int,double> PAIR;
// Capacity of the feature dimension of KNN::dataSet and KNN::testData.
- const int MaxColLen = 10;
// Capacity of the row dimension of KNN::dataSet and KNN::labels.
- const int MaxRowLen = 10000;
// Input stream the KNN constructor opens on the training file.
- ifstream fin;
// Output stream the KNN constructor opens on result.txt.
- ofstream fout;
- class KNN
- {
- private:
- tData dataSet[MaxRowLen][MaxColLen];
- tLabel labels[MaxRowLen];
- tData testData[MaxColLen];
- int rowLen;
- int colLen;
- int k;
- map<int,double> map_index_dis;
- map<tLabel,int> map_label_freq;
- double get_distance(tData *d1,tData *d2);
- public:
- KNN(int k , int rowLen , int colLen , char *filename);
- void get_all_distance();
- tLabel get_max_freq_label();
- void auto_norm_data();
- struct CmpByValue
- {
- bool operator() (const PAIR& lhs,const PAIR& rhs)
- {
- return lhs.second < rhs.second;
- }
- };
- ~KNN();
- };
- KNN::~KNN()
- {
- fin.close();
- fout.close();
- map_index_dis.clear();
- map_label_freq.clear();
- }
- KNN::KNN(int k , int row ,int col , char *filename)
- {
- this->rowLen = row;
- this->colLen = col;
- this->k = k;
- fin.open(filename);
- fout.open("result.txt");
- if( !fin || !fout )
- {
- cout<<"can not open the file"<<endl;
- exit(0);
- }
- //input the training data set
- for(int i=0;i<rowLen;i++)
- {
- for(int j=0;j<colLen;j++)
- {
- fin>>dataSet[i][j];
- fout<<dataSet[i][j]<<" ";
- }
- fin>>labels[i];
- fout<<labels[i]<<endl;
- }
- //input the test data
- cout<<"frequent flier miles earned per year?";
- cin>>testData[0];
- cout<<"percentage of time spent playing video games?";
- cin>>testData[1];
- cout<<"liters of ice cream consumed per year?";
- cin>>testData[2];
- }
- double KNN:: get_distance(tData *d1,tData *d2)
- {
- double sum = 0;
- for(int i=0;i<colLen;i++)
- {
- sum += pow( (d1[i]-d2[i]) , 2 );
- }
- return sqrt(sum);
- }
- void KNN:: get_all_distance()
- {
- double distance;
- int i;
- for(i=0;i<rowLen;i++)
- {
- distance = get_distance(dataSet[i],testData);
- map_index_dis[i] = distance;
- }
- // map<int,double>::const_iterator it = map_index_dis.begin();
- // while(it!=map_index_dis.end())
- // {
- // cout<<"index = "<<it->first<<" distance = "<<it->second<<endl;
- // it++;
- // }
- }
- tLabel KNN:: get_max_freq_label()
- {
- vector<PAIR> vec_index_dis( map_index_dis.begin(),map_index_dis.end() );
- sort(vec_index_dis.begin(),vec_index_dis.end(),CmpByValue());
- for(int i=0;i<k;i++)
- {
- /*
- cout<<"the index = "<<vec_index_dis[i].first<<" the distance = "<<vec_index_dis[i].second<<" the label = "<<labels[ vec_index_dis[i].first ]<<" the coordinate ( ";
- int j;
- for(j=0;j<colLen-1;j++)
- {
- cout<<dataSet[ vec_index_dis[i].first ][j]<<",";
- }
- cout<<dataSet[ vec_index_dis[i].first ][j]<<" )"<<endl;
- */
- map_label_freq[ labels[ vec_index_dis[i].first ] ]++;
- }
- map<tLabel,int>::const_iterator map_it = map_label_freq.begin();
- tLabel label;
- int max_freq = 0;
- /*traverse the map_label_freq to get the most frequent label*/
- while( map_it != map_label_freq.end() )
- {
- if( map_it->second > max_freq )
- {
- max_freq = map_it->second;
- label = map_it->first;
- }
- map_it++;
- }
- return label;
- }
- /*
- * normalize the training data set
- */
- void KNN::auto_norm_data()
- {
- tData maxa[colLen] ;
- tData mina[colLen] ;
- tData range[colLen] ;
- int i,j;
- for(i=0;i<colLen;i++)
- {
- maxa[i] = max(dataSet[0][i],dataSet[1][i]);
- mina[i] = min(dataSet[0][i],dataSet[1][i]);
- }
- for(i=2;i<rowLen;i++)
- {
- for(j=0;j<colLen;j++)
- {
- if( dataSet[i][j]>maxa[j] )
- {
- maxa[j] = dataSet[i][j];
- }
- else if( dataSet[i][j]<mina[j] )
- {
- mina[j] = dataSet[i][j];
- }
- }
- }
- for(i=0;i<colLen;i++)
- {
- range[i] = maxa[i] - mina[i] ;
- //normalize the test data set
- testData[i] = ( testData[i] - mina[i] )/range[i] ;
- }
- //normalize the training data set
- for(i=0;i<rowLen;i++)
- {
- for(j=0;j<colLen;j++)
- {
- dataSet[i][j] = ( dataSet[i][j] - mina[j] )/range[j];
- }
- }
- }
- int main(int argc , char** argv)
- {
- int k,row,col;
- char *filename;
- if( argc!=5 )
- {
- cout<<"The input should be like this : ./a.out k row col filename"<<endl;
- exit(1);
- }
- k = atoi(argv[1]);
- row = atoi(argv[2]);
- col = atoi(argv[3]);
- filename = argv[4];
- KNN knn(k,row,col,filename);
- knn.auto_norm_data();
- knn.get_all_distance();
- cout<<"You will probably like this person : "<<knn.get_max_freq_label()<<endl;
- return 0;
- }

- target:
- g++ KNN_1.cc
- ./a.out 7 1000 3 datingTestSet.txt
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。