C++ KNN classification algorithm based on feature vector

  • 2020-06-15 09:50:06
  • OfStack

K-Nearest Neighbor (KNN) classification is a theoretically mature method and one of the simplest machine learning algorithms. The idea is: if the majority of the k samples in feature space that are most similar to a given sample (i.e. its nearest neighbors in feature space) belong to a certain category, then the sample also belongs to that category. In the KNN algorithm, the selected neighbors are all correctly classified objects. The method determines the category of the sample to be classified solely from the categories of the nearest one or several samples. Although in principle KNN also relies on the limit theorem, it considers only a small number of neighboring samples when making a class decision. Because it relies mainly on a limited set of neighboring samples rather than on discriminating class domains, KNN is better suited than other methods to sample sets whose class domains overlap or cross considerably.


#include "stdafx.h"

#include <cmath>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <map>
#include <set>
#include <sstream>
#include <string>
#include <vector>
 
using namespace std;
 
// One labeled data point: a class label plus its feature vector.
// For test samples read from test.txt the label starts empty and is
// filled in later by knnProcess.
struct sample
{
 string type;             // class label (empty until classified, for test samples)
 vector<double> features; // feature vector; all samples are expected to share one dimension
};
 
// Read the training set from `file`.
// Line format: "<label> f1 f2 ... fn" — the first token is the class
// label, the remaining tokens are the feature values.
// Exits the program if the file cannot be opened.
void readTrain(vector<sample>& train, const string& file)
{
 ifstream fin(file.c_str());
 if (!fin)
 {
  cerr << "Unable to open the input file: " << file << endl;
  exit(1);
 }

 string line;
 while (getline(fin, line))
 {
  istringstream iss(line); // tokenize one whitespace-separated line
  sample s;
  iss >> s.type;           // first token: class label
  double value = 0.0;
  while (iss >> value)     // remaining tokens: features
  {
   s.features.push_back(value);
  }
  train.push_back(s);
 }
 fin.close();
}
 
// Read the test set from `file`: one unlabeled feature vector per line.
// The sample's type is left empty; knnProcess assigns it later.
// Exits the program if the file cannot be opened.
void readTest(vector<sample>& test, const string& file)
{
 ifstream fin(file.c_str());
 if (!fin)
 {
  cerr << "Unable to open the input file: " << file << endl;
  exit(1);
 }

 string line;
 while (getline(fin, line))
 {
  istringstream iss(line); // tokenize one whitespace-separated line
  sample s;
  double value = 0.0;
  while (iss >> value)     // every token on the line is a feature
  {
   s.features.push_back(value);
  }
  test.push_back(s);
 }
 fin.close();
}
 
// The output is per 1 Vector assignment 1 Type, write result.txt In the 
void writeResult(const vector<sample>& test, const string& file)
{
 ofstream fout(file.c_str());
 if(!fout)
 {
 cerr<<"Unable to write the input file: "<<endl;
 exit(1);
 }
 
 for(vector<sample>::size_type i=0;i!=test.size();++i)
 {
 fout << test[i].type << '\t';
 for(vector<double>::size_type j=0;j!=test[j].features.size();++j)
 {
  fout<<test[i].features[j]<<' ';
 }
 fout<<endl;
 }
}
 
// Classify every test sample by distance-weighted k-nearest-neighbor vote.
// test  : test samples; their .type field is overwritten with the result.
// train : labeled training samples.
// dm    : precomputed distance matrix; dm[i][j] is the distance from
//         test sample i to training sample j (see initDistanceMatrix).
// k     : number of neighbors to vote.
void knnProcess(vector<sample>& test, const vector<sample>& train, const vector<vector<double> >& dm, unsigned int k)
{
 for (vector<sample>::size_type i = 0; i != test.size(); ++i)
 {
 multimap<double, string> dts; // the k nearest training samples to test sample i, keyed by distance
 
 for (vector<double>::size_type j = 0; j != dm[i].size(); ++j)
 {
  if (dts.size() < k) // seed dts with the first k distances unconditionally
  {
  dts.insert(make_pair(dm[i][j], train[j].type)); // multimap keeps keys sorted ascending, so the LARGEST distance sits just before end()
  }
  else
  {
  multimap<double, string>::iterator it = dts.end();
  --it; // it now points at the current worst (largest) of the k kept distances
 
  if (dm[i][j] < it->first) // training sample j is closer than the current worst: replace it
  {
   dts.erase(it);
   dts.insert(make_pair(dm[i][j], train[j].type));
  }
  }
 }
 map<string, double> tds; // accumulated vote weight per class label
 string type = "";
 double weight = 0.0;
 // Vote among the k neighbors: the label with the highest accumulated
 // weight becomes test sample i's class.
 for (multimap<double, string>::const_iterator cit = dts.begin(); cit != dts.end(); ++cit)
 {
  // Unweighted variant (each neighbor counts 1) — kept for reference:
  // ++tds[cit->second];
 
  // Distance-weighted vote: the closer the neighbor, the larger its weight.
  // NOTE(review): a distance of exactly 0 yields +inf here, which makes
  // the exact-match label win outright — confirm that is intended.
  tds[cit->second] += 1.0 / cit->first;
  if (tds[cit->second] > weight)
  {
  weight = tds[cit->second];
  type = cit->second; // remember the best label seen so far
  }
 }
 test[i].type = type;
 }
}
 
// Euclidean (L2) distance between two feature vectors.
// The vectors must have the same dimension; a mismatch is a fatal input
// error and terminates the program.
// BUG FIX: the original printed an error and then fell off the end of
// the function without returning a value — undefined behavior. The
// mismatch branch now exits, consistent with the file-open error
// handling elsewhere in this file.
double euclideanDistance(const std::vector<double>& v1, const std::vector<double>& v2)
{
 if (v1.size() != v2.size())
 {
  std::cerr<<"Unable to get a distance! "<<std::endl;
  std::exit(1);
 }

 double distance = 0.0;
 for (std::vector<double>::size_type i = 0; i != v1.size(); ++i)
 {
  distance += (v1[i] - v2[i]) * (v1[i] - v2[i]);
 }
 return std::sqrt(distance);
}
 
/* Build the distance matrix from the training and test sets.
   Rows correspond to test samples, columns to training samples:
   dm[i][j] = Euclidean distance between test sample i and training
   sample j. */
void initDistanceMatrix(vector<vector<double> >& dm, const vector<sample>& train, const vector<sample>& test)
{
 for (vector<sample>::size_type row = 0; row != test.size(); ++row)
 {
  vector<double> distances;
  distances.reserve(train.size()); // one entry per training sample
  for (vector<sample>::size_type col = 0; col != train.size(); ++col)
  {
   distances.push_back(euclideanDistance(test[row].features, train[col].features));
  }
  dm.push_back(distances);
 }
}
 
// High-level driver: read the training set from file1 and the test set
// from file2, classify every test sample with k-nearest-neighbor, and
// write the labeled results to file3.
void xfxKnn(const string& file1, const string& file2, const string& file3, int k)
{
 vector<sample> train,test;
 // FIX: pass the strings directly. The previous .c_str() calls forced a
 // pointless std::string -> const char* -> temporary std::string round
 // trip for every argument (the callees take const string&).
 readTrain(train, file1);
 readTest(test, file2);
 vector< vector<double> > dm;
 initDistanceMatrix(dm, train, test);
 knnProcess(test, train, dm, k);
 writeResult(test, file3);
}
 
// Entry point: run the k-NN demo on the bundled data files.
int main()
{
 const int k = 5; // number of neighbors used for voting
 xfxKnn("train.txt", "test.txt", "result.txt", k);
 return 0;
}

Related articles: