Python Machine Learning in Action: the k-Nearest Neighbor (kNN) Classifier

  • 2020-06-19 10:48:21
  • OfStack

The k-nearest neighbor (kNN) method is a supervised learning technique with a very simple principle. Suppose we have a collection of sample points, each carrying a class label. To decide the category of a new test sample, we compute its distance to every point in the collection, take the k samples closest to the test point, and let their labels vote; the label that receives the most votes is assigned to the test sample.
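The decision rule described above can be written in a few lines of NumPy. The sketch below only illustrates the distance-and-vote idea; the names knn_vote, samples, and test_point are made up for this example, and the full, commented implementation used in the rest of the article follows afterwards.

import numpy as np
from collections import Counter

def knn_vote(test_point, samples, labels, k=3):
    # Euclidean distance from the test point to every training sample
    dists = np.sqrt(((samples - test_point) ** 2).sum(axis=1))
    # indices of the k closest samples
    nearest = dists.argsort()[:k]
    # majority vote over their labels
    return Counter(labels[i] for i in nearest).most_common(1)[0][0]

# tiny made-up data set: two 'A' points near (1, 1) and two 'B' points near the origin
samples = np.array([[1.0, 1.1], [1.0, 1.0], [0.0, 0.0], [0.0, 0.1]])
labels = ['A', 'A', 'B', 'B']
print(knn_vote(np.array([0.9, 1.0]), samples, labels, k=3))  # prints 'A'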

Source code details:


#!/usr/bin/python
# -*- coding: utf-8 -*-

# Test code -- dating-data classification:  import KNN;  KNN.datingClassTest1()  (labels as strings),  KNN.datingClassTest2()  (labels as integers)
# Test code -- handwritten-digit classification:  import KNN;  KNN.handwritingClassTest()

from numpy import *     # scientific computing package
import operator         # operator module (itemgetter, used for sorting)
from os import listdir  # list the contents of a directory (the txt samples under the handwriting folders), like the shell command ls

import matplotlib       # plotting / visualization
import matplotlib.pyplot as plot
 
# Show a 2-D scatter plot of the dating data
def myPlot(x, y, labels):
  fig = plot.figure()        # create a figure window
  ax = fig.add_subplot(111)  # add a single subplot
  #ax.scatter(x, y)
  ax.scatter(x, y, 15.0*array(labels), 15.0*array(labels))  # size/color the points by class label
  ax.axis([-2, 25, -0.2, 2.0])
  plot.xlabel('Percentage of Time Spent Playing Video Games')  # axis labels
  plot.ylabel('Liters of Ice Cream Consumed Per Week')
  plot.show()
   
 
# Create a small fake data set for testing
def createDataSet():
  group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])  # numpy array of sample points
  labels = ['A', 'A', 'B', 'B']                              # label list
  return group, labels
 
# The kNN classification function
def knnClassify0(inX, dataSet, labels, k):
  # inX: the point to classify; dataSet, labels: training data and labels; k: number of nearest neighbors
  dataSetSize = dataSet.shape[0]  # number of rows (samples) in the data set
  # tile(A, (rows, cols)) repeats A the given number of times along each dimension:
  # repeat inX so that it has as many rows as the data set
  diffMat = tile(inX, (dataSetSize, 1)) - dataSet  # difference between the point and every sample
  sqDiffMat = diffMat**2               # squared differences
  sqDistances = sqDiffMat.sum(axis=1)  # sum along each row
  distances = sqDistances**0.5         # square root: Euclidean distance to every sample
  sortedDistIndicies = distances.argsort()  # indices that sort the distances in ascending order (positions, not values)
  # take the k nearest points and count how often each label occurs
  classCount = {}  # dictionary: label -> vote count
  for i in range(k):
    voteIlabel = labels[sortedDistIndicies[i]]  # label of the i-th closest sample
    classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1  # add one vote for that label
  # sort the labels by vote count (the second item of each entry, operator.itemgetter(1)), descending (reverse=True)
  sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True)
  return sortedClassCount[0][0]  # return the label with the most votes
 
 
# Real data processing: read a TXT file and return the data matrix and the label list (labels converted to integers)
def file2matrix(filename):
  fr = open(filename)                  # open the file
  numberOfLines = len(fr.readlines())  # number of lines in the file
  returnMat = zeros((numberOfLines, 3))
  # matrix that will hold the returned data; the number of columns (3 here) depends on the dimensionality of the data
  classLabelVector = []                # corresponding labels
  fr = open(filename)
  index = 0
  for line in fr.readlines():          # for every line
    line = line.strip()                # strip surrounding whitespace (including '\n', '\r', '\t', ' ')
    listFromLine = line.split('\t')    # split the line on tabs ('\t') into a list of fields
    returnMat[index, :] = listFromLine[0:3]         # the first 3 fields are the feature values
    classLabelVector.append(int(listFromLine[-1]))  # the last field is the label, stored as an integer
    index += 1
  return returnMat, classLabelVector
 
 
# Real data processing: read a TXT file and return the data matrix and the label list (labels kept as strings)
def file2matrix2(filename):
  fr = open(filename)                  # open the file
  numberOfLines = len(fr.readlines())  # number of lines in the file
  returnMat = zeros((numberOfLines, 3))
  # matrix that will hold the returned data; the number of columns (3 here) depends on the dimensionality of the data
  classLabelVector = []                # corresponding labels
  fr = open(filename)
  index = 0
  for line in fr.readlines():          # for every line
    line = line.strip()                # strip surrounding whitespace (including '\n', '\r', '\t', ' ')
    listFromLine = line.split('\t')    # split the line on tabs ('\t') into a list of fields
    returnMat[index, :] = listFromLine[0:3]         # the first 3 fields are the feature values
    classLabelVector.append(str(listFromLine[-1]))  # the last field is the label, kept as a string
    index += 1
  return returnMat, classLabelVector
 
 
# Normalize every feature column of the data set so all attributes carry equal weight
def dataAutoNorm(dataSet):
  minVals = dataSet.min(0)    # column-wise minimum (per attribute)
  maxVals = dataSet.max(0)    # column-wise maximum
  ranges = maxVals - minVals  # range of each attribute
  normDataSet = zeros(shape(dataSet))  # initialize the output array
  m = dataSet.shape[0]                 # number of samples (rows)
  normDataSet = dataSet - tile(minVals, (m, 1))    # tile minVals to m rows (one column per attribute) and subtract
  normDataSet = normDataSet/tile(ranges, (m, 1))   # element-wise division; for true matrix division numpy provides linalg.solve(matA, matB)
  return normDataSet, ranges, minVals  # return the normalized array plus each attribute's range and minimum
 
# kNN classification test on the dating data
# labels are strings
def datingClassTest1(test_ret=0.1):
  hoRatio = test_ret        # fraction of samples held out for testing; the rest is the training set
  datingDataMat, datingLabels = file2matrix2('datingTestSet.txt')  # load the data set
  normMat, ranges, minVals = dataAutoNorm(datingDataMat)
  m = normMat.shape[0]          # total number of samples
  numTestVecs = int(m*hoRatio)  # number of test samples
  errorCount = 0.0              # error counter
  for i in range(numTestVecs):  # for every test sample
    # kNN classification: the test sample, the remaining data as training set, the corresponding labels, k = 3 nearest neighbors
    classifierResult = knnClassify0(normMat[i,:], normMat[numTestVecs:m,:], datingLabels[numTestVecs:m], 3)
    print "Predicted label: %s,\tactual label: %s" % (classifierResult, datingLabels[i])
    if (classifierResult != datingLabels[i]): errorCount += 1.0
  print "Total errors: %d" % errorCount
  print "Total test samples: %d" % numTestVecs
  print "Overall error rate: %f" % (errorCount/float(numTestVecs))
 
# kNN classification test on the dating data
# labels are integers (int)
def datingClassTest2(test_ret=0.1):
  hoRatio = test_ret        # fraction of samples held out for testing; the rest is the training set
  datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')  # load the data set
  normMat, ranges, minVals = dataAutoNorm(datingDataMat)
  m = normMat.shape[0]          # total number of samples
  numTestVecs = int(m*hoRatio)  # number of test samples
  errorCount = 0.0              # error counter
  for i in range(numTestVecs):  # for every test sample
    # kNN classification: the test sample, the remaining data as training set, the corresponding labels, k = 3 nearest neighbors
    classifierResult = knnClassify0(normMat[i,:], normMat[numTestVecs:m,:], datingLabels[numTestVecs:m], 3)
    print "Predicted label: %d,  actual label: %d" % (classifierResult, datingLabels[i])
    if (classifierResult != datingLabels[i]): errorCount += 1.0
  print "Total errors: %d" % errorCount
  print "Total test samples: %d" % numTestVecs
  print "Overall error rate: %f" % (errorCount/float(numTestVecs))
 
 
# Predict how much the user would like a person from attribute values typed at the prompt (possibly buggy?)
def classifyPerson():
  resultList = ['dislike', 'so-so', 'like very much']
  percent = float(raw_input("Percentage of time spent playing video games: "))
  mile   = float(raw_input("Frequent flyer miles earned per year: "))
  ice    = float(raw_input("Liters of ice cream consumed per week: "))
  datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')  # load the data set
  normMat, ranges, minVals  = dataAutoNorm(datingDataMat)
  # normalize the new test sample
  print ranges, minVals
  testSampArry = array([mile, percent, ice])          # the sample entered by the user
  testSampArryNorm = (testSampArry - minVals)/ranges  # normalized sample
  print testSampArry, testSampArryNorm
  # classify
  classifierResult = knnClassify0(testSampArryNorm, normMat, datingLabels, 3)
  print classifierResult
  print "You will probably like this person: ", resultList[classifierResult-1]
   
 
# Convert a 32x32-pixel handwritten-digit image into a 1x1024 vector
def img2vector(filename):
  returnVect = zeros((1, 1024))  # create the empty return vector
  fr = open(filename)            # open the file
  for i in range(32):            # for each of the 32 lines
    lineStr = fr.readline()      # one line of the image
    for j in range(32):          # each of the 32 characters in the line
      returnVect[0, 32*i+j] = int(lineStr[j])
  return returnVect
 
 
# Handwritten-digit recognition with kNN; every digit image is stored as a 32x32 matrix of 0s and 1s
def handwritingClassTest(k=3):
  # build the training set
  hwLabels = []                                 # training labels
  trainingFileList = listdir('trainingDigits')  # list of all txt files in the training folder
  m = len(trainingFileList)                     # total number of training samples
  trainingMat = zeros((m, 1024))                # training data matrix
  for i in range(m):
    fileNameStr = trainingFileList[i]         # one training file, e.g. 0_0.txt, 0_1.txt, 0_2.txt
    fileStr = fileNameStr.split('.')[0]       # split on '.': [0] is the file name, [1] is the txt extension
    classNumStr = int(fileStr.split('_')[0])  # split on '_': [0] is the digit the file represents, i.e. the label
    hwLabels.append(classNumStr)              # training sample label
    trainingMat[i,:] = img2vector('trainingDigits/%s' % fileNameStr)  # training sample data

  # build the test set
  testFileList = listdir('testDigits')  # test data folder
  errorCount = 0.0                      # error counter
  mTest = len(testFileList)             # total number of test samples
  for i in range(mTest):
    fileNameStr = testFileList[i]             # one test file
    fileStr = fileNameStr.split('.')[0]       # file name without the extension
    classNumStr = int(fileStr.split('_')[0])  # the true label
    vectorUnderTest = img2vector('testDigits/%s' % fileNameStr)                 # test sample data
    classifierResult = knnClassify0(vectorUnderTest, trainingMat, hwLabels, k)  # classify
    print "kNN predicted label: %d,  actual label: %d" % (classifierResult, classNumStr)
    if (classifierResult != classNumStr): errorCount += 1.0
  print "\nTotal errors: %d" % errorCount
  print "\nOverall error rate: %f" % (errorCount/float(mTest))
