Python machine learning in action: the k-nearest-neighbor (kNN) classifier
- 2020-06-19 10:48:21
- OfStack
The k-nearest-neighbor method is a simple supervised learning technique. Suppose we have a set of sample points, each labeled with a known class. To classify a test sample, we compute its distance to every sample point, select the k samples closest to it, and tally their class labels by voting; the label receiving the most votes is assigned to the test sample.
Source code details:
#-*- coding:utf-8 -*-
#!/usr/bin/python
# Test code — dating-data classification: import KNN; KNN.datingClassTest1() (string labels); KNN.datingClassTest2() (integer labels)
# Test code — handwritten-digit classification: import KNN; KNN.handwritingClassTest()
from numpy import * # Scientific computing package
import operator # Operator module
from os import listdir # Gets the contents of the specified directory (sample under the handwriting font folder) txt ) Type command line ls
import matplotlib # Drawing visualization operation
import matplotlib.pyplot as plot
def myPlot(x, y, labels):
    """Show a 2-D scatter plot of the dating data.

    x, y: feature-value sequences (one entry per sample).
    labels: numeric class label per sample; both marker size and color
        scale with the label so the classes separate visually.
    Opens a matplotlib window; returns nothing.
    """
    fig = plot.figure()        # one window
    ax = fig.add_subplot(111)  # single subplot
    # Size and color are both 15 * label, giving per-class styling.
    ax.scatter(x, y, 15.0 * array(labels), 15.0 * array(labels))
    ax.axis([-2, 25, -0.2, 2.0])
    plot.xlabel('Percentage of Time Spent Playing Video Games')
    plot.ylabel('Liters of Ice Cream Consumed Per Week')
    plot.show()
def createDataSet():
    """Return a tiny hand-made data set for smoke-testing the classifier.

    Returns (samples, tags): a (4, 2) numpy array of points and their
    class labels — two points near (1, 1) labeled 'A', two near the
    origin labeled 'B'.
    """
    samples = array([[1.0, 1.1],
                     [1.0, 1.0],
                     [0, 0],
                     [0, 0.1]])
    tags = ['A', 'A', 'B', 'B']
    return samples, tags
def knnClassify0(inX, dataSet, labels, k):
    """Classify inX by majority vote among its k nearest neighbors.

    inX: 1-D array-like, the point to classify.
    dataSet: (m, d) numpy array of training points.
    labels: length-m sequence of class labels, aligned with dataSet rows.
    k: number of nearest neighbors that vote.
    Returns the label with the most votes among the k closest points.
    """
    dataSetSize = dataSet.shape[0]  # number of training samples (rows)
    # Euclidean distance from inX to every training point:
    # tile repeats inX into an (m, d) matrix so the subtraction is row-wise.
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    sqDistances = (diffMat ** 2).sum(axis=1)  # squared distance per row
    distances = sqDistances ** 0.5
    sortedDistIndicies = distances.argsort()  # indices, nearest first
    # Tally the votes of the k nearest labels.
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # Py3 fix: dict.iteritems() no longer exists — use items().
    # Sort (count, descending) and return the majority label.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and int labels.

    Each line holds 3 numeric feature columns followed by an integer
    class label, all separated by tabs.
    filename: path to the text file.
    Returns (returnMat, classLabelVector): an (n, 3) float array of
    features and a list of int labels, one per line.
    """
    # Fix: the original opened the file twice and never closed it;
    # read once under a context manager instead.
    with open(filename) as fr:
        lines = fr.readlines()
    # One row per line, 3 feature columns (adjust for other data shapes).
    returnMat = zeros((len(lines), 3))
    classLabelVector = []
    for index, line in enumerate(lines):
        listFromLine = line.strip().split('\t')  # strip newline, split on tabs
        returnMat[index, :] = listFromLine[0:3]  # first 3 fields = features
        classLabelVector.append(int(listFromLine[-1]))  # last field = int label
    return returnMat, classLabelVector
def file2matrix2(filename):
    """Parse a tab-separated data file into a feature matrix and string labels.

    Same format as file2matrix — 3 numeric feature columns plus a label —
    except the label column is kept as a string instead of converted to int.
    filename: path to the text file.
    Returns (returnMat, classLabelVector): an (n, 3) float array of
    features and a list of str labels, one per line.
    """
    # Fix: the original opened the file twice and never closed it;
    # read once under a context manager instead.
    with open(filename) as fr:
        lines = fr.readlines()
    # One row per line, 3 feature columns (adjust for other data shapes).
    returnMat = zeros((len(lines), 3))
    classLabelVector = []
    for index, line in enumerate(lines):
        listFromLine = line.strip().split('\t')  # strip newline, split on tabs
        returnMat[index, :] = listFromLine[0:3]  # first 3 fields = features
        classLabelVector.append(str(listFromLine[-1]))  # last field = str label
    return returnMat, classLabelVector
def dataAutoNorm(dataSet):
    """Min-max normalize each column of dataSet to [0, 1].

    Equalizes the influence of features with different scales:
    normalized = (value - column_min) / column_range.
    dataSet: (m, d) numpy array.
    Returns (normDataSet, ranges, minVals): the normalized array, the
    per-column value range, and the per-column minimum.
    """
    minVals = dataSet.min(0)  # column-wise minimum
    maxVals = dataSet.max(0)  # column-wise maximum
    ranges = maxVals - minVals
    m = dataSet.shape[0]  # number of samples
    # Fix: removed a dead zeros() allocation that was immediately overwritten.
    # tile expands the per-column stats to the full (m, d) shape for
    # element-wise subtraction/division.
    normDataSet = (dataSet - tile(minVals, (m, 1))) / tile(ranges, (m, 1))
    return normDataSet, ranges, minVals
def datingClassTest1(test_ret=0.1):
    """Hold-out test of the kNN classifier on the dating data (string labels).

    test_ret: fraction of samples held out for testing; the remaining
        rows serve as the training set.
    Reads 'datingTestSet.txt', normalizes features, classifies each test
    sample with k=3, and prints per-sample results plus the error rate.
    """
    hoRatio = test_ret
    datingDataMat, datingLabels = file2matrix2('datingTestSet.txt')
    normMat, ranges, minVals = dataAutoNorm(datingDataMat)
    m = normMat.shape[0]                  # total sample count
    numTestVecs = int(m * hoRatio)        # first numTestVecs rows = test set
    errorCount = 0.0
    for i in range(numTestVecs):
        # Train on the remaining rows; vote among the 3 nearest neighbors.
        # Py3 fix: print statement -> print() function throughout.
        classifierResult = knnClassify0(normMat[i, :], normMat[numTestVecs:m, :],
                                        datingLabels[numTestVecs:m], 3)
        print(" The classification results : %s,\t Actual label : %s" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print(" Total error : %d" % errorCount)
    print(" The total number of test : %d" % numTestVecs)
    print(" The total error rate : %f" % (errorCount / float(numTestVecs)))
def datingClassTest2(test_ret=0.1):
    """Hold-out test of the kNN classifier on the dating data (int labels).

    test_ret: fraction of samples held out for testing; the remaining
        rows serve as the training set.
    Reads 'datingTestSet2.txt', normalizes features, classifies each test
    sample with k=3, and prints per-sample results plus the error rate.
    """
    hoRatio = test_ret
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = dataAutoNorm(datingDataMat)
    m = normMat.shape[0]                  # total sample count
    numTestVecs = int(m * hoRatio)        # first numTestVecs rows = test set
    errorCount = 0.0
    for i in range(numTestVecs):
        # Train on the remaining rows; vote among the 3 nearest neighbors.
        # Py3 fix: print statement -> print() function throughout.
        classifierResult = knnClassify0(normMat[i, :], normMat[numTestVecs:m, :],
                                        datingLabels[numTestVecs:m], 3)
        print(" The classification results : %d, Actual label : %d" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print(" Total error : %d" % errorCount)
    print(" The total number of test : %d" % numTestVecs)
    print(" The total error rate : %f" % (errorCount / float(numTestVecs)))
def classifyPerson():
    """Interactively classify a person from three typed-in feature values.

    Prompts for game-time percentage, flight miles, and ice-cream intake,
    normalizes the sample with the training set's min/range, classifies it
    with k=3, and prints the predicted preference category.
    Labels in 'datingTestSet2.txt' are assumed to be 1..3 — TODO confirm.
    """
    resultList = [' hate ', '1 As the ', ' Like it very much ']
    # Py3 fix: raw_input() was renamed input().
    percent = float(input(" Percentage of time spent playing games: "))
    mile = float(input(" Number of miles flown per year: "))
    ice = float(input(" Amount of ice cream consumed per week: "))
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = dataAutoNorm(datingDataMat)
    print(ranges, minVals)
    # Normalize the new sample with the TRAINING min/range before classifying;
    # feature order must match the file's columns (miles, percent, ice).
    testSampArry = array([mile, percent, ice])
    testSampArryNorm = (testSampArry - minVals) / ranges
    print(testSampArry, testSampArryNorm)
    classifierResult = knnClassify0(testSampArryNorm, normMat, datingLabels, 3)
    print(classifierResult)
    # Labels are 1-based, so shift down to index the result list.
    print(" He's not your type: ", resultList[classifierResult - 1])
def img2vector(filename):
    """Flatten a 32x32 text image of '0'/'1' characters into a (1, 1024) vector.

    filename: path to a text file with 32 lines of 32 digit characters.
    Returns a (1, 1024) numpy array of the parsed values.
    """
    returnVect = zeros((1, 1024))
    # Fix: 'with' closes the file handle the original leaked.
    with open(filename) as fr:
        for i in range(32):          # each of the 32 rows
            lineStr = fr.readline()
            for j in range(32):      # each character in the row
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect
def handwritingClassTest(k=3):
    """kNN test on handwritten digits stored as 32x32 0/1 text images.

    Trains on every file under trainingDigits/ and tests on testDigits/.
    File names look like '3_12.txt': the digit before '_' is the true label.
    k: number of neighbors that vote (default 3).
    Prints per-sample results and the overall error rate.
    """
    # Build the training matrix: one 1x1024 row per image.
    hwLabels = []
    trainingFileList = listdir('trainingDigits')
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    for i in range(m):
        fileNameStr = trainingFileList[i]            # e.g. 0_13.txt
        fileStr = fileNameStr.split('.')[0]          # strip the .txt extension
        classNumStr = int(fileStr.split('_')[0])     # digit encoded in the name
        hwLabels.append(classNumStr)
        trainingMat[i, :] = img2vector('trainingDigits/%s' % fileNameStr)
    # Classify each test image against the full training set.
    # Py3 fix: print statement -> print() function throughout.
    testFileList = listdir('testDigits')
    errorCount = 0.0
    mTest = len(testFileList)
    for i in range(mTest):
        fileNameStr = testFileList[i]
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])     # true label from file name
        vectorUnderTest = img2vector('testDigits/%s' % fileNameStr)
        classifierResult = knnClassify0(vectorUnderTest, trainingMat, hwLabels, k)
        print("KNN Category labels : %d, Actual label : %d" % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    print("\n Total number of errors : %d" % errorCount)
    print("\n Total error ratio : %f" % (errorCount / float(mTest)))