Python machine learning practice: K-means clustering
- 2020-06-19 10:47:03
- OfStack
This article shares the concrete code of K-means clustering in Python for your reference. The details are as follows.
#-*- coding:utf-8 -*-
#!/usr/bin/python
'''
kMeans: K-means clustering
'''
# Usage examples:
#   K-means clustering:                        import kMeans as KM; KM.kMeansTest()
#   Bisecting K-means clustering:              import kMeans as KM; KM.biKMeansTest()
#   Bisecting K-means on geographic locations: import kMeans as KM; KM.clusterClubs()
from numpy import *
# Import data set
# Load a tab-delimited text file into a list of float feature vectors.
def loadDataSet(fileName):
    """Parse a tab-separated numeric text file.

    fileName: path to a file where each line holds tab-separated numbers.
    Returns a plain Python list of lists of floats (wrap with numpy.mat
    before passing to the clustering routines).
    """
    dataMat = []
    # 'with' guarantees the file handle is closed, even on a parse error.
    with open(fileName) as fr:
        for line in fr:  # one sample per line
            curLine = line.strip().split('\t')
            # list(...) so this is a real list under Python 3 too, where
            # map() returns a lazy iterator instead of a list.
            fltLine = list(map(float, curLine))
            dataMat.append(fltLine)
    return dataMat
# Calculate the distance to the Eucalyptus
# Euclidean (L2) distance between two row vectors.
def distEclud(vecA, vecB):
    """Return the Euclidean distance between row vectors vecA and vecB."""
    diff = vecA - vecB
    return sqrt(sum(multiply(diff, diff)))
# I'm going to build the center of mass initially ( random ) The data set The number of center of mass
# Build k random initial centroids inside the data's bounding box.
def randCent(dataSet, k):
    """Return a (k, n) matrix of centroids drawn uniformly at random
    within each feature's observed [min, max] range in dataSet."""
    numFeatures = shape(dataSet)[1]
    centroids = mat(zeros((k, numFeatures)))
    for col in range(numFeatures):
        lowest = min(dataSet[:, col])
        span = float(max(dataSet[:, col]) - lowest)
        # k uniform samples in [lowest, lowest + span) for this feature.
        centroids[:, col] = mat(lowest + span * random.rand(k, 1))
    return centroids
# simple k Mean clustering algorithm
# The data set Number of center Distance algorithm Initial clustering center algorithm
# Standard (Lloyd's) k-means clustering.
#   dataSet    : (m, n) numpy matrix of samples
#   k          : number of clusters
#   distMeas   : distance function between two row vectors
#   createCent : function producing the k initial centroids
def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    """Cluster dataSet into k groups.

    Returns (centroids, clusterAssment): centroids is a (k, n) matrix;
    clusterAssment is an (m, 2) matrix holding, per sample, the index of
    its assigned centroid and the squared distance to it (for SSE).
    """
    m = shape(dataSet)[0]  # number of samples
    clusterAssment = mat(zeros((m, 2)))  # col 0: cluster index, col 1: squared error
    centroids = createCent(dataSet, k)  # initial cluster centers
    clusterChanged = True
    while clusterChanged:  # iterate until no sample switches cluster
        clusterChanged = False
        for i in range(m):  # assign each sample to its nearest centroid
            minDist = inf; minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI; minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True  # an assignment moved, so keep iterating
            # Store assigned index and squared distance (summed later as SSE).
            clusterAssment[i, :] = minIndex, minDist**2
        # print() function form works on both Python 2 and Python 3.
        print(centroids)
        for cent in range(k):  # recompute each centroid as the mean of its members
            ptsInClust = dataSet[nonzero(clusterAssment[:, 0].A == cent)[0]]
            # Guard: taking the mean of an empty cluster would yield NaN;
            # keep the previous centroid instead.
            if len(ptsInClust) > 0:
                centroids[cent, :] = mean(ptsInClust, axis=0)
    return centroids, clusterAssment
# Smoke test: cluster the bundled testSet.txt into k groups.
def kMeansTest(k=5):
    """Load testSet.txt and run plain k-means with k clusters."""
    dataMat = mat(loadDataSet("testSet.txt"))
    centers, assignments = kMeans(dataMat, k)
# bisecting K-means 2 points K The mean algorithm Overcome local optima
# Bisecting k-means: start from one cluster and repeatedly 2-split the
# cluster whose split gives the lowest total SSE, until k clusters exist.
# Less prone to poor local optima than plain k-means.
def biKmeans(dataSet, k, distMeas=distEclud):
    """Cluster dataSet into k groups with bisecting k-means.

    Returns (centList, clusterAssment): a (k, n) matrix of centroids and
    an (m, 2) matrix of (cluster index, squared error) per sample.
    """
    m = shape(dataSet)[0]  # number of samples
    clusterAssment = mat(zeros((m, 2)))  # col 0: cluster index, col 1: squared error
    # Single initial centroid: the mean of the whole data set.
    centroid0 = mean(dataSet, axis=0).tolist()[0]
    centList = [centroid0]
    for j in range(m):  # initial squared error of each sample to that centroid
        clusterAssment[j, 1] = distMeas(mat(centroid0), dataSet[j, :])**2
    while (len(centList) < k):  # keep bisecting until we have k centroids
        lowestSSE = inf
        for i in range(len(centList)):  # try splitting each existing cluster
            ptsInCurrCluster = dataSet[nonzero(clusterAssment[:, 0].A == i)[0], :]
            # 2-means on the members of cluster i.
            centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distMeas)
            sseSplit = sum(splitClustAss[:, 1])  # SSE of the two new sub-clusters
            # SSE of all samples NOT belonging to the cluster being split.
            sseNotSplit = sum(clusterAssment[nonzero(clusterAssment[:, 0].A != i)[0], 1])
            # print() form works on both Python 2 and Python 3.
            print("sseSplit, and notSplit: %s %s" % (sseSplit, sseNotSplit))
            if (sseSplit + sseNotSplit) < lowestSSE:  # best split so far?
                bestCentToSplit = i  # index of the cluster to split
                bestNewCents = centroidMat  # its two replacement centroids
                bestClustAss = splitClustAss.copy()  # local 0/1 labels + squared errors
                lowestSSE = sseSplit + sseNotSplit
        # Remap the winning split's local 0/1 labels onto global indices:
        # label 1 becomes a brand-new cluster, label 0 keeps the old index.
        bestClustAss[nonzero(bestClustAss[:, 0].A == 1)[0], 0] = len(centList)
        bestClustAss[nonzero(bestClustAss[:, 0].A == 0)[0], 0] = bestCentToSplit
        print('the bestCentToSplit is: %s' % bestCentToSplit)
        print('the len of bestClustAss is: %s' % len(bestClustAss))
        # Replace the split centroid with one child and append the other.
        centList[bestCentToSplit] = bestNewCents[0, :].tolist()[0]
        centList.append(bestNewCents[1, :].tolist()[0])
        # Write the remapped assignments back for the samples that were split.
        clusterAssment[nonzero(clusterAssment[:, 0].A == bestCentToSplit)[0], :] = bestClustAss
    return mat(centList), clusterAssment
# Smoke test: cluster testSet.txt with bisecting k-means.
def biKMeansTest(k=5):
    """Load testSet.txt and run bisecting k-means with k clusters."""
    dataMat = mat(loadDataSet("testSet.txt"))
    centers, assignments = biKmeans(dataMat, k)
#### Location data clustering tests #####
# Use Yahoo's server to convert the address to Longitude and latitude
import urllib
import json
# Geocode a street address + city via Yahoo's PlaceFinder API.
def geoGrab(stAddress, city):
    """Return the decoded JSON geocoding response for the given address.

    NOTE(review): the Yahoo geocoding endpoint has been retired, so this
    will fail at the network level today; kept for reference.
    """
    # Local compat imports: urlencode/urlopen moved into urllib.parse and
    # urllib.request under Python 3.
    try:
        from urllib.parse import urlencode
        from urllib.request import urlopen
    except ImportError:  # Python 2 fallback
        from urllib import urlencode, urlopen
    apiStem = 'http://where.yahooapis.com/geocode?'
    params = {}
    params['flags'] = 'J'  # request a JSON response
    params['appid'] = 'aaa0VN6k'  # app id obtained at http://developer.yahoo.com
    params['location'] = '%s %s' % (stAddress, city)
    url_params = urlencode(params)  # dict -> URL query string
    yahooApi = apiStem + url_params
    print(yahooApi)  # show the URL being fetched
    c = urlopen(yahooApi)
    # Decode the JSON reply; it carries the latitude/longitude.
    return json.loads(c.read())
from time import sleep
# Geocode every address in fileName and write results to places.txt.
def massPlaceFind(fileName):
    """For each tab-separated line (name, address, city) in fileName, look
    up latitude/longitude via geoGrab and write 'line<TAB>lat<TAB>lng' to
    places.txt.  Sleeps 1 s between requests to throttle API calls."""
    # 'with' closes the output file even if a request raises.
    with open('places.txt', 'w') as fw:
        for line in open(fileName).readlines():
            line = line.strip()
            lineArr = line.split('\t')
            # Field 1 is the street address, field 2 the city.
            retDict = geoGrab(lineArr[1], lineArr[2])
            if retDict['ResultSet']['Error'] == 0:
                lat = float(retDict['ResultSet']['Results'][0]['latitude'])
                lng = float(retDict['ResultSet']['Results'][0]['longitude'])
                # print() form works on both Python 2 and Python 3.
                print("%s\t%f\t%f" % (lineArr[0], lat, lng))
                fw.write('%s\t%f\t%f\n' % (line, lat, lng))
            else:
                print("error fetching")
            sleep(1)  # polite 1 s delay between requests
# The distance between two points on the earth's surface Units of miles Input latitude and longitude ( The degree of ) Spherical cosine theorem
# Great-circle distance between two points on the Earth's surface via the
# spherical law of cosines.  Inputs are (longitude, latitude) in degrees;
# the result is in kilometres (6371 km = mean Earth radius).
def distSLC(vecA, vecB):
    """Return the surface distance in km between vecA and vecB, each a
    1x2 matrix of (longitude, latitude) in degrees."""
    latA = sin(vecA[0, 1] * pi / 180)
    latB = sin(vecB[0, 1] * pi / 180)
    a = latA * latB
    b = cos(vecA[0, 1] * pi / 180) * cos(vecB[0, 1] * pi / 180) * \
        cos(pi * (vecB[0, 0] - vecA[0, 0]) / 180)
    return arccos(a + b) * 6371.0
# Location clustering test Drawing visual display
import matplotlib
import matplotlib.pyplot as plt
# Cluster geocoded club locations and plot them over a map image.
def clusterClubs(numClust=5):
    """Read lat/lng pairs from places.txt, cluster them with bisecting
    k-means using spherical distance, and scatter-plot the clusters and
    their centroids on top of Portland.png."""
    coords = []
    for line in open('places.txt').readlines():
        fields = line.split('\t')
        coords.append([float(fields[4]), float(fields[3])])  # (longitude, latitude)
    datMat = mat(coords)  # as a numpy matrix
    # Bisecting k-means with great-circle distance.
    myCentroids, clustAssing = biKmeans(datMat, numClust, distMeas=distSLC)
    fig = plt.figure()
    rect = [0.1, 0.1, 0.8, 0.8]
    scatterMarkers = ['s', 'o', '^', '8', 'p', 'd', 'v', 'h', '>', '<']
    axprops = dict(xticks=[], yticks=[])
    ax0 = fig.add_axes(rect, label='ax0', **axprops)
    imgP = plt.imread('Portland.png')  # background map image
    ax0.imshow(imgP)
    # Transparent overlay axes for the scatter plots.
    ax1 = fig.add_axes(rect, label='ax1', frameon=False)
    for i in range(numClust):
        clusterPts = datMat[nonzero(clustAssing[:, 0].A == i)[0], :]
        markerStyle = scatterMarkers[i % len(scatterMarkers)]  # marker per cluster
        ax1.scatter(clusterPts[:, 0].flatten().A[0],
                    clusterPts[:, 1].flatten().A[0],
                    marker=markerStyle, s=90)
    # Centroids drawn as large '+' marks.
    ax1.scatter(myCentroids[:, 0].flatten().A[0],
                myCentroids[:, 1].flatten().A[0],
                marker='+', s=300)
    plt.show()