Python machine learning practice: K-means clustering
- 2020-06-19 10:47:03
- OfStack
This article shares the concrete code of K-means clustering in Python for your reference. The details are as follows.
#-*- coding:utf-8 -*-
#!/usr/bin/python
'''
kMeans: K-means clustering
'''
# Usage examples:
#   K-means clustering:                        import kMeans as KM; KM.kMeansTest()
#   Bisecting K-means clustering:              import kMeans as KM; KM.biKMeansTest()
#   Bisecting K-means on geographic locations: import kMeans as KM; KM.clusterClubs()
from numpy import *
# Import data set
# Load a tab-delimited text file into a list of float feature vectors.
def loadDataSet(fileName):
    """Parse a tab-separated numeric text file.

    fileName: path to a file where each line holds tab-separated numbers.
    Returns a plain Python list of lists of floats (wrap with numpy.mat
    before passing to the clustering routines).
    """
    dataMat = []
    # 'with' guarantees the file handle is closed, even on a parse error.
    with open(fileName) as fr:
        for line in fr:  # one sample per line
            curLine = line.strip().split('\t')
            # list(...) so this is a real list under Python 3 too, where
            # map() returns a lazy iterator instead of a list.
            fltLine = list(map(float, curLine))
            dataMat.append(fltLine)
    return dataMat
# Calculate the distance to the Eucalyptus
# Euclidean (L2) distance between two row vectors.
def distEclud(vecA, vecB):
    """Return the Euclidean distance between row vectors vecA and vecB."""
    diff = vecA - vecB
    return sqrt(sum(multiply(diff, diff)))
# I'm going to build the center of mass initially ( random ) The data set The number of center of mass
# Build k random initial centroids inside the data's bounding box.
def randCent(dataSet, k):
    """Return a (k, n) matrix of centroids drawn uniformly at random
    within each feature's observed [min, max] range in dataSet."""
    numFeatures = shape(dataSet)[1]
    centroids = mat(zeros((k, numFeatures)))
    for col in range(numFeatures):
        lowest = min(dataSet[:, col])
        span = float(max(dataSet[:, col]) - lowest)
        # k uniform samples in [lowest, lowest + span) for this feature.
        centroids[:, col] = mat(lowest + span * random.rand(k, 1))
    return centroids
# simple k Mean clustering algorithm
# The data set Number of center Distance algorithm Initial clustering center algorithm
# Standard (Lloyd's) k-means clustering.
#   dataSet    : (m, n) numpy matrix of samples
#   k          : number of clusters
#   distMeas   : distance function between two row vectors
#   createCent : function producing the k initial centroids
def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    """Cluster dataSet into k groups.

    Returns (centroids, clusterAssment): centroids is a (k, n) matrix;
    clusterAssment is an (m, 2) matrix holding, per sample, the index of
    its assigned centroid and the squared distance to it (for SSE).
    """
    m = shape(dataSet)[0]  # number of samples
    clusterAssment = mat(zeros((m, 2)))  # col 0: cluster index, col 1: squared error
    centroids = createCent(dataSet, k)  # initial cluster centers
    clusterChanged = True
    while clusterChanged:  # iterate until no sample switches cluster
        clusterChanged = False
        for i in range(m):  # assign each sample to its nearest centroid
            minDist = inf; minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI; minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True  # an assignment moved, so keep iterating
            # Store assigned index and squared distance (summed later as SSE).
            clusterAssment[i, :] = minIndex, minDist**2
        # print() function form works on both Python 2 and Python 3.
        print(centroids)
        for cent in range(k):  # recompute each centroid as the mean of its members
            ptsInClust = dataSet[nonzero(clusterAssment[:, 0].A == cent)[0]]
            # Guard: taking the mean of an empty cluster would yield NaN;
            # keep the previous centroid instead.
            if len(ptsInClust) > 0:
                centroids[cent, :] = mean(ptsInClust, axis=0)
    return centroids, clusterAssment
# Smoke test: cluster the bundled testSet.txt into k groups.
def kMeansTest(k=5):
    """Load testSet.txt and run plain k-means with k clusters."""
    dataMat = mat(loadDataSet("testSet.txt"))
    centers, assignments = kMeans(dataMat, k)
# bisecting K-means 2 points K The mean algorithm Overcome local optima
# Bisecting k-means: start from one cluster and repeatedly 2-split the
# cluster whose split gives the lowest total SSE, until k clusters exist.
# Less prone to poor local optima than plain k-means.
def biKmeans(dataSet, k, distMeas=distEclud):
    """Cluster dataSet into k groups with bisecting k-means.

    Returns (centList, clusterAssment): a (k, n) matrix of centroids and
    an (m, 2) matrix of (cluster index, squared error) per sample.
    """
    m = shape(dataSet)[0]  # number of samples
    clusterAssment = mat(zeros((m, 2)))  # col 0: cluster index, col 1: squared error
    # Single initial centroid: the mean of the whole data set.
    centroid0 = mean(dataSet, axis=0).tolist()[0]
    centList = [centroid0]
    for j in range(m):  # initial squared error of each sample to that centroid
        clusterAssment[j, 1] = distMeas(mat(centroid0), dataSet[j, :])**2
    while (len(centList) < k):  # keep bisecting until we have k centroids
        lowestSSE = inf
        for i in range(len(centList)):  # try splitting each existing cluster
            ptsInCurrCluster = dataSet[nonzero(clusterAssment[:, 0].A == i)[0], :]
            # 2-means on the members of cluster i.
            centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distMeas)
            sseSplit = sum(splitClustAss[:, 1])  # SSE of the two new sub-clusters
            # SSE of all samples NOT belonging to the cluster being split.
            sseNotSplit = sum(clusterAssment[nonzero(clusterAssment[:, 0].A != i)[0], 1])
            # print() form works on both Python 2 and Python 3.
            print("sseSplit, and notSplit: %s %s" % (sseSplit, sseNotSplit))
            if (sseSplit + sseNotSplit) < lowestSSE:  # best split so far?
                bestCentToSplit = i  # index of the cluster to split
                bestNewCents = centroidMat  # its two replacement centroids
                bestClustAss = splitClustAss.copy()  # local 0/1 labels + squared errors
                lowestSSE = sseSplit + sseNotSplit
        # Remap the winning split's local 0/1 labels onto global indices:
        # label 1 becomes a brand-new cluster, label 0 keeps the old index.
        bestClustAss[nonzero(bestClustAss[:, 0].A == 1)[0], 0] = len(centList)
        bestClustAss[nonzero(bestClustAss[:, 0].A == 0)[0], 0] = bestCentToSplit
        print('the bestCentToSplit is: %s' % bestCentToSplit)
        print('the len of bestClustAss is: %s' % len(bestClustAss))
        # Replace the split centroid with one child and append the other.
        centList[bestCentToSplit] = bestNewCents[0, :].tolist()[0]
        centList.append(bestNewCents[1, :].tolist()[0])
        # Write the remapped assignments back for the samples that were split.
        clusterAssment[nonzero(clusterAssment[:, 0].A == bestCentToSplit)[0], :] = bestClustAss
    return mat(centList), clusterAssment
# Smoke test: cluster testSet.txt with bisecting k-means.
def biKMeansTest(k=5):
    """Load testSet.txt and run bisecting k-means with k clusters."""
    dataMat = mat(loadDataSet("testSet.txt"))
    centers, assignments = biKmeans(dataMat, k)
#### Location data clustering tests #####
# Use Yahoo's server to convert the address to Longitude and latitude
import urllib
import json
# Geocode a street address + city via Yahoo's PlaceFinder API.
def geoGrab(stAddress, city):
    """Return the decoded JSON geocoding response for the given address.

    NOTE(review): the Yahoo geocoding endpoint has been retired, so this
    will fail at the network level today; kept for reference.
    """
    # Local compat imports: urlencode/urlopen moved into urllib.parse and
    # urllib.request under Python 3.
    try:
        from urllib.parse import urlencode
        from urllib.request import urlopen
    except ImportError:  # Python 2 fallback
        from urllib import urlencode, urlopen
    apiStem = 'http://where.yahooapis.com/geocode?'
    params = {}
    params['flags'] = 'J'  # request a JSON response
    params['appid'] = 'aaa0VN6k'  # app id obtained at http://developer.yahoo.com
    params['location'] = '%s %s' % (stAddress, city)
    url_params = urlencode(params)  # dict -> URL query string
    yahooApi = apiStem + url_params
    print(yahooApi)  # show the URL being fetched
    c = urlopen(yahooApi)
    # Decode the JSON reply; it carries the latitude/longitude.
    return json.loads(c.read())
from time import sleep
# Geocode every address in fileName and write results to places.txt.
def massPlaceFind(fileName):
    """For each tab-separated line (name, address, city) in fileName, look
    up latitude/longitude via geoGrab and write 'line<TAB>lat<TAB>lng' to
    places.txt.  Sleeps 1 s between requests to throttle API calls."""
    # 'with' closes the output file even if a request raises.
    with open('places.txt', 'w') as fw:
        for line in open(fileName).readlines():
            line = line.strip()
            lineArr = line.split('\t')
            # Field 1 is the street address, field 2 the city.
            retDict = geoGrab(lineArr[1], lineArr[2])
            if retDict['ResultSet']['Error'] == 0:
                lat = float(retDict['ResultSet']['Results'][0]['latitude'])
                lng = float(retDict['ResultSet']['Results'][0]['longitude'])
                # print() form works on both Python 2 and Python 3.
                print("%s\t%f\t%f" % (lineArr[0], lat, lng))
                fw.write('%s\t%f\t%f\n' % (line, lat, lng))
            else:
                print("error fetching")
            sleep(1)  # polite 1 s delay between requests
# The distance between two points on the earth's surface Units of miles Input latitude and longitude ( The degree of ) Spherical cosine theorem
# Great-circle distance between two points on the Earth's surface via the
# spherical law of cosines.  Inputs are (longitude, latitude) in degrees;
# the result is in kilometres (6371 km = mean Earth radius).
def distSLC(vecA, vecB):
    """Return the surface distance in km between vecA and vecB, each a
    1x2 matrix of (longitude, latitude) in degrees."""
    latA = sin(vecA[0, 1] * pi / 180)
    latB = sin(vecB[0, 1] * pi / 180)
    a = latA * latB
    b = cos(vecA[0, 1] * pi / 180) * cos(vecB[0, 1] * pi / 180) * \
        cos(pi * (vecB[0, 0] - vecA[0, 0]) / 180)
    return arccos(a + b) * 6371.0
# Location clustering test Drawing visual display
import matplotlib
import matplotlib.pyplot as plt
# Cluster geocoded club locations and plot them over a map image.
def clusterClubs(numClust=5):
    """Read lat/lng pairs from places.txt, cluster them with bisecting
    k-means using spherical distance, and scatter-plot the clusters and
    their centroids on top of Portland.png."""
    coords = []
    for line in open('places.txt').readlines():
        fields = line.split('\t')
        coords.append([float(fields[4]), float(fields[3])])  # (longitude, latitude)
    datMat = mat(coords)  # as a numpy matrix
    # Bisecting k-means with great-circle distance.
    myCentroids, clustAssing = biKmeans(datMat, numClust, distMeas=distSLC)
    fig = plt.figure()
    rect = [0.1, 0.1, 0.8, 0.8]
    scatterMarkers = ['s', 'o', '^', '8', 'p', 'd', 'v', 'h', '>', '<']
    axprops = dict(xticks=[], yticks=[])
    ax0 = fig.add_axes(rect, label='ax0', **axprops)
    imgP = plt.imread('Portland.png')  # background map image
    ax0.imshow(imgP)
    # Transparent overlay axes for the scatter plots.
    ax1 = fig.add_axes(rect, label='ax1', frameon=False)
    for i in range(numClust):
        clusterPts = datMat[nonzero(clustAssing[:, 0].A == i)[0], :]
        markerStyle = scatterMarkers[i % len(scatterMarkers)]  # marker per cluster
        ax1.scatter(clusterPts[:, 0].flatten().A[0],
                    clusterPts[:, 1].flatten().A[0],
                    marker=markerStyle, s=90)
    # Centroids drawn as large '+' marks.
    ax1.scatter(myCentroids[:, 0].flatten().A[0],
                myCentroids[:, 1].flatten().A[0],
                marker='+', s=300)
    plt.show()