Implementing the k-means clustering algorithm in Python
- 2020-06-19 10:41:45
- OfStack
Clustering is a kind of unsupervised learning. Placing similar objects in the same cluster is somewhat like automatic classification. The more similar objects are in the cluster and the more different objects are among the clusters, the better the clustering effect will be.
1. The k-means clustering algorithm
k-means clustering divides the data into k clusters, each described by its centroid — the center of all the points in the cluster. First, k initial points are chosen at random as centroids, and each data point is assigned to the nearest one. The centroid of each cluster is then updated to the mean of all points assigned to it. The data set is then re-partitioned, and the process repeats until the cluster assignments no longer change.
Pseudocode:
Randomly create k cluster centers of mass
While the cluster assignment of any point changes:
For each data point in the data set:
For each center of mass:
Calculate the distance from the data point to the centroid
Assign the data point to the cluster of the nearest centroid
For each cluster, the mean of all points in the cluster is calculated and the mean is taken as the center of mass
python implementation
import numpy as np
import matplotlib.pyplot as plt
def loadDataSet(fileName):
    """Load a tab-delimited numeric data set from a text file.

    Parameters:
        fileName: path to a file where each line holds tab-separated numbers.

    Returns:
        np.ndarray of shape (n_samples, n_features) with dtype float32.
    """
    with open(fileName) as f:
        # Iterate the file object directly instead of f.readlines(),
        # which would materialize the whole file in memory first.
        dataMat = [line.strip().split('\t') for line in f]
    return np.array(dataMat).astype(np.float32)
def distEclud(vecA, vecB):
    """Return the Euclidean (L2) distance between vectors vecA and vecB."""
    diff = vecA - vecB
    # np.square is element-wise even for np.matrix operands.
    return np.sqrt(np.sum(np.square(diff)))
def randCent(dataSet, k):
    """Create k random centroids inside the bounding box of dataSet.

    Each coordinate is drawn uniformly between the minimum and maximum
    value that dimension takes in the data.
    """
    nDim = np.shape(dataSet)[1]
    centroids = np.mat(np.ones((k, nDim)))
    for dim in range(nDim):
        lo = min(dataSet[:, dim])
        span = max(dataSet[:, dim]) - lo
        centroids[:, dim] = lo + span * np.random.rand(k, 1)
    return centroids
def kMeans(dataSet,k,distMeans = distEclud,createCent = randCent):
m = np.shape(dataSet)[0]
clusterAssment = np.mat(np.zeros((m,2)))
centroids = createCent(dataSet,k)
clusterChanged = True
while clusterChanged:
clusterChanged = False
for i in range(m):
minDist = np.inf
minIndex = -1
for j in range(k):
distJI = distMeans(dataSet[i,:],centroids[j,:])
if distJI < minDist:
minDist = distJI
minIndex = j
if clusterAssment[i,0] != minIndex:
clusterChanged = True
clusterAssment[i,:] = minIndex,minDist**2
for cent in range(k):
ptsInClust = dataSet[np.nonzero(clusterAssment[:,0].A == cent)[0]]
centroids[cent,:] = np.mean(ptsInClust,axis = 0)
return centroids,clusterAssment
# Demo: cluster testSet.txt into 4 groups and plot the assignments,
# colouring each point by its cluster index.
dataset = loadDataSet('testSet.txt')
centroids, assignments = kMeans(dataset, 4)
figure = plt.figure(0)
axes = figure.add_subplot(111)
axes.scatter(dataset[:, 0], dataset[:, 1], c=assignments[:, 0].A)
plt.show()
print(assignments)
2. The bisecting k-means algorithm
The k-means algorithm may converge to a local minimum rather than the global minimum. One index used to measure clustering quality is the sum of squared errors (SSE). Because the errors are squared, points far from their cluster center are weighted more heavily. To overcome the tendency of k-means to converge to a local minimum, the bisecting k-means algorithm was proposed.
First, all points are treated as one cluster. Then one cluster is split into two, and at each step the cluster whose split most reduces the total SSE is the one chosen to be split, until the specified number of clusters is reached.
Pseudo code
Think of all the points as a cluster
Calculate SSE
When the number of clusters is smaller than k:
for each cluster:
Calculated total error
k mean clustering on a given cluster (k=2)
Calculate the total error after splitting this cluster into two
Select the cluster with the minimum error to partition
python implementation
import numpy as np
import matplotlib.pyplot as plt
def loadDataSet(fileName):
    """Load a tab-delimited numeric data set from a text file.

    Parameters:
        fileName: path to a file where each line holds tab-separated numbers.

    Returns:
        np.ndarray of shape (n_samples, n_features) with dtype float32.
    """
    with open(fileName) as f:
        # Iterate the file object directly instead of f.readlines(),
        # which would materialize the whole file in memory first.
        dataMat = [line.strip().split('\t') for line in f]
    return np.array(dataMat).astype(np.float32)
def distEclud(vecA, vecB):
    """Return the Euclidean (L2) distance between vectors vecA and vecB."""
    diff = vecA - vecB
    # np.square is element-wise even for np.matrix operands.
    return np.sqrt(np.sum(np.square(diff)))
def randCent(dataSet, k):
    """Create k random centroids inside the bounding box of dataSet.

    Each coordinate is drawn uniformly between the minimum and maximum
    value that dimension takes in the data.
    """
    nDim = np.shape(dataSet)[1]
    centroids = np.mat(np.ones((k, nDim)))
    for dim in range(nDim):
        lo = min(dataSet[:, dim])
        span = max(dataSet[:, dim]) - lo
        centroids[:, dim] = lo + span * np.random.rand(k, 1)
    return centroids
def kMeans(dataSet,k,distMeans = distEclud,createCent = randCent):
m = np.shape(dataSet)[0]
clusterAssment = np.mat(np.zeros((m,2)))
centroids = createCent(dataSet,k)
clusterChanged = True
while clusterChanged:
clusterChanged = False
for i in range(m):
minDist = np.inf
minIndex = -1
for j in range(k):
distJI = distMeans(dataSet[i,:],centroids[j,:])
if distJI < minDist:
minDist = distJI
minIndex = j
if clusterAssment[i,0] != minIndex:
clusterChanged = True
clusterAssment[i,:] = minIndex,minDist**2
for cent in range(k):
ptsInClust = dataSet[np.nonzero(clusterAssment[:,0].A == cent)[0]]
centroids[cent,:] = np.mean(ptsInClust,axis = 0)
return centroids,clusterAssment
def biKmeans(dataSet,k,distMeans = distEclud):
    """Bisecting k-means clustering.

    Starts with a single cluster holding every point, then repeatedly
    trial-splits each existing cluster with 2-means and keeps the split
    that yields the lowest total SSE, until k clusters exist.

    Parameters:
        dataSet: (m, n) array of samples.
        k: desired number of clusters.
        distMeans: distance function taking two vectors.

    Returns:
        (centroids, clusterAssment): a (k, n) matrix of centroids and an
        (m, 2) matrix whose columns are [cluster index, squared distance
        to that cluster's centroid].
    """
    m = np.shape(dataSet)[0]
    # Column 0: cluster index; column 1: squared distance to its centroid.
    clusterAssment = np.mat(np.zeros((m,2)))
    # Start with a single centroid: the mean of the whole data set.
    centroid0 = np.mean(dataSet,axis=0).tolist()
    centList = [centroid0]
    # Record each point's initial SSE contribution w.r.t. that centroid.
    for j in range(m):
        clusterAssment[j,1] = distMeans(dataSet[j,:],np.mat(centroid0))**2
    while (len(centList)<k):
        lowestSSE = np.inf
        # Trial-split every current cluster and remember the best split.
        for i in range(len(centList)):
            ptsInCurrCluster = dataSet[np.nonzero(clusterAssment[:,0].A == i)[0],:]
            centroidMat,splitClustAss = kMeans(ptsInCurrCluster,2,distMeans)
            # Total error = SSE of the freshly split cluster plus the
            # SSE of all points left untouched by this split.
            sseSplit = np.sum(splitClustAss[:,1])
            sseNotSplit = np.sum(clusterAssment[np.nonzero(clusterAssment[:,0].A != i)[0],1])
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat.copy()
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit
        print('the best cent to split is ',bestCentToSplit)
        # Relabel the two halves: sub-cluster 1 gets a brand-new index,
        # sub-cluster 0 inherits the index of the cluster that was split.
        # (Order matters: len(centList) is always > bestCentToSplit.)
        bestClustAss[np.nonzero(bestClustAss[:,0].A == 1)[0],0] = len(centList)
        bestClustAss[np.nonzero(bestClustAss[:,0].A == 0)[0],0] = bestCentToSplit
        # Write the new assignments back for the points of the split cluster.
        clusterAssment[np.nonzero(clusterAssment[:,0].A == bestCentToSplit)[0],:] = bestClustAss.copy()
        # Replace the split centroid in place and append the new one.
        centList[bestCentToSplit] = bestNewCents[0,:].tolist()[0]
        centList.append(bestNewCents[1,:].tolist()[0])
    return np.mat(centList),clusterAssment
# Demo: bisecting k-means on testSet2.txt with 3 clusters; plot the
# points coloured by cluster index, then overlay the centroids.
dataset = loadDataSet('testSet2.txt')
centroids, assignments = biKmeans(dataset, 3)
figure = plt.figure(0)
axes = figure.add_subplot(111)
axes.scatter(dataset[:, 0], dataset[:, 1], c=assignments[:, 0].A, cmap=plt.cm.Paired)
axes.scatter(centroids[:, 0], centroids[:, 1])
plt.show()
print(assignments)
print(centroids)
Code and Data sets download: K-means