python
# Step 1: compute the Euclidean distance and sample the initial centroids.
# k is the total number of clusters.
import numpy as np


# calculate the Euclidean distance
def calculate_distance(vector1, vector2):
    """Return the Euclidean distance between two equally sized vectors.

    Args:
        vector1: Array-like of numbers (NumPy array, list or tuple).
        vector2: Array-like of numbers with the same shape as ``vector1``.

    Returns:
        The distance as a NumPy floating-point scalar.
    """
    # Coerce to float arrays so plain lists/tuples work too and unsigned
    # integer inputs cannot wrap around during the subtraction.
    difference = np.asarray(vector1, dtype=float) - np.asarray(vector2, dtype=float)
    return np.sqrt(np.sum(np.square(difference)))


# initialize centroids
def initialize_centroids(data, k):
    """Pick ``k`` distinct samples of ``data`` to serve as initial centroids.

    Args:
        data: Indexable collection of samples supporting ``len()`` (for
            example a 2-D NumPy array with one sample per row, or a list).
        k: Number of centroids to draw.

    Returns:
        A list with ``k`` elements of ``data``, drawn without replacement.

    Raises:
        ValueError: If ``k`` is larger than ``len(data)`` or negative
            (raised by ``random.sample``).
    """
    import random

    # Sample row indices rather than the data itself: on Python 3
    # random.sample() rejects NumPy arrays because they are not registered
    # as a Sequence.
    chosen = random.sample(range(len(data)), k)
    return [data[index] for index in chosen]
产生新的簇类并求出最短距离#find the minimun diastance from individual to centroidsdef minimun_distance(data,centroidlist): clusterdictionary=cd=dict() for i in data: vector1=i marker=0 min_dist=float(inf) for j in range(len(centroidlist)): vector2=centroidlist[j] distance=calculate_distance(vector1,vector2) if distance
# Load the data and run the computation; leave the loop once the clustering
# cost changes by less than a threshold.
import re  # not used by the code visible here; kept so no existing import is dropped


# get mean squared deviation
def getmsd(clusterdictionary, centroidlist):
    """Return the summed distance of every sample to its cluster centroid.

    Despite the name, the value is a plain sum of distances (as computed by
    ``calculate_distance``); it is neither squared nor averaged.

    Args:
        clusterdictionary: Mapping from a centroid index to the iterable of
            samples assigned to that centroid.
        centroidlist: Sequence of centroids, indexed by the mapping's keys.

    Returns:
        The total distance as a float (``0.0`` for an empty mapping).
    """
    total = 0.0
    for key, members in clusterdictionary.items():
        centroid = centroidlist[key]
        # Accumulate per cluster first, then add to the grand total, so the
        # floating-point summation order stays the same as before.
        total += sum(calculate_distance(centroid, member) for member in members)
    return total


# show result
def showresult(clusterdictionary, centroidlist):
    """Plot each cluster's samples and centroid with matplotlib.

    Only the first two coordinates of every sample and centroid are drawn.
    Samples are circles and centroids are larger diamonds; both share the
    colour chosen by the cluster key.

    Args:
        clusterdictionary: Mapping from an integer centroid index to the
            iterable of samples assigned to that centroid.
        centroidlist: Sequence of centroids, indexed by the mapping's keys.
    """
    import matplotlib.pyplot as plt

    colormark = ['or', 'ob', 'og', 'ok']
    centroidmark = ['dr', 'db', 'dg', 'dk']
    for key, members in clusterdictionary.items():
        # Cycle through the markers so more than four clusters do not raise
        # IndexError.
        slot = key % len(colormark)
        centroid = centroidlist[key]
        plt.plot(centroid[0], centroid[1], centroidmark[slot], markersize=12)
        for member in members:
            plt.plot(member[0], member[1], colormark[slot])
    # The call parentheses are required; a bare `plt.show` never opens the
    # figure window.
    plt.show()


# Guarded so that importing this module does not read the data file or open
# a plot window; running the file as a script behaves as before.
if __name__ == "__main__":
    path = 'C:\\Users\\jyjh\\Desktop\\data.txt'

    # One sample per line, tab-separated numeric fields.
    temp = []
    with open(path, 'r') as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Skip blank lines (e.g. a trailing newline at end of file).
                continue
            temp.append([float(field) for field in line.split('\t')])
    data = np.array(temp)

    centroidlist = initialize_centroids(data, 4)
    clusterdictionary = minimun_distance(data, centroidlist)
    new_msd = getmsd(clusterdictionary, centroidlist)
    old_msd = -0.000001
    k = 2  # iteration counter; not read anywhere in the visible code

    # Iterate until the clustering cost stops changing.
    while abs(new_msd - old_msd) >= 0.00001:
        centroidlist = getcentroids(clusterdictionary)
        clusterdictionary = minimun_distance(data, centroidlist)
        old_msd = new_msd
        new_msd = getmsd(clusterdictionary, centroidlist)
        k += 1
        # Parenthesised single-argument form prints identically on
        # Python 2 and Python 3.
        print(new_msd - old_msd)

    showresult(clusterdictionary, centroidlist)
对Kmeans了解
MATLAB 自带 kmeans 函数。