分级聚类、K均值聚类

案例

假设你有一批博客数据,请根据博客内容进行聚类,分别用分级聚类和K-均值聚类查看博客的分组情况。

假设你使用API已经爬取了相关数据,并且已经整理好数据格式如下:

blogname,china,kids,music,yahoo,search,engine,google,operating,system,python
Read/WriteWeb,5,20,15,0,8,5,1,30,4,5
Search Engine ,1,0,0,3,10,18,8,2,5,1
Operating System,5,0,1,0,7,10,30,12,7,3

思路逻辑图

Python代码

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#@Time: 2019-11-17 16:12
#@Author: gaoll

import time
import random
import math
import pandas as pd 
from pandas import Series,DataFrame
import numpy as np
import matplotlib.pyplot as plt


#定义度量函数。距离越接近于0,说明紧密度越高。
#皮尔逊相关度公式。/去中心化的余弦公式。
def pearson(v1,v2):
	"""Pearson-correlation distance between two equal-length vectors.

	Returns 1 - r (r = Pearson correlation), so 0 means perfectly
	correlated (most similar) and larger values mean less similar.
	Returns 0 when the denominator is zero (a vector with no variance).
	"""
	n = len(v1)
	#Simple sums and sums of squares for each vector.
	s1, s2 = sum(v1), sum(v2)
	sq1 = sum(x * x for x in v1)
	sq2 = sum(x * x for x in v2)
	#Sum of pairwise products.
	prod = sum(a * b for a, b in zip(v1, v2))
	#Pearson numerator / denominator (the de-centered cosine form).
	numerator = prod - s1 * s2 / n
	denominator = math.sqrt((sq1 - s1 ** 2 / n) * (sq2 - s2 ** 2 / n))
	if denominator == 0:
		return 0
	return 1 - numerator / denominator

#定义‘聚类’这一类型
class bicluster:
	"""Node of the hierarchical-clustering tree.

	A leaf wraps one original data row; an internal node records the two
	clusters it merged (left/right) and the distance between them.
	"""
	def __init__(self,vec,left=None,right=None,distance=0.0,id=None):
		self.id = id              #row index for leaves, negative for merged nodes
		self.vec = vec            #this cluster's vector (centroid after a merge)
		self.distance = distance  #distance between left and right at merge time
		self.left = left          #child cluster (None for a leaf)
		self.right = right        #child cluster (None for a leaf)

#递归聚类
#Agglomerative (bottom-up) hierarchical clustering.
def hcluster(rows,distance=pearson):
	"""Repeatedly merge the two closest clusters until one tree remains.

	rows: list of equal-length numeric vectors (one per item).
	distance: metric; smaller values mean more similar.
	Returns the root bicluster of the resulting tree.
	"""
	#Cache of computed distances, keyed by cluster-id pairs. Ids are
	#stable across merges (row index for leaves, negative for merged
	#nodes), so they are safe cache keys; list indices are not.
	distance_dict = {}
	currentclustid = -1  #merged clusters get decreasing negative ids
	#Initially every row is its own singleton cluster; id = row index.
	cluster = [bicluster(rows[i],id=i) for i in range(len(rows))]
	#Main loop: each pass merges the closest pair, shrinking the list by one.
	while len(cluster)>1:
		best_pair = (0,1)
		lowest_distance = distance(cluster[0].vec,cluster[1].vec)
		for i in range(len(cluster)):
			for j in range(i+1,len(cluster)):
				key = (cluster[i].id,cluster[j].id)
				#BUGFIX: the cache used to be written under list-index
				#keys (i,j) but read under id keys; once merges shifted
				#the indices, stale distances were returned. Store and
				#look up under the same id-pair key.
				if key not in distance_dict:
					distance_dict[key] = distance(cluster[i].vec,cluster[j].vec)
				d = distance_dict[key]
				if d < lowest_distance:
					lowest_distance = d
					best_pair = (i,j)
		left_clust = cluster[best_pair[0]]
		right_clust = cluster[best_pair[1]]
		#Centroid of the merged cluster: element-wise average.
		new_vec = [(left_clust.vec[i]+right_clust.vec[i])/2 for i in range(len(left_clust.vec))]
		new_clust = bicluster(vec=new_vec,left=left_clust,right=right_clust,distance=lowest_distance,id=currentclustid)
		#Delete the larger index first so the smaller index stays valid.
		del cluster[best_pair[1]]
		del cluster[best_pair[0]]
		cluster.append(new_clust)
		currentclustid -= 1
	return cluster[0]

#绘制树状图展示分级聚类
#获取树的整体高度
def getheight(clust):
	"""Total height of the subtree rooted at clust.

	Each leaf contributes height 1; an internal node's height is the
	sum of its two children's heights (i.e. its leaf count).
	"""
	is_leaf = clust.left is None and clust.right is None
	if is_leaf:
		return 1
	return sum(getheight(child) for child in (clust.left, clust.right))

#获取树的整体宽度,即分级聚类的最长距离
def getdepth(clust):
	"""Maximum cumulative merge distance from clust down to any leaf.

	A leaf has depth 0; an internal node adds its own distance on top
	of the deeper of its two children.
	"""
	if clust.left is None and clust.right is None:
		return 0
	deepest_child = max(getdepth(clust.left), getdepth(clust.right))
	return deepest_child + clust.distance

#画树状图。节点坐标(x,y):(1) 绘制垂直线,高度由left、right分支的高度决定:left分支位于(x, y+getheight(right)/2),right分支位于(x, y-getheight(left)/2)
#(2)画水平线表示聚类间的距离,水平线长度为 clust.distance * scaling。叶节点上打印文本说明
#以防整体距离太大或太小,可对距离都乘上一个比例系数scaling进行缩放。

def drawnode(fig,clust,x,y,scaling,labels):
	"""Recursively draw the subtree rooted at clust on a matplotlib axes.

	(x, y) is this node's position; scaling stretches merge distances
	into horizontal line lengths. Leaves (id >= 0) get their label text;
	internal nodes get a vertical joint plus two horizontal branches.
	"""
	if clust.id >= 0:
		#Leaf node: print its label at this position and stop.
		fig.text(x,y,labels[clust.id])
		return
	#Internal node: offset each branch by half the opposite subtree's
	#height so the branches do not overlap.
	upper_h = getheight(clust.right)
	lower_h = getheight(clust.left)
	top_y = y + upper_h/2
	bottom_y = y - lower_h/2
	#Vertical joint connecting the two branches.
	fig.plot([x,x],[bottom_y,top_y])
	#Horizontal lines: length encodes the merge distance.
	branch_len = clust.distance * scaling
	fig.plot([x,x+branch_len],[top_y,top_y])
	fig.plot([x,x+branch_len],[bottom_y,bottom_y])
	#Recurse into each branch at the end of its horizontal line.
	drawnode(fig,clust.left,x+branch_len,top_y,scaling,labels)
	drawnode(fig,clust.right,x+branch_len,bottom_y,scaling,labels)

def drawdendrogram(clust,labels,tree_jpg='clusters_dendrogram.jpg'):
	"""Render the cluster tree as a dendrogram, save it to tree_jpg and show it.

	clust: root bicluster of the tree.
	labels: leaf names, indexed by each leaf cluster's id.
	tree_jpg: output image path.
	"""
	width = 5 + 5                  #fixed width: 5 for the tree + 5 for labels
	height = getheight(clust) + 5  #one unit per leaf, plus margin
	depth = getdepth(clust)
	#Scale merge distances so the tree fits the drawing width.
	#BUGFIX: guard against depth == 0 (e.g. all merge distances are 0),
	#which previously raised ZeroDivisionError.
	scaling = (width - 5) / depth if depth else 0.0

	fig = plt.figure(figsize=(width+3, height+3))
	ax = fig.add_subplot(1,1,1)
	ax.set_title('clusters dendrogram')
	#Short stub from the left edge to the root node.
	ax.plot([0.1,0.5],[height/2,height/2])
	drawnode(ax,clust,0.5,height/2,scaling,labels)  #draw from the root

	fig.savefig(tree_jpg)
	plt.show()

#K均值聚类
#K-means clustering.
def kcluster(rows,distance=pearson,k=3):
	"""Partition rows into k clusters by iterative centroid refinement.

	rows: list of equal-length numeric vectors.
	distance: metric; smaller values mean more similar.
	k: number of clusters.
	Returns a list of k lists; each inner list holds the row indices
	assigned to that centroid. Stops after at most 100 iterations or
	when the assignment no longer changes.
	"""
	N = len(rows[0])  #number of features (columns)
	M = len(rows)     #number of data items (rows)
	#Per-column (min, max) ranges, used to seed the random centroids.
	domain = [(min(row[i] for row in rows), max(row[i] for row in rows)) for i in range(N)]

	#k random starting centroids, uniform within each column's range.
	clusters = [[domain[j][0] + random.random()*(domain[j][1]-domain[j][0]) for j in range(N)] for i in range(k)]

	lastmatches = None
	bestmatches = [[] for i in range(k)]
	for t in range(100):
		print('Iteration %d'%t)  #progress report
		bestmatches = [[] for i in range(k)]  #this iteration's assignment
		#Assign every row to its closest centroid.
		for j in range(M):
			row = rows[j]
			bestmatch = 0
			best_distance = distance(row,clusters[0])
			for i in range(1,k):
				dist = distance(row,clusters[i])
				if dist < best_distance:
					bestmatch = i
					best_distance = dist
			bestmatches[bestmatch].append(j)

		#Converged: same assignment as the previous iteration.
		if bestmatches == lastmatches:
			break
		lastmatches = bestmatches
		#Move each centroid to the mean of its assigned rows.
		for i in range(k):
			cnt = len(bestmatches[i])
			#BUGFIX: the division by cnt used to run even for an empty
			#cluster (cnt == 0), raising ZeroDivisionError when a random
			#centroid attracted no rows. Empty clusters now keep their
			#previous centroid instead.
			if cnt > 0:
				avgs = [0]*N
				for rowid in bestmatches[i]:
					for j in range(N):
						avgs[j] += rows[rowid][j]
				clusters[i] = [a/cnt for a in avgs]
	return bestmatches


if __name__ == '__main__':

	#Load the blog/word-count matrix: rows = blogs, columns = words.
	filename = 'blogdata_sample.txt'
	df = pd.read_csv(filename,sep=',',header=0,index_col=0,na_filter=False,skip_blank_lines=True,encoding='utf-8') #nrows=3
	blognames,words = df.index.tolist(),df.columns.tolist()
	data = [df.iloc[i].tolist() for i in range(len(blognames))]

	#Hierarchical clustering and dendrogram plot.
	clust = hcluster(data)
	drawdendrogram(clust,labels=blognames)

	#K-means clustering.
	kclust = kcluster(data)
	print(kclust)
	#BUGFIX: 'k' was undefined at module scope (it is a parameter of
	#kcluster), so this line raised NameError; iterate the returned
	#groups directly instead. Also avoid shadowing the builtin 'id'.
	print([[blognames[rowid] for rowid in group] for group in kclust])

结果展示

#1、分级聚类树形图

#2、K均值聚类
print(kclust)
[[2], [0, 1, 3, 5, 6], [4]]
#print([[blognames[id] for id in kclust[i]] for i in range(k)])
[['Read/WriteWeb'], ["John Battelle's Searchblog", 'Search Engine Watch Blog', 'Official Google Blog', 'Google Blogoscoped', 'Google Operating System'], ['Search Engine Roundtable']]

Done

Logo

CSDN联合极客时间,共同打造面向开发者的精品内容学习社区,助力成长!

更多推荐