Hierarchical Clustering and K-Means Clustering
Case Study
Suppose you have a batch of blog data. Cluster the blogs by their content, using both hierarchical clustering and K-means clustering to examine how the blogs group together.
Suppose you have already crawled the data through an API and arranged it in the following format:
blogname,china,kids,music,yahoo,search,engine,google,operating,system,python
Read/WriteWeb,5,20,15,0,8,5,1,30,4,5
Search Engine,1,0,0,3,10,18,8,2,5,1
Operating System,5,0,1,0,7,10,30,12,7,3
Logic Diagram of the Approach
Python Code
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#@Time: 2019-11-17 16:12
#@Author: gaoll
import time
import random
import math
import pandas as pd
from pandas import Series,DataFrame
import numpy as np
import matplotlib.pyplot as plt
#Define the distance metric. The closer the value is to 0, the more alike the two items are.
#Pearson correlation distance, i.e. 1 minus a mean-centered cosine similarity.
def pearson(v1,v2):
    N = len(v1)
    #Simple sums
    sum1 = sum(v1)
    sum2 = sum(v2)
    #Sums of squares
    sum1sq = sum([np.power(v,2) for v in v1])
    sum2sq = sum([np.power(v,2) for v in v2])
    #Sum of products
    pdsum = sum([v1[i]*v2[i] for i in range(N)])
    #Compute the Pearson coefficient
    num = pdsum - (sum1*sum2)/N
    den = math.sqrt((sum1sq-np.power(sum1,2)/N) * (sum2sq-np.power(sum2,2)/N))
    if den == 0:
        return 0
    return 1 - num/den
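
#A quick check of the metric (illustrative values, not from the blog data):
#pearson([1,2,3],[1,2,3]) -> 0.0  (identical vectors, perfectly correlated)
#pearson([1,2,3],[3,2,1]) -> 2.0  (perfectly anti-correlated)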
#Define the 'cluster' type
class bicluster:
    """A node in the cluster tree: the merged children left and right,
    the merge distance, and the (averaged) feature vector vec."""
    def __init__(self,vec,left=None,right=None,distance=0.0,id=None):
        self.vec = vec
        self.left = left
        self.right = right
        self.distance = distance
        self.id = id
#Agglomerative (hierarchical) clustering
def hcluster(rows,distance=pearson):
    #Cache of already-computed distances, keyed by the pair of cluster ids
    distance_dict = {}
    currentclustid = -1
    #Initial clusters: before clustering, each vector is its own cluster, with id set to the row number
    cluster = [bicluster(rows[i],id=i) for i in range(len(rows))]
    #Main loop: keep merging the two closest clusters until only one remains
    while len(cluster)>1:
        #Track the closest pair found so far
        best_cluster = (0,1)
        lowest_distance = distance(cluster[0].vec,cluster[1].vec)
        for i in range(len(cluster)):
            for j in range(i+1,len(cluster)):
                #Compute the distance, or reuse the cached value
                if (cluster[i].id,cluster[j].id) not in distance_dict:
                    d = distance(cluster[i].vec,cluster[j].vec)
                    distance_dict[(cluster[i].id,cluster[j].id)] = d
                else:
                    d = distance_dict[(cluster[i].id,cluster[j].id)]
                if d < lowest_distance:
                    lowest_distance = d
                    best_cluster = (i,j)
        left_clust = cluster[best_cluster[0]]
        right_clust = cluster[best_cluster[1]]
        #The merged cluster's vector is the element-wise average of its children
        new_vec = [(left_clust.vec[i]+right_clust.vec[i])/2 for i in range(len(left_clust.vec))]
        new_clust = bicluster(vec=new_vec,left=left_clust,right=right_clust,distance=lowest_distance,id=currentclustid)
        #Remove the merged pair. Note: delete the larger index best_cluster[1] first;
        #deleting the smaller one first would shift the indices, so the second deletion
        #could hit the wrong element or go out of range.
        del cluster[best_cluster[1]]
        del cluster[best_cluster[0]]
        cluster.append(new_clust) #add the merged cluster
        currentclustid -= 1
    return cluster[0]
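
#Optional helper (not in the original post): print the cluster tree as indented
#text, a quick way to inspect the hierarchy without drawing the dendrogram.
def printclust(clust,labels=None,n=0):
    print(' '*n,end='')
    if clust.id < 0:
        print('-')  #a negative id marks a merged branch node
    else:
        print(labels[clust.id] if labels is not None else clust.id)
    if clust.left != None:
        printclust(clust.left,labels=labels,n=n+1)
    if clust.right != None:
        printclust(clust.right,labels=labels,n=n+1)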
#Draw a dendrogram to visualise the hierarchical clustering
#Get the overall height of the tree
def getheight(clust):
    #A leaf node has height 1; a branch's height is the sum of its children's heights
    if clust.left==None and clust.right==None:
        return 1
    return getheight(clust.left) + getheight(clust.right)

#Get the overall depth (drawn as width) of the tree, i.e. the longest cumulative merge distance
def getdepth(clust):
    #A leaf has depth 0; a branch's depth is its own distance plus the depth of its deeper child
    if clust.left==None and clust.right==None:
        return 0
    return max(getdepth(clust.left), getdepth(clust.right)) + clust.distance
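
#Worked example (hypothetical tree): merge leaves A and B at distance 0.2, then
#merge that branch with leaf C at distance 0.5. getheight returns 3 (three leaves)
#and getdepth returns 0.7 (0.5 plus the deeper child's depth of 0.2).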
#Draw the dendrogram. For a branch node at (x,y):
#(1) draw a vertical line spanning its children: the left child sits at
#    (x, y+getheight(right)/2) and the right child at (x, y-getheight(left)/2);
#(2) draw horizontal lines out to each child whose length represents the merge
#    distance, then recurse; a leaf node just prints its label as text.
#In case the overall distances are too large or too small, every distance is
#multiplied by a scaling factor.
def drawnode(fig,clust,x,y,scaling,labels):
    if clust.id >= 0:
        #Leaf node: just print its label
        label = labels[clust.id]
        fig.text(x,y,label)
    else:
        #Branch node: draw the vertical line and the horizontal lines out to the
        #left/right children, then draw each child
        h1 = getheight(clust.left)
        h2 = getheight(clust.right)
        top_y = y + h2/2
        bottom_y = y - h1/2
        #Vertical line
        fig.plot([x,x],[bottom_y,top_y])
        #Horizontal lines, scaled by the merge distance
        d = clust.distance * scaling
        fig.plot([x,x+d],[top_y,top_y])
        fig.plot([x,x+d],[bottom_y,bottom_y])
        #Draw the children
        drawnode(fig,clust.left,x+d,top_y,scaling,labels)
        drawnode(fig,clust.right,x+d,bottom_y,scaling,labels)
def drawdendrogram(clust,labels,tree_jpg='clusters_dendrogram.jpg'):
    width = 5 + 5
    height = getheight(clust) + 5
    depth = getdepth(clust)
    #Scale all distances so the tree fits the fixed horizontal extent
    scaling = float((width-5)/depth)
    fig = plt.figure(figsize=(width+3, height+3))
    ax = fig.add_subplot(1,1,1)
    ax.set_title('clusters dendrogram')
    #Short stub line leading into the root node
    ax.plot([0.1,0.5],[height/2,height/2])
    drawnode(ax,clust,0.5,height/2,scaling,labels) #start drawing from the root
    fig.savefig(tree_jpg)
    plt.show()
#K-means clustering
def kcluster(rows,distance=pearson,k=3):
    N = len(rows[0])
    M = len(rows)
    #Find the minimum and maximum of each column
    domain = [(0,0) for i in range(N)]
    for i in range(N):
        tmp_list = [row[i] for row in rows]
        domain[i] = (min(tmp_list),max(tmp_list))
    #Create k randomly placed centroids within the per-column ranges
    clusters = [[domain[j][0] + random.random()*(domain[j][1]-domain[j][0]) for j in range(N)] for i in range(k)]
    #Assign each row to its nearest centroid, then move the centroids
    lastmatches = None
    for t in range(100):
        print('Iteration %d'%t) #report the iteration count
        bestmatches = [[] for i in range(k)] #the assignment produced by this iteration
        for j in range(M):
            row = rows[j]
            bestmatch = 0
            best_distance = distance(row,clusters[0])
            for i in range(1,k):
                dist = distance(row,clusters[i]) #distance from the row to each centroid
                if dist < best_distance:
                    bestmatch = i
                    best_distance = dist
            bestmatches[bestmatch].append(j) #row j joins its closest cluster
        #If the assignment did not change, the algorithm has converged
        if bestmatches == lastmatches:
            break
        lastmatches = bestmatches
        #Move each centroid to the average of the rows assigned to it
        for i in range(k):
            avgs = [0]*N
            cnt = len(bestmatches[i])
            if cnt > 0:
                for rowid in bestmatches[i]:
                    for j in range(N):
                        avgs[j] += rows[rowid][j]
                for j in range(N):
                    avgs[j] = avgs[j]/cnt
                clusters[i] = avgs
    return bestmatches
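
#Note: the centroids are seeded randomly, so repeated runs can produce different
#groupings. A minimal sketch for a reproducible run (hypothetical usage):
#    random.seed(42)
#    groups = kcluster(data, k=3)
#In practice k-means is often restarted several times and the tightest result kept.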
if __name__ == '__main__':
    #Load the data
    filename = 'blogdata_sample.txt'
    df = pd.read_csv(filename,sep=',',header=0,index_col=0,na_filter=False,skip_blank_lines=True,encoding='utf-8') #nrows=3
    blognames,words = df.index.tolist(),df.columns.tolist()
    data = []
    for i in range(len(blognames)):
        p = df.iloc[i].tolist()
        data.append(p)
    #Hierarchical clustering and the dendrogram
    clust = hcluster(data)
    drawdendrogram(clust,labels=blognames)
    #K-means clustering
    k = 3
    kclust = kcluster(data,k=k)
    print(kclust)
    print([[blognames[id] for id in kclust[i]] for i in range(k)])
Results
#1. Hierarchical clustering dendrogram (written to clusters_dendrogram.jpg)
#2. K-means clustering
print(kclust)
[[2], [0, 1, 3, 5, 6], [4]]
#print([[blognames[id] for id in kclust[i]] for i in range(k)])
[['Read/WriteWeb'], ["John Battelle's Searchblog", 'Search Engine Watch Blog', 'Official Google Blog', 'Google Blogoscoped', 'Google Operating System'], ['Search Engine Roundtable']]
Done