使用KNN算法进行约会网站配对预测 (Dating-site match prediction with the k-Nearest-Neighbours algorithm)
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''=================================================
@Project -> File :KNN -> kNN
@IDE :PyCharm
@Author :zgq
@Date :2021/1/7 14:15
@Desc :
=================================================='''
from numpy import *
import operator #运算符模块
import matplotlib
import matplotlib.pyplot as plt
def creatDateSet():
    """Build a tiny toy data set for smoke-testing the classifier.

    Returns:
        points: (4, 2) numpy array of 2-D feature vectors.
        tags: list of four class labels ('A' or 'B'), one per row.
    """
    points = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    tags = ['A', 'A', 'B', 'B']
    return points, tags

group, labels = creatDateSet()
def classify0(inX, dataSet, labels, k):
    """Classify inX by majority vote among its k nearest training samples.

    Args:
        inX: feature vector to classify.
        dataSet: (n, d) numpy array of training samples.
        labels: sequence of n class labels, aligned with dataSet rows.
        k: number of neighbours to consult.

    Returns:
        The label that occurs most often among the k closest samples
        (Euclidean distance).
    """
    rows = dataSet.shape[0]
    # Repeat inX to match the training matrix, then take per-row differences.
    deltas = tile(inX, (rows, 1)) - dataSet
    # Euclidean distance of inX to every training sample.
    dists = ((deltas ** 2).sum(axis=1)) ** 0.5
    order = dists.argsort()  # indices of samples, nearest first
    # Tally the labels of the k nearest neighbours.
    votes = {}
    for idx in order[:k]:
        tag = labels[idx]
        votes[tag] = votes.get(tag, 0) + 1
    # max() returns the first-inserted label among ties, matching the
    # stable sort of the original implementation.
    return max(votes, key=votes.get)
# Smoke test: classify the point [0, 0] against the toy data set (expect 'B').
test=classify0([0,0],group,labels,3)
print(test)
def file2matrix(filename):
    """Parse a tab-separated record file into numpy features and labels.

    Each line is expected to hold three numeric feature fields followed by
    an integer class label, separated by tabs.

    Args:
        filename: path to the data file.

    Returns:
        returnMat: (n, 3) numpy array of the three feature columns.
        classLabelVector: list of n int labels, one per line.
    """
    # 'with' guarantees the handle is closed; the original leaked it.
    with open(filename) as fr:
        arrayOLines = fr.readlines()
    returnMat = zeros((len(arrayOLines), 3))  # one row per record, 3 features
    classLabelVector = []
    for index, line in enumerate(arrayOLines):
        fields = line.strip().split('\t')
        returnMat[index, :] = fields[0:3]  # numpy converts the strings to floats
        classLabelVector.append(int(fields[-1]))  # last field is the class label
    return returnMat, classLabelVector
# Convert the raw text records to numpy arrays and inspect them.
datingDataMat,datingLabels=file2matrix('datingTestSet2.txt')
print(datingDataMat)
print(datingLabels)
# Visualize the data: scatter of feature columns 0 and 1,
# with marker size and colour scaled by the class label.
fig=plt.figure()
ax=fig.add_subplot(111)
ax.scatter(datingDataMat[:,0],datingDataMat[:,1],15.0*array(datingLabels),15.0*array(datingLabels))
plt.show()
def autoNorm(dataSet):
    """Min-max scale every feature column of dataSet into [0, 1].

    Without scaling, features with large numeric ranges would dominate the
    distance computation regardless of their real importance.

    Args:
        dataSet: (n, d) numpy array of raw feature values.

    Returns:
        normDataSet: (n, d) array with each column rescaled to [0, 1].
        ranges: per-column value span (max - min).
        minVals: per-column minimum, needed to normalize new samples.
    """
    minVals = dataSet.min(0)   # column-wise minima (a d-vector)
    maxVals = dataSet.max(0)   # column-wise maxima
    ranges = maxVals - minVals
    rows = dataSet.shape[0]
    # Subtract the minima from every row, then divide by the spans;
    # tile() replicates the d-vectors to match the matrix shape.
    shifted = dataSet - tile(minVals, (rows, 1))
    normDataSet = shifted / tile(ranges, (rows, 1))
    return normDataSet, ranges, minVals
# Normalize the full data set and inspect the result.
normMat,ranges,minVals=autoNorm(datingDataMat)
print("输出归一化后测试集")
print(normMat)
def datingClassTest():
    """Evaluate the classifier with a simple hold-out split.

    The first hoRatio fraction of the (already shuffled) data set is used
    as the test set; the remainder is the training set. Prints each
    prediction against the true label.

    Returns:
        The error rate (fraction of misclassified test samples) — the
        original only printed it; returning it makes the function usable
        programmatically without changing any output.
    """
    hoRatio = 0.10  # fraction of samples held out for testing
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]                # total number of samples
    numTestVecs = int(m * hoRatio)      # size of the test set
    errorCount = 0.0
    for i in range(numTestVecs):
        # Train on everything past the test slice; classify sample i.
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :], datingLabels[numTestVecs:m], 3)
        print("the classifier came back with: %d,the real answer is : %d" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount = errorCount + 1
    errorRate = errorCount / float(numTestVecs)
    print("the total error rate is : %f" % (errorRate))
    return errorRate

datingClassTest()
def classifyPerson():
    """Interactively classify a new person for the dating site.

    Prompts for the three feature values, normalizes them with the same
    min/range statistics as the training data, and prints the predicted
    likability category.
    """
    outcomes = ['not at all', 'in small doses', 'in large doses']
    # Gather the three features from the user.
    percentTats = float(input("percentage of time spent playing video games?"))
    iceCream = float(input("liters of ice cream consumed per year?"))
    ffMiles = float(input("frequent flier miles earned per year?"))
    # Rebuild the normalized training set and its scaling statistics.
    trainMat, trainLabels = file2matrix('datingTestSet2.txt')
    normTrain, spans, mins = autoNorm(trainMat)
    # Feature order must match the data file columns: miles, games, ice cream.
    sample = array([ffMiles, percentTats, iceCream])
    verdict = classify0((sample - mins) / spans, normTrain, trainLabels, 3)
    # Labels are 1-based, so shift down to index the outcome list.
    print("you will probably like this person:", outcomes[verdict - 1])

classifyPerson()
运行结果:
数据展示
将原始数据txt转换为numpy
[[4.0920000e+04 8.3269760e+00 9.5395200e-01]
[1.4488000e+04 7.1534690e+00 1.6739040e+00]
[2.6052000e+04 1.4418710e+00 8.0512400e-01]
...
[2.6575000e+04 1.0650102e+01 8.6662700e-01]
[4.8111000e+04 9.1345280e+00 7.2804500e-01]
[4.3757000e+04 7.8826010e+00 1.3324460e+00]]
输出归一化后测试集
[[0.44832535 0.39805139 0.56233353]
[0.15873259 0.34195467 0.98724416]
[0.28542943 0.06892523 0.47449629]
...
[0.29115949 0.50910294 0.51079493]
[0.52711097 0.43665451 0.4290048 ]
[0.47940793 0.3768091 0.78571804]]
测试错误率结果:
the classifier came back with: 3,the real answer is : 3
the classifier came back with: 2,the real answer is : 2
the classifier came back with: 1,the real answer is : 1
the classifier came back with: 1,the real answer is : 1
the classifier came back with: 1,the real answer is : 1
the classifier came back with: 1,the real answer is : 1
the classifier came back with: 3,the real answer is : 3
the classifier came back with: 3,the real answer is : 3
the classifier came back with: 1,the real answer is : 1
the classifier came back with: 3,the real answer is : 3
……
the total error rate is : 0.050000
输入新数据测试结果:
percentage of time spent playing video games?>? 10
liters of ice cream consumed per year?>? 10000
frequent flier miles earned per year?>? 0.5
you will probably like this person: in large doses
更多推荐
所有评论(0)