YCX's Homepage

机器学习实战——KNN实现

2017-09-26

机器学习实战——KNN实现

  • 计算已知类别数据集中的点与当前点之间的距离;
  • 按照距离递增次序排序;
  • 选取与当前点距离最小的k个点;
  • 确定前k个点所在类别的出现频率;
  • 返回前k个点出现频率最高的类别作为当前点的预测分类。
# -*- coding: utf-8 -*-

from numpy import *
import operator

'''构造数据'''
def createDataSet():
    characters=array([[1.0,1.1],[1.0,1.0],[0,0],[0,0.1]])
    labels=['A','A','B','B']
    return characters,labels

def classify(sample,dataSet,labels,k):
    dataSetSize=dataSet.shape[0] #统计数据集的个数:4个
    '''下面四行计算待分类的点和训练集中的任一点的欧式距离'''
    diffMat=tile(sample,(dataSetSize,1))-dataSet #样本与每个数据集中数据的差值构成矩阵
    sqDiffMat=diffMat**2 #差值矩阵求平方
    sqDistances=sqDiffMat.sum(axis=1) #差值矩阵中每项的两个数字求和
    distances=sqDistances**0.5 #开方
    sortedDistIndicies=distances.argsort() #按distances中元素进行升序排序后得到的对应下标的列表
    '''选择距离最小的k个点'''
    classCount={}
    for i in range(k):
        voteIlabel=labels[sortedDistIndicies[i]]
        classCount[voteIlabel]=classCount.get(voteIlabel,0)+1
    '''从大到小排序'''
    sortedClassCount=sorted(classCount.items(),key=operator.itemgetter(1),reverse=True)
    return sortedClassCount[0][0]

def main():
    sample=[0,0]
    k=3
    group,labels=createDataSet()
    label=classify(sample,group,labels,k)
    print("Classified Label:"+label)

if __name__=='__main__':
    main()

KNN在数字识别上的应用: 数据集点这里,准确率为98.94%。

# -*- coding: utf-8 -*-

from numpy import *
import operator
import os 

def classify(sample,dataSet,labels,k):
    dataSetSize=dataSet.shape[0] #统计数据集的个数:4个
    '''下面四行计算待分类的点和训练集中的任一点的欧式距离'''
    diffMat=tile(sample,(dataSetSize,1))-dataSet #样本与每个数据集中数据的差值构成矩阵
    sqDiffMat=diffMat**2 #差值矩阵求平方
    sqDistances=sqDiffMat.sum(axis=1) #差值矩阵中每项的两个数字求和
    distances=sqDistances**0.5 #开方
    sortedDistIndicies=distances.argsort() #按distances中元素进行升序排序后得到的对应下标的列表
    '''选择距离最小的k个点'''
    classCount={}
    for i in range(k):
        voteIlabel=labels[sortedDistIndicies[i]]
        classCount[voteIlabel]=classCount.get(voteIlabel,0)+1
    '''从大到小排序'''
    sortedClassCount=sorted(classCount.items(),key=operator.itemgetter(1),reverse=True)
    return sortedClassCount[0][0]

def img2vector(filename):
     row = 32
     col = 32 
     imgVector = zeros((1,row*col))
     file = open(filename)
     for r in range(row):
          lineStr = file.readline()
          for c in range(col):
               imgVector[0,c+r*32] = int(lineStr[c])
               
     return imgVector

#载入数据集
def loadDataSet():  
    ## step 1: Getting training set  
    dataSetDir = 'C:/Users/Rainey/Desktop/digits/'  
    trainingFileList = os.listdir(dataSetDir + 'trainingDigits') # load the training set  
    numSamples = len(trainingFileList)  
  
    train_x = zeros((numSamples, 1024))  
    train_y = []  
    for i in range(numSamples):  
        filename = trainingFileList[i]  
  
        # get train_x  
        train_x[i, :] = img2vector(dataSetDir + 'trainingDigits/%s' % filename)   
  
        # get label from file name such as "1_18.txt"  
        label = int(filename.split('_')[0]) # return 1  
        train_y.append(label)  
  
    ## step 2: Getting testing set    
    testingFileList = os.listdir(dataSetDir + 'testDigits') # load the testing set  
    numSamples = len(testingFileList)  
    test_x = zeros((numSamples, 1024))  
    test_y = []  
    for i in range(numSamples):  
        filename = testingFileList[i]  
  
        # get train_x  
        test_x[i, :] = img2vector(dataSetDir + 'testDigits/%s' % filename)   
  
        # get label from file name such as "1_18.txt"  
        label = int(filename.split('_')[0]) # return 1  
        test_y.append(label)  
  
    return train_x, train_y, test_x, test_y  
  
# test hand writing class  
def testHandWritingClass():  
    ## step 1: load data  
    print('loading data...')
    train_x, train_y, test_x, test_y = loadDataSet()  
  
    ## step 2: training...  
    pass  
    print('training...')
  
    ## step 3: testing 
    print('testing...')
    numTestSamples = test_x.shape[0]  
    matchCount = 0  
    for i in range(numTestSamples):  
        predict = classify(test_x[i], train_x, train_y, 3)  
        if predict == test_y[i]:  
            matchCount += 1  
    accuracy = float(matchCount) / numTestSamples  
  
    ## step 4: show the result  
    print("step 4: show the result...")  
    print('The classify accuracy is: %.2f%%' % (accuracy * 100))
testHandWritingClass()

赞助

如果您觉得本文对您有帮助,请扫描下方二维码激励我写作,欢迎留言,谢谢支持!


Similar Posts

Comments