#-*-coding:utf-8-*-
import sys,os
import csv
from numpy import *
import operator
path=sys.path[0]
def loadTrainData():
    """Load ``train.csv`` and return binarized features plus their labels.

    The file is read from the directory stored in the module-level ``path``.
    The first CSV row is treated as a header and skipped.  In every remaining
    row the first column is the label and the other columns are features.

    Returns:
        A ``(data, label)`` tuple.  ``data`` is the feature table after
        ``toint`` and ``nomalizing`` (every non-zero value replaced by 1),
        with shape (rows, columns - 1).  ``label`` is the first column after
        ``toint``, with shape (rows, 1).

    Raises:
        ValueError: if the file holds no data rows after the header.
    """
    with open(path + '/train.csv') as handle:
        reader = csv.reader(handle)
        next(reader, None)  # skip the header row
        rows = list(reader)
    if not rows:
        raise ValueError('train.csv contains no data rows')

    table = array(rows)  # 2-D array of strings, one CSV row per line
    row, col = table.shape
    print('train %d %d' % (row, col))

    # Column 0 is the label; it is converted first so the shape lines that
    # toint prints appear in the same order as before (label, then data).
    label = toint(table[:, :1])
    # Parse the feature strings as floats before truncating them in toint.
    data = toint(table[:, 1:].astype(float))
    return nomalizing(data), label
def loadTestdata():
    """Load ``test.csv`` and return its values as a binarized array.

    The file is read from the directory stored in the module-level ``path``.
    The first CSV row is treated as a header and skipped; every remaining
    column is a feature (there is no label column).

    Returns:
        The feature table after ``toint`` and ``nomalizing`` (every non-zero
        value replaced by 1), with shape (rows, columns).

    Raises:
        ValueError: if the file holds no data rows after the header.
    """
    with open(path + '/test.csv') as handle:
        reader = csv.reader(handle)
        next(reader, None)  # skip the header row
        rows = list(reader)
    if not rows:
        raise ValueError('test.csv contains no data rows')

    table = array(rows)  # 2-D array of strings, one CSV row per line
    row, col = table.shape
    print('test %d %d' % (row, col))

    # Parse the strings as floats before truncating them in toint.
    data = toint(table.astype(float))
    return nomalizing(data)
def loadTestResult():
    """Load the second column of ``sample_submission.csv`` as a column array.

    The file is read from the directory stored in the module-level ``path``.
    The first CSV row is treated as a header and skipped; from each remaining
    row only the value at column index 1 is kept.

    Returns:
        The kept values after ``toint``, with shape (rows, 1).

    Raises:
        ValueError: if the file holds no data rows after the header.
        IndexError: if a data row has fewer than two columns.
    """
    with open(path + '/sample_submission.csv') as handle:
        reader = csv.reader(handle)
        next(reader, None)  # skip the header row
        rows = list(reader)
    if not rows:
        raise ValueError('sample_submission.csv contains no data rows')

    row, col = len(rows), len(rows[0])
    print('test %d %d' % (row, col))

    # Build the (rows, 1) column directly from the value at index 1.
    label = array([[record[1]] for record in rows])
    return toint(label)
def nomalizing(array):
    """Binarize ``array`` in place: every non-zero entry becomes 1.

    Args:
        array: a NumPy array, or a 2-D nested mutable sequence such as a
            list of lists.  Note that inside this function the parameter
            name hides NumPy's ``array`` constructor.

    Returns:
        The same object that was passed in, after modification.
    """
    if isinstance(array, ndarray):
        # One mask assignment replaces the per-element Python loop.
        array[array != 0] = 1
        return array

    # Nested sequences are still updated element by element so the caller's
    # object is mutated in place, as before.
    m, n = shape(array)
    for i in range(m):
        for j in range(n):
            if array[i][j] != 0:
                array[i][j] = 1
    return array
def toint(array):
    """Return a new float array holding the integer part of every entry.

    Prints the input's two dimensions (``rows cols``) as a side effect.

    Args:
        array: a 2-D array-like of numbers or numeric strings.  Inside this
            function the parameter name hides NumPy's ``array`` constructor.

    Returns:
        A new floating-point array of the same shape whose values are
        truncated toward zero.  The input is not modified.

    Raises:
        ValueError: if ``array`` is not 2-D or holds non-numeric text.
    """
    m, n = shape(array)  # also enforces that the input is 2-D
    print('%d %d' % (m, n))
    # astype(float) parses numeric strings; trunc then drops the fractional
    # part the same way int() does, for the whole array at once.
    return trunc(asarray(array).astype(float))
def classify(inX, dataSet, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distances are squared Euclidean distances.  The square root is skipped
    because it does not change the ordering of the neighbours.

    Args:
        inX: the feature vector to classify (flattened to 1-D).
        dataSet: the training samples, one row per sample.
        labels: one label per training row, in any shape that flattens to
            that order (the caller in this file passes a 1 x rows array).
        k: number of nearest neighbours that vote.

    Returns:
        The label with the most votes.  On a tie, the tied label whose
        first vote came from the closest neighbour wins.

    Raises:
        ValueError: if ``k`` is smaller than 1 or larger than the number of
            training rows.
    """
    # Work in floating point so unsigned integer inputs cannot wrap around
    # when the difference is taken.
    samples = atleast_2d(asarray(dataSet, dtype=float))
    query = asarray(inX, dtype=float).ravel()
    labelRow = asarray(labels).ravel()

    dataSetSize = samples.shape[0]
    if k < 1 or k > dataSetSize:
        raise ValueError('k must be between 1 and %d, got %r'
                         % (dataSetSize, k))

    # Broadcasting subtracts the query from every training row without
    # first building a repeated copy of the query.
    diffMat = samples - query
    distances = (diffMat ** 2).sum(axis=1)  # one squared distance per row
    # Indices of the training rows ordered from nearest to farthest.
    sortedDistIndicies = distances.argsort()

    classCount = {}
    seenOrder = []  # labels in order of first appearance among neighbours
    for index in sortedDistIndicies[:k]:
        voteIlabel = labelRow[index]
        if voteIlabel not in classCount:
            classCount[voteIlabel] = 0
            seenOrder.append(voteIlabel)
        classCount[voteIlabel] += 1

    # Strict '>' keeps the earliest-seen label on ties, making the result
    # independent of dictionary ordering.
    winner = seenOrder[0]
    for candidate in seenOrder[1:]:
        if classCount[candidate] > classCount[winner]:
            winner = candidate
    return winner
def saveResult(result):
    """Write predictions to ``result3.csv`` in the current directory.

    The file gets an ``ImageId,Label`` header followed by one row per
    prediction; ids start at 1 and follow the order of ``result``.  The
    converted label list is also printed.

    Args:
        result: a sequence of scalar predictions; each is converted with
            ``int(float(value))`` before being written.
    """
    # Reshaping to a single column rejects inputs that do not hold exactly
    # one value per prediction.
    column = array(result).reshape(len(result), 1)
    l1 = [int(float(value)) for value in column[:, 0]]
    print(l1)

    # The csv module wants a binary file on Python 2 but a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] >= 3:
        handle = open('result3.csv', 'w', newline='')
    else:
        handle = open('result3.csv', 'wb')
    with handle:
        myW = csv.writer(handle)
        myW.writerow(['ImageId', 'Label'])
        for imageId, labelValue in enumerate(l1, 1):
            myW.writerow([str(imageId), labelValue])
#with open(path+'/result.csv') as file:
# myW=csv.writer(file)
# myW.writerow(result)
if __name__ == '__main__':
    # Script entry point: load the three CSV files, classify every test row
    # against the training set with k = 9, and write the predictions.
    trainData, trainLabel = loadTrainData()
    testData = loadTestdata()
    # testLabel is loaded but not used by the active code below; the
    # error-rate evaluation that would have consumed it is disabled.
    testLabel = loadTestResult()
    m, n = shape(testData)
    errorCount = 0

    # classify() expects the labels as a single row, so transpose once here
    # instead of on every call.
    labelRow = trainLabel.transpose()
    resultList = [
        classify(testData[i, :], trainData, labelRow, 9)
        for i in range(m)
    ]
    saveResult(resultList)