# -*- coding: utf-8 -*-
import csv
import sys
from random import seed
from random import randrange
import numpy as np
import math
import collections
import operator
def loadCSV(filename):
    dataSet = []
    with open(filename, 'r') as file:
        csvReader = csv.reader(file)
        for line in csvReader:
            dataSet.append(line)
    return dataSet
def createDataSet(file_X, file_Y):
    dataSet_X = np.array(loadCSV(file_X))
    dataSet_Y = np.array(loadCSV(file_Y))
    # drop the header row and the id column of X, then append the label column of Y
    dataSet = np.hstack((dataSet_X[1:, 1:], np.expand_dims(dataSet_Y[1:, -1], axis=1)))
    labels = [i for i in range(1, 23)]
    return dataSet, labels
def calcShannonEnt(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * math.log(prob, 2)
    return shannonEnt
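# calcShannonEnt above computes the Shannon entropy of the class column:
# H(D) = -sum_k p_k * log2(p_k), where p_k is the fraction of samples in
# dataSet whose last column equals class k.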
def splitDataSet(dataSet, axis, value):
    # return the rows whose feature `axis` equals `value`, with that column removed
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = list(featVec[:axis])
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet
def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = 0
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
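# chooseBestFeatureToSplit above performs ID3-style selection: the information
# gain of feature i is Gain(D, i) = H(D) - sum_v |D_v|/|D| * H(D_v), where D_v
# is the subset of D taking value v on feature i, and the feature with the
# largest gain becomes the split.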
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]
    # stop if every remaining sample has the same class
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # stop if no features are left; fall back to the majority class
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del(labels[bestFeat])
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree
def classify(inputTree, featLabels, testVec):
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    # fall back to the first label if the node's feature label is not listed
    if firstStr not in featLabels:
        firstStr = list(featLabels)[0]
    featIndex = list(featLabels).index(firstStr)
    key = testVec[featIndex]
    # fall back to the first branch if this feature value was never seen in training
    if key not in secondDict:
        key = list(secondDict.keys())[0]
    valueOfFeat = secondDict[key]
    if isinstance(valueOfFeat, dict):
        classLabel = classify(valueOfFeat, featLabels, testVec)
    else:
        classLabel = valueOfFeat
    return classLabel
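# A tree is a nested dict of the form {featureLabel: {featureValue: subtree-or-class}}.
# With hypothetical values, a stump that splits on feature label 3 might look like
#   toy_tree = {3: {'a': '1', 'b': '-1'}}
# and classify(toy_tree, [1, 2, 3], ['x', 'y', 'a']) would return '1'.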
def storeTree(subtree, filename):
    import pickle
    # write each (tree, featureIndices) pair as a separate pickle record
    with open(filename, 'wb') as fw:
        for i in subtree:
            pickle.dump(i, fw)
def loadTree(filename):
    import pickle
    # read back every (tree, featureIndices) pair written by storeTree
    subtrees = []
    with open(filename, 'rb') as fr:
        while True:
            try:
                subtrees.append(pickle.load(fr))
            except EOFError:
                break
    return subtrees
def getRandomSample(file_X, file_Y):
    X = np.array(loadCSV(file_X))
    Y = np.array(loadCSV(file_Y))
    X = X[1:, :]                               # drop the header row so X and Y rows line up
    Y = np.expand_dims(Y[1:, -1], axis=1)      # label column, header dropped
    # bootstrap sample of 250 rows (with replacement) and 15 distinct feature
    # columns drawn from columns 1..X.shape[1]-2 (column 0 is the sample id)
    randomIdx = np.random.choice(X.shape[0], 250, replace=True)
    featIdx = sorted(np.random.choice(range(1, X.shape[1] - 1), 15, replace=False))
    x_ret = np.hstack((X[randomIdx][:, featIdx], Y[randomIdx]))
    return x_ret, list(featIdx)
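# Each call to getRandomSample draws a fresh bootstrap sample of rows plus a random
# subset of feature columns, so every tree in the forest is trained on different data
# (bagging combined with random feature subspaces).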
def predict(file_X, trees):
    X = np.array(loadCSV(file_X))
    ids = X[1:, 0]                 # first column holds the sample index
    X = X[1:, 1:]                  # drop the header row and the id column
    y_pred = []
    for x in X:
        votes = []
        for tree, features in trees:
            # the tree's node labels are original column indices; once the id
            # column is dropped, original column f is x[f - 1]
            votes.append(classify(tree, features, x[[f - 1 for f in features]]))
        # majority vote over all trees in the forest
        votes = collections.Counter(votes)
        y_pred.append(votes.most_common(1)[0][0])
    y_pred = np.array(y_pred).reshape(-1, 1)
    ids = np.array(ids).reshape(-1, 1)
    output = np.hstack((ids, y_pred))
    return output
def real(file_Y):
    Y = np.array(loadCSV(file_Y))
    Y = Y[1:, -1]
    return Y
if __name__ == '__main__':
    seed(319)
    np.random.seed(319)   # getRandomSample uses numpy's RNG, so seed it as well
    subtrees = []
    if sys.argv[1] == 'train':
        file_X = './dataset/X_train.csv'
        file_Y = './dataset/Y_train.csv'
        filename = './model/model_1000.pkl'
        trees_num = 1500
        for i in range(trees_num):
            X_rnd, features = getRandomSample(file_X, file_Y)
            node = createTree(X_rnd, features)
            subtrees.append((node, features))
        storeTree(subtrees, filename)
        pred = predict(file_X, subtrees)
        y_true = real(file_Y)
        # confusion counts on the training set (labels are 1 and -1)
        TP = 0
        FN = 0
        TN = 0
        FP = 0
        for i in range(len(pred)):
            y = int(y_true[i])
            y_ = int(pred[i][1])
            if y == 1 and y_ == 1:
                TP += 1
            elif y == 1 and y_ == -1:
                FN += 1
            elif y == -1 and y_ == -1:
                TN += 1
            else:
                FP += 1
        pre = TP * 1.0 / (TP + FP)
        rec = TP * 1.0 / (TP + FN)
        F1 = 2 * pre * rec / (pre + rec)
        print(F1)
    elif sys.argv[1] == 'test':
        filename = sys.argv[2]
        print(filename)
        subtrees = loadTree(filename)
        file_X = sys.argv[3]
        pred = predict(file_X, subtrees)
        with open('output.csv', 'w') as fout:
            fout.write('index,label\n')
            for i in pred:
                fout.write(str(i[0]) + ',' + str(i[1]) + '\n')
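# Usage, matching the argument handling above (the script's own filename is not given here):
#   python <script>.py train
#       trains trees_num trees on ./dataset/X_train.csv and ./dataset/Y_train.csv,
#       stores the forest at ./model/model_1000.pkl, and prints the training-set F1 score
#   python <script>.py test <model.pkl> <X_test.csv>
#       loads a stored forest and writes predictions to output.csv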