def chooseBestFeatureToSplit(dataSet, labels):
    """Pick the feature with the highest information gain.

    Handles both continuous (int/float) and discrete feature columns.
    For a continuous feature the n-1 midpoints between adjacent sorted
    values are tried as candidate thresholds and the best one is
    remembered. If the winning feature is continuous, its column is
    binarized in place (1 if value <= threshold else 0) and its entry in
    ``labels`` is renamed to "<label><=<threshold>".

    Args:
        dataSet: list of samples; each sample is a list of feature values
            with the class label in the last position. Mutated in place
            when a continuous feature wins.
        labels: list of feature names parallel to the feature columns.
            Mutated in place when a continuous feature wins.

    Returns:
        Index of the best feature, or -1 if no feature yields a positive
        information gain.
    """
    numFeatures = len(dataSet[0]) - 1          # last column is the class label
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    bestSplitDict = {}                         # feature label -> best threshold
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        if _isContinuousValue(featList[0]):
            # Candidate thresholds: midpoints of adjacent sorted values.
            sortedVals = sorted(featList)
            splitList = [(sortedVals[j] + sortedVals[j + 1]) / 2.0
                         for j in range(len(sortedVals) - 1)]
            if not splitList:
                # All values identical: no possible split, no information.
                # (The original left bestSplit unbound here and crashed,
                # or silently reused a stale index from a prior feature.)
                continue
            bestSplitEntropy = float('inf')    # replaces magic sentinel 10000
            bestSplit = 0
            # Evaluate the entropy of splitting at each candidate threshold
            # and keep the threshold that minimizes it.
            for j, value in enumerate(splitList):
                newEntropy = 0.0
                subDataSet0 = splitContinuousDataSet(dataSet, i, value, 0)
                subDataSet1 = splitContinuousDataSet(dataSet, i, value, 1)
                prob0 = len(subDataSet0) / float(len(dataSet))
                newEntropy += prob0 * calcShannonEnt(subDataSet0)
                prob1 = len(subDataSet1) / float(len(dataSet))
                newEntropy += prob1 * calcShannonEnt(subDataSet1)
                if newEntropy < bestSplitEntropy:
                    bestSplitEntropy = newEntropy
                    bestSplit = j
            # Record this feature's best threshold for the binarization step.
            bestSplitDict[labels[i]] = splitList[bestSplit]
            infoGain = baseEntropy - bestSplitEntropy
        else:
            # Discrete feature: weighted entropy over each distinct value.
            newEntropy = 0.0
            for value in set(featList):
                subDataSet = splitDataSet(dataSet, i, value)
                prob = len(subDataSet) / float(len(dataSet))
                newEntropy += prob * calcShannonEnt(subDataSet)
            infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    # If the winner is a continuous feature, binarize its column around the
    # recorded threshold and rename its label. Guard bestFeature >= 0: with
    # no positive gain the original inspected the label column (index -1)
    # and could raise KeyError on a numeric class label.
    if bestFeature >= 0 and _isContinuousValue(dataSet[0][bestFeature]):
        bestSplitValue = bestSplitDict[labels[bestFeature]]
        labels[bestFeature] = labels[bestFeature] + '<=' + str(bestSplitValue)
        for row in range(len(dataSet)):        # len() replaces numpy shape()[0]
            dataSet[row][bestFeature] = 1 if dataSet[row][bestFeature] <= bestSplitValue else 0
    return bestFeature


def _isContinuousValue(value):
    """True for numeric (int/float) feature values.

    Excludes bool to match the original ``type(x).__name__`` string check,
    which treated booleans as discrete.
    """
    return isinstance(value, (int, float)) and not isinstance(value, bool)