import copy
from math import log
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['font.family'] = 'SimHei'  # so Chinese labels display correctly
plt.rcParams['axes.unicode_minus'] = False  # so the minus sign displays correctly
decisionNode = dict(boxstyle="sawtooth", fc="0.8")
leafNode = dict(boxstyle="round4", fc="0.8")
arrow_args = dict(arrowstyle="<-")
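# Matplotlib annotate() styles intended for plotting the tree: box styles for the
# decision and leaf nodes, and the arrow style for the edges between them.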
class DecisionTree(object):
def __init__(self, decision_tree_type='CART', feature_list=None):
self.decision_tree_type = decision_tree_type
self.feature_list = feature_list
@staticmethod
def compute_entropy(x, y):
"""
计算给定数据的信息熵 H(S) = -SUM(P*logP)
:param x:
:param y:
:return:
"""
sample_num = len(x)
label_counter = Counter(y)
dataset_entropy = 0.0
for key in label_counter:
prob = float(label_counter[key]) / sample_num
            dataset_entropy -= prob * log(prob, 2)  # accumulate -p * log2(p)
return dataset_entropy
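    # Worked example (hypothetical data, for illustration only): for labels
    # ['yes', 'yes', 'no'], H = -(2/3)*log2(2/3) - (1/3)*log2(1/3) ≈ 0.918 bits.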
def dataset_split_by_id3(self, x, y):
"""
选择最好的数据集划分方式
ID3算法:选择信息熵增益最大
C4.5算法:选择信息熵增益比最大
:param x:
:param y:
:return:
"""
feature_num = len(x[0])
base_entropy = self.compute_entropy(x, y)
        best_info_gain = 0.0
best_feature_idx = -1
for i in range(feature_num):
unique_features = set([example[i] for example in x])
            new_entropy = 0.0
for feature in unique_features:
sub_dataset, sub_labels = [], []
for featVec, label in zip(x, y):
if featVec[i] == feature:
sub_dataset.append(list(featVec[:i]) + list(featVec[i + 1:]))
sub_labels.append(label)
prob = len(sub_dataset) / float(len(x))
new_entropy += prob * self.compute_entropy(sub_dataset, sub_labels)
info_gain = base_entropy - new_entropy
if info_gain > best_info_gain:
best_info_gain = info_gain
best_feature_idx = i
return best_feature_idx
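    # ID3 criterion: Gain(S, A) = H(S) - sum_v (|S_v| / |S|) * H(S_v), where S_v is the
    # subset of S whose feature A takes the value v; the feature with the largest gain
    # is returned (index into the current columns of x).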
def dataset_split_by_c45(self, x, y):
"""
选择最好的数据集划分方式
C4.5算法:选择信息熵增益比最大
:param x:
:param y:
:return:
"""
feature_num = len(x[0])
base_entropy = self.compute_entropy(x, y)
        best_info_gain_ratio = 0.0
best_feature_idx = -1
for i in range(feature_num):
unique_features = set([example[i] for example in x])
new_entropy, split_entropy = 0.0, 0.0
for feature in unique_features:
sub_dataset, sub_labels = [], []
for featVec, label in zip(x, y):
if featVec[i] == feature:
sub_dataset.append(list(featVec[:i]) + list(featVec[i + 1:]))
sub_labels.append(label)
prob = len(sub_dataset) / float(len(x))
new_entropy += prob * self.compute_entropy(sub_dataset, sub_labels)
split_entropy += -prob * log(prob, 2)
info_gain = base_entropy - new_entropy
info_gain_ratio = info_gain / split_entropy if split_entropy else 0.0
if info_gain_ratio > best_info_gain_ratio:
best_info_gain_ratio = info_gain_ratio
best_feature_idx = i
return best_feature_idx
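    # C4.5 criterion: GainRatio(S, A) = Gain(S, A) / SplitInfo(S, A), with
    # SplitInfo(S, A) = -sum_v (|S_v| / |S|) * log2(|S_v| / |S|); the ratio penalises
    # features with many distinct values, which plain information gain tends to favour.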
def create_tree_by_id3_and_c45(self, x, y, feature_list=None):
"""
创建决策树
:param x:
:param y:
:param feature_list:
:return:
"""
        # all samples belong to the same class, so stop splitting
if len(set(y)) <= 1:
return y[0]
        # only one feature column is left, so return the most frequent label
if len(x[0]) == 1:
            return Counter(y).most_common(1)[0][0]
        feature_list = list(range(len(x[0]))) if not feature_list else feature_list
if self.decision_tree_type == 'ID3':
best_feature_idx = self.dataset_split_by_id3(x, y)
elif self.decision_tree_type == 'C45':
best_feature_idx = self.dataset_split_by_c45(x, y)
else:
            raise KeyError('unsupported decision_tree_type: %s' % self.decision_tree_type)
        best_feature = feature_list[int(best_feature_idx)]  # name of the best feature
decision_tree = {best_feature: {}}
        # drop the chosen feature from the name list without mutating the caller's list
        feature_list = feature_list[:best_feature_idx] + feature_list[best_feature_idx + 1:]
        # all values that the best feature takes in the current dataset
best_feature_values = set([sample[best_feature_idx] for sample in x])
for value in best_feature_values:
sub_dataset, sub_labels = [], []
for featVec, label in zip(x, y):
if featVec[best_feature_idx] == value:
sub_dataset.append(list(featVec[:best_feature_idx]) + list(featVec[best_feature_idx + 1:]))
sub_labels.append(label)
decision_tree[best_feature][value] = self.create_tree_by_id3_and_c45(sub_dataset, sub_labels, feature_list)
return decision_tree
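    # The tree is returned as nested dicts keyed by feature name and feature value,
    # e.g. (hypothetical) {'outlook': {'sunny': 'no', 'overcast': 'yes', 'rainy': {...}}};
    # leaves are plain class labels.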
@staticmethod
def compute_gini(x, y):
"""
计算数据集x的基尼指数
:param x: 数据集
:param y: 数据集对应的类别标签
:return: 该数据集的gini指数
"""
unique_labels = set(y)
        sample_num = len(x)  # total number of samples
gini = 1.0
for label in unique_labels:
            gini_k = len(x[y == label]) / sample_num  # relative frequency of this class in y
gini -= gini_k ** 2
return gini
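    # Worked example (hypothetical data): for labels ['yes', 'yes', 'no', 'no'],
    # Gini = 1 - (2/4)**2 - (2/4)**2 = 0.5; a pure node has Gini = 0.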
def dataset_split_by_cart(self, x, y):
"""
选择最好的特征划分数据集,即返回最佳特征下标
:param x:
:param y:
:return:
"""
sample_num, feature_num = x.shape
        column_feature_gini = {}  # records the weighted Gini index Gini(D, A) for every (column, value) pair
        for i in range(feature_num):  # iterate over every feature column
            column_i = dict(Counter(x[:, i]))  # count how often each value appears in this column
            for value in column_i.keys():  # loop over the values of this column and compute the weighted Gini of the binary split
                # In some columns a value can line up exactly with the label (e.g. x == 'yes'
                # whenever y == 'yes'); splitting on either value then yields the same Gini.
                # To prefer pure splits in such ties, scale every candidate's Gini by a factor
                # slightly larger than 1, and reset the factor to 1 when the split is pure (Gini == 0).
                best_flag = 1.00001
cls_same, cls_diff = x[:, i] == value, x[:, i] != value
sub_x1, sub_y1 = x[cls_same], y[cls_same]
sub_x2, sub_y2 = x[cls_diff], y[cls_diff]
sublen1, sublen2 = len(sub_x1), len(sub_x2)
                # check whether this split produces a pure partition (Gini == 0)
if (sublen1 / sample_num) * self.compute_gini(sub_x1, sub_y1) == 0:
best_flag = 1
feaGini = (sublen1 / sample_num) * self.compute_gini(sub_x1, sub_y1) + \
(sublen2 / sample_num) * self.compute_gini(sub_x2, sub_y2)
column_feature_gini[(i, value)] = feaGini * best_flag
        # the (column index, value) pair with the smallest Gini index
best_feature_and_idx = min(column_feature_gini, key=column_feature_gini.get)
return best_feature_and_idx, column_feature_gini
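    # CART criterion for a binary split on feature A and value a:
    # Gini(D, A=a) = |D1|/|D| * Gini(D1) + |D2|/|D| * Gini(D2),
    # where D1 holds the samples with A == a and D2 the rest; the (column, value)
    # pair minimising this quantity is returned together with all candidate values.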
def create_tree_by_cart(self, x, y, feature_list=None):
"""
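        # The body of this method is not present in the source; the code below is a minimal
        # reconstruction sketch (an assumption, not the original implementation) that reuses
        # dataset_split_by_cart and grows binary branches of the form "== value" / "!= value".
        x, y = np.asarray(x), np.asarray(y)
        # all samples share the same label: return it as a leaf
        if len(set(y.tolist())) <= 1:
            return y[0]
        # no feature columns left: return the majority label
        if x.shape[1] == 0:
            return Counter(y.tolist()).most_common(1)[0][0]
        feature_list = list(range(x.shape[1])) if not feature_list else list(feature_list)
        (best_feature_idx, best_value), _ = self.dataset_split_by_cart(x, y)
        best_feature = feature_list[best_feature_idx]
        decision_tree = {best_feature: {}}
        mask = x[:, best_feature_idx] == best_value
        sub_features = feature_list[:best_feature_idx] + feature_list[best_feature_idx + 1:]
        for branch, branch_mask in (('== %s' % best_value, mask), ('!= %s' % best_value, ~mask)):
            sub_y = y[branch_mask]
            if sub_y.size == 0:
                # empty branch: fall back to the majority label of the parent node
                decision_tree[best_feature][branch] = Counter(y.tolist()).most_common(1)[0][0]
                continue
            sub_x = np.delete(x[branch_mask], best_feature_idx, axis=1)
            decision_tree[best_feature][branch] = self.create_tree_by_cart(sub_x, sub_y, sub_features)
        return decision_tree


if __name__ == '__main__':
    # Tiny illustrative dataset (hypothetical, not from the original source): two binary
    # features ('no surfacing', 'flippers') and a yes/no label.
    samples = [['1', '1'], ['1', '1'], ['1', '0'], ['0', '1'], ['0', '1']]
    labels = ['yes', 'yes', 'no', 'no', 'no']
    id3_tree = DecisionTree(decision_tree_type='ID3').create_tree_by_id3_and_c45(
        samples, labels, feature_list=['no surfacing', 'flippers'])
    print(id3_tree)
    cart_tree = DecisionTree(decision_tree_type='CART').create_tree_by_cart(
        np.array(samples), np.array(labels), feature_list=['no surfacing', 'flippers'])
    print(cart_tree)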