import numpy as np
import pandas as pd
from ID3tree.tree_utils.TreeUtils import TreeUtils
from ID3tree.tree_utils.Data_Encoding import target_encode
global class_num
global epsilon
def calc_entropy(labels):
"""
计算信息熵
:param labels: 当前选择的数据集对应的类别值。
:return: 信息熵
"""
entropy = 0
# 获取不同的类别值
x_label_array = np.unique(labels)
for x in x_label_array:
# 布尔索引每个类别的样本
sub_y = labels[labels == x]
# 计算各类别所占数据集的比例
p = len(sub_y) / len(labels)
# 对应公式, 累加信息熵
entropy -= p*np.log2(p)
return entropy
def calc_condition_entropy(feature, y_label):
"""
当前属性为划分结点的条件熵
:param feature: 传入的某一个特征
:param y_label: 对应某一特征的类别值
:return:
"""
# 获取某特征的不同取值,unique会把列表元素去重
feature_label = np.unique(feature)
cond_ent = 0
for f in feature_label:
# 提取出来当下特征的正负样本数
sub_y = y_label[f == feature]
# 当下特征对应的信息熵
feature_ent = calc_entropy(sub_y)
cond_ent += len(sub_y) / len(y_label) * feature_ent
return cond_ent
def gain_info(feature, y_label):
"""
计算当前特征的信息增益,
:param feature:
:param y_label:
:return:
"""
return calc_entropy(y_label) - calc_condition_entropy(feature, y_label)
def data_processing(path):
data = pd.read_csv(path).iloc[:, 1:]
# 将数据特征和标签分别进行提取
# 传入最后一列,就是西瓜种类的标签,好瓜还是坏瓜
label = data.iloc[:, -1] # 最后一列是标签
# 提取每一列特征
features = []
for i in range(8):
features.append(data.iloc[:, i])
return data, label, features
def id3_tree_fit(train_set, train_label, features):
"""
递归实现ID3算法,以信息增益最大的特征作为划分方法
1、 递归出口是什么
2、 递归体是什么
:param train_set:递归产生的样本数据集
:param train_label:递归产生的,对应样本数据的类别标签值
:param features: 特征索引列表
:return:
"""
# 标签列表
y_label = ["好瓜", "坏瓜"]
feature_label = ['色泽', '根蒂', '敲声', '纹理', '脐部', '触感']
# 叶子结点
LEAF = "leaf"
# 内部结点类型
INTERNAL = "internal"
label_unique = np.unique(train_label)
# 当前结点包含的样本属于同一类别,无需划分
if len(label_unique) == 1:
print("类别:", y_label[int(label_unique[0])])
return TreeUtils(LEAF, Class=label_unique[0])
# 不同类别所对应的样本量
class_len = [(i, len(list(filter(lambda x: x == i, train_label))))for i in range(class_num)]
max_class, max_len = max(class_len, key=lambda x : x[1])
# 特征集为空的话,投票法来做,取样本量最大所对应的类别
if len(features) == 0:
print("类别:", max_class)
return TreeUtils(LEAF, Class=max_class)
max_feature, max_gain = 0, 0
# 计算每个特征的信息增益,选择信息增益最大特征划分结点依据
for feature in features:
feature_data = train_set.iloc[:, feature]
# 计算信息增益
feature_gain = gain_info(feature_data, train_label)
print("特征是: %s, 信息增益是; %f"%(feature_label[feature], feature_gain))
if feature_gain > max_gain:
max_feature, max_gain = feature, feature_gain
# 如果最大信息增益也比较小,那么就将不再划分
if max_gain < epsilon:
print("类别:", max_class)
return TreeUtils(LEAF, Class=max_class)
# 构造内部结点
tree = TreeUtils(INTERNAL, Class=max_feature)
print('-'*60)
print("最佳划分特征: %s,最大信息增益: %f "%(feature_label[max_feature], max_gain))
print("="*60)
# 构建完当前的最大信息增益特征之后,需要删除该特征,不让其再出现在接下来的位置
sub_feature = list(filter(lambda x : x !=max_feature, features))
# 当前最佳划分结点的样本值
max_feature_col = train_set.iloc[:, max_feature]
# 提取该特征的对应的特征变量
max_feature_unique_values = np.unique(max_feature_col)
for feature_value in max_feature_unique_values:
print("当特征为【%s】时, 值为【%s】"% (feature_label[max_feature], feature_value))
# 提取特征变量对应的样本
sub_train_set = train_set[train_set.iloc[:, max_feature] == feature_value]
# 提取样本对应的标签
sub_train_label = train_label[train_set.iloc[:, max_feature] == feature_value]
sub_tree = id3_tree_fit(sub_train_set, sub_train_label, sub_feature)
tree.add_tree(feature_value, sub_tree)
return tree
if __name__ == '__main__':
# 信息增益阈值,小于该值即为叶子结点
epsilon = 1e-3
data, label, features = data_processing('ID3tree\\datasets\\watermelon.csv')
# 对于二分类问题,将类别的文字叙述改为用0和1来表示
y_encode, class_num, y_label_dict = target_encode(label)
train_set = data.iloc[:, :-3]
feature_index = list(range(train_set.shape[1]))
id3_tree_fit(train_set, y_encode, feature_index)
评论0