import copy
from math import log
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['font.family'] = 'SimHei'  # so Chinese labels display correctly
plt.rcParams['axes.unicode_minus'] = False  # so the minus sign displays correctly
decisionNode = dict(boxstyle="sawtooth", fc="0.8")
leafNode = dict(boxstyle="round4", fc="0.8")
arrow_args = dict(arrowstyle="<-")
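# Matplotlib annotate() styles intended for plotting the tree: box styles for the
# decision and leaf nodes, and the arrow style for the edges between them.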
class DecisionTree(object):
def __init__(self, decision_tree_type='CART', feature_list=None):
self.decision_tree_type = decision_tree_type
self.feature_list = feature_list
@staticmethod
def compute_entropy(x, y):
"""
计算给定数据的信息熵 H(S) = -SUM(P*logP)
:param x:
:param y:
:return:
"""
sample_num = len(x)
label_counter = Counter(y)
dataset_entropy = 0.0
for key in label_counter:
prob = float(label_counter[key]) / sample_num
            dataset_entropy -= prob * log(prob, 2)  # accumulate -p * log2(p)
return dataset_entropy
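    # Worked example (hypothetical data, for illustration only): for labels
    # ['yes', 'yes', 'no'], H = -(2/3)*log2(2/3) - (1/3)*log2(1/3) ≈ 0.918 bits.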
def dataset_split_by_id3(self, x, y):
"""
选择最好的数据集划分方式
ID3算法:选择信息熵增益最大
C4.5算法:选择信息熵增益比最大
:param x:
:param y:
:return:
"""
feature_num = len(x[0])
base_entropy = self.compute_entropy(x, y)
        best_info_gain = 0.0
best_feature_idx = -1
for i in range(feature_num):
unique_features = set([example[i] for example in x])
            new_entropy = 0.0
for feature in unique_features:
sub_dataset, sub_labels = [], []
for featVec, label in zip(x, y):
if featVec[i] == feature:
sub_dataset.append(list(featVec[:i]) + list(featVec[i + 1:]))
sub_labels.append(label)
prob = len(sub_dataset) / float(len(x))
new_entropy += prob * self.compute_entropy(sub_dataset, sub_labels)
info_gain = base_entropy - new_entropy
if info_gain > best_info_gain:
best_info_gain = info_gain
best_feature_idx = i
return best_feature_idx
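    # ID3 criterion: Gain(S, A) = H(S) - sum_v (|S_v| / |S|) * H(S_v), where S_v is the
    # subset of S whose feature A takes the value v; the feature with the largest gain
    # is returned (index into the current columns of x).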
def dataset_split_by_c45(self, x, y):
"""
选择最好的数据集划分方式
C4.5算法:选择信息熵增益比最大
:param x:
:param y:
:return:
"""
feature_num = len(x[0])
base_entropy = self.compute_entropy(x, y)
        best_info_gain_ratio = 0.0
best_feature_idx = -1
for i in range(feature_num):
unique_features = set([example[i] for example in x])
new_entropy, split_entropy = 0.0, 0.0
for feature in unique_features:
sub_dataset, sub_labels = [], []
for featVec, label in zip(x, y):
if featVec[i] == feature:
sub_dataset.append(list(featVec[:i]) + list(featVec[i + 1:]))
sub_labels.append(label)
prob = len(sub_dataset) / float(len(x))
new_entropy += prob * self.compute_entropy(sub_dataset, sub_labels)
split_entropy += -prob * log(prob, 2)
info_gain = base_entropy - new_entropy
info_gain_ratio = info_gain / split_entropy if split_entropy else 0.0
if info_gain_ratio > best_info_gain_ratio:
best_info_gain_ratio = info_gain_ratio
best_feature_idx = i
return best_feature_idx
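    # C4.5 criterion: GainRatio(S, A) = Gain(S, A) / SplitInfo(S, A), with
    # SplitInfo(S, A) = -sum_v (|S_v| / |S|) * log2(|S_v| / |S|); the ratio penalises
    # features with many distinct values, which plain information gain tends to favour.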
def create_tree_by_id3_and_c45(self, x, y, feature_list=None):
"""
创建决策树
:param x:
:param y:
:param feature_list:
:return:
"""
        # all samples belong to the same class, so stop splitting
if len(set(y)) <= 1:
return y[0]
        # only one feature column is left, so return the most frequent label
if len(x[0]) == 1:
            return Counter(y).most_common(1)[0][0]
        feature_list = list(range(len(x[0]))) if not feature_list else feature_list
if self.decision_tree_type == 'ID3':
best_feature_idx = self.dataset_split_by_id3(x, y)
elif self.decision_tree_type == 'C45':
best_feature_idx = self.dataset_split_by_c45(x, y)
else:
            raise KeyError('unsupported decision_tree_type: %s' % self.decision_tree_type)
        best_feature = feature_list[int(best_feature_idx)]  # name of the best feature
decision_tree = {best_feature: {}}
        # drop the chosen feature from the name list without mutating the caller's list
        feature_list = feature_list[:best_feature_idx] + feature_list[best_feature_idx + 1:]
        # all values that the best feature takes in the current dataset
best_feature_values = set([sample[best_feature_idx] for sample in x])
for value in best_feature_values:
sub_dataset, sub_labels = [], []
for featVec, label in zip(x, y):
if featVec[best_feature_idx] == value:
sub_dataset.append(list(featVec[:best_feature_idx]) + list(featVec[best_feature_idx + 1:]))
sub_labels.append(label)
decision_tree[best_feature][value] = self.create_tree_by_id3_and_c45(sub_dataset, sub_labels, feature_list)
return decision_tree
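    # The tree is returned as nested dicts keyed by feature name and feature value,
    # e.g. (hypothetical) {'outlook': {'sunny': 'no', 'overcast': 'yes', 'rainy': {...}}};
    # leaves are plain class labels.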
@staticmethod
def compute_gini(x, y):
"""
计算数据集x的基尼指数
:param x: 数据集
:param y: 数据集对应的类别标签
:return: 该数据集的gini指数
"""
unique_labels = set(y)
        sample_num = len(x)  # total number of samples
gini = 1.0
for label in unique_labels:
            gini_k = len(x[y == label]) / sample_num  # relative frequency of this class in y
gini -= gini_k ** 2
return gini
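    # Worked example (hypothetical data): for labels ['yes', 'yes', 'no', 'no'],
    # Gini = 1 - (2/4)**2 - (2/4)**2 = 0.5; a pure node has Gini = 0.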
def dataset_split_by_cart(self, x, y):
"""
选择最好的特征划分数据集,即返回最佳特征下标
:param x:
:param y:
:return:
"""
sample_num, feature_num = x.shape
        column_feature_gini = {}  # records the weighted Gini index Gini(D, A) for every (column, value) pair
        for i in range(feature_num):  # iterate over every feature column
            column_i = dict(Counter(x[:, i]))  # count how often each value appears in this column
            for value in column_i.keys():  # loop over the values of this column and compute the weighted Gini of the binary split
                # In some columns a value can line up exactly with the label (e.g. x == 'yes'
                # whenever y == 'yes'); splitting on either value then yields the same Gini.
                # To prefer pure splits in such ties, scale every candidate's Gini by a factor
                # slightly larger than 1, and reset the factor to 1 when the split is pure (Gini == 0).
                best_flag = 1.00001
cls_same, cls_diff = x[:, i] == value, x[:, i] != value
sub_x1, sub_y1 = x[cls_same], y[cls_same]
sub_x2, sub_y2 = x[cls_diff], y[cls_diff]
sublen1, sublen2 = len(sub_x1), len(sub_x2)
                # check whether this split produces a pure partition (Gini == 0)
if (sublen1 / sample_num) * self.compute_gini(sub_x1, sub_y1) == 0:
best_flag = 1
feaGini = (sublen1 / sample_num) * self.compute_gini(sub_x1, sub_y1) + \
(sublen2 / sample_num) * self.compute_gini(sub_x2, sub_y2)
column_feature_gini[(i, value)] = feaGini * best_flag
        # the (column index, value) pair with the smallest Gini index
best_feature_and_idx = min(column_feature_gini, key=column_feature_gini.get)
return best_feature_and_idx, column_feature_gini
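    # CART criterion for a binary split on feature A and value a:
    # Gini(D, A=a) = |D1|/|D| * Gini(D1) + |D2|/|D| * Gini(D2),
    # where D1 holds the samples with A == a and D2 the rest; the (column, value)
    # pair minimising this quantity is returned together with all candidate values.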
def create_tree_by_cart(self, x, y, feature_list=None):
"""
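        # The body of this method is not present in the source; the code below is a minimal
        # reconstruction sketch (an assumption, not the original implementation) that reuses
        # dataset_split_by_cart and grows binary branches of the form "== value" / "!= value".
        x, y = np.asarray(x), np.asarray(y)
        # all samples share the same label: return it as a leaf
        if len(set(y.tolist())) <= 1:
            return y[0]
        # no feature columns left: return the majority label
        if x.shape[1] == 0:
            return Counter(y.tolist()).most_common(1)[0][0]
        feature_list = list(range(x.shape[1])) if not feature_list else list(feature_list)
        (best_feature_idx, best_value), _ = self.dataset_split_by_cart(x, y)
        best_feature = feature_list[best_feature_idx]
        decision_tree = {best_feature: {}}
        mask = x[:, best_feature_idx] == best_value
        sub_features = feature_list[:best_feature_idx] + feature_list[best_feature_idx + 1:]
        for branch, branch_mask in (('== %s' % best_value, mask), ('!= %s' % best_value, ~mask)):
            sub_y = y[branch_mask]
            if sub_y.size == 0:
                # empty branch: fall back to the majority label of the parent node
                decision_tree[best_feature][branch] = Counter(y.tolist()).most_common(1)[0][0]
                continue
            sub_x = np.delete(x[branch_mask], best_feature_idx, axis=1)
            decision_tree[best_feature][branch] = self.create_tree_by_cart(sub_x, sub_y, sub_features)
        return decision_tree


if __name__ == '__main__':
    # Tiny illustrative dataset (hypothetical, not from the original source): two binary
    # features ('no surfacing', 'flippers') and a yes/no label.
    samples = [['1', '1'], ['1', '1'], ['1', '0'], ['0', '1'], ['0', '1']]
    labels = ['yes', 'yes', 'no', 'no', 'no']
    id3_tree = DecisionTree(decision_tree_type='ID3').create_tree_by_id3_and_c45(
        samples, labels, feature_list=['no surfacing', 'flippers'])
    print(id3_tree)
    cart_tree = DecisionTree(decision_tree_type='CART').create_tree_by_cart(
        np.array(samples), np.array(labels), feature_list=['no surfacing', 'flippers'])
    print(cart_tree)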