机器学习-决策树（以西瓜数据集为例）_决策树西瓜数据集,西瓜集决策树资源-CSDN文库

共4个文件

py：2个

zip：1个

pyc：1个

版权申诉

机器学习

数据集

西瓜数据集

决策树算法

5星 · 超过95%的资源 110 浏览量 2023-03-30 09:22:23 上传评论 4 收藏 13KB ZIP 举报

资源推荐

资源详情

资源评论

收起资源包目录

机器学习——决策树（以西瓜数据集为例）.zip （4个子文件）

机器学习——决策树（以西瓜数据集为例）

2022.9.19机器学习作业01_决策树

Dataset.py 3KB

基于信息熵划分选择的决策树算法.py 11KB

__pycache__

Dataset.cpython-39.pyc 2KB

2022.9.19机器学习作业01_决策树.zip 6KB

# 昵称:XieXu # 时间: 2022/9/20/0020 20:23 # 编程实现基于信息熵进行划分选择的决策树算法， # 并为表4.3中数据生成一颗决策树。 # ID3决策树的Python程序实现如下所示 # 用一个字符串来对决策树的每一个结点的具体信息进行了描述。 # 程序最后打印输出了整个决策树的字符串描述。 import math import Dataset class TreeNode: """ 决策树结点类 """ current_index = 0 def __init__(self, parent=None, attr_name=None, children=None, judge=None, split=None, data_index=None, attr_value=None, rest_attribute=None): """ 决策树结点类初始化方法 :param parent: 父节点 """ self.parent = parent # 父节点，根节点的父节点为 None self.attribute_name = attr_name # 本节点上进行划分的属性名 self.attribute_value = attr_value # 本节点上划分属性的值，是与父节点的划分属性名相对应的 self.children = children # 孩子结点列表 self.judge = judge # 如果是叶子结点，需要给出判断 self.split = split # 如果是使用连续属性进行划分，需要给出分割点 self.data_index = data_index # 对应训练数据集的训练索引号 self.index = TreeNode.current_index # 当前结点的索引号，方便输出时查看 self.rest_attribute = rest_attribute # 尚未使用的属性列表 TreeNode.current_index += 1 def to_string(self): """用一个字符串来描述当前结点信息""" this_string = 'current index : ' + str(self.index) + ";\n" if not (self.parent is None): parent_node = self.parent this_string = this_string + 'parent index : ' + str(parent_node.index) + ";\n" this_string = this_string + str(parent_node.attribute_name) + " : " + str(self.attribute_value) + ";\n" this_string = this_string + "data : " + str(self.data_index) + ";\n" if not (self.children is None): this_string = this_string + 'select attribute is : ' + str(self.attribute_name) + ";\n" child_list = [] for child in self.children: child_list.append(child.index) this_string = this_string + 'children : ' + str(child_list) if not (self.judge is None): this_string = this_string + 'label : ' + self.judge return this_string def is_number(s): """判断一个字符串是否为数字""" try: float(s) return True except ValueError: pass return False def ent(labels): """ 样本集合的信息熵 :param labels: 样本集合中数据的类别标签 :return: """ label_name = [] label_count = [] for item in labels: if not (item in label_name): label_name.append(item) label_count.append(1) else: index = label_name.index(item) label_count[index] = label_count[index] + 1 n = sum(label_count) entropy = 0.0 for item in label_count: p = item / n entropy = entropy - p * math.log(p, 2) return entropy def gain(attribute, labels, is_value=False): """ 计算信息增益 :param attribute: 集合中样本该属性的值列表 :param labels: 集合中样本的数据标签 :return: """ # is_value = False # 当前属性是离散的形容词还是连续的数值 info_gain = ent(labels) n = len(labels) split_value = None # 如果是连续值的话，也需要返回分隔界限的值 if is_value: # print('attribute', attribute) # 属性值是连续的数值，首先应该使用二分法寻找最佳分割点 sorted_attribute = attribute.copy() sorted_attribute.sort() split = [] # 候选的分隔点 for i in range(0, n - 1): temp = (sorted_attribute[i] + sorted_attribute[i + 1]) / 2 split.append(temp) info_gain_list = [] # print('split', split) for temp_split in split: low_labels = [] high_labels = [] for i in range(0, n): if attribute[i] <= temp_split: low_labels.append(labels[i]) else: high_labels.append(labels[i]) temp_gain = info_gain - len(low_labels) / n * ent(low_labels) - len(high_labels) / n * ent(high_labels) info_gain_list.append(temp_gain) # print('info_gain_list', info_gain_list) info_gain = max(info_gain_list) max_index = info_gain_list.index(info_gain) split_value = split[max_index] else: # 属性值是离散的值 attribute_dict = {} label_dict = {} index = 0 for item in attribute: if attribute_dict.__contains__(item): attribute_dict[item] = attribute_dict[item] + 1 label_dict[item].append(labels[index]) else: attribute_dict[item] = 1 label_dict[item] = [labels[index]] index += 1 for key, value in attribute_dict.items(): info_gain = info_gain - value / n * ent(label_dict[key]) return info_gain, split_value def finish_node(current_node, data, label): """ 完成当前结点的后续计算，包括选择属性，划分子节点等 :param current_node: 当前的结点 :param data: 数据集 :param label: 数据集的 label :param rest_title: 剩余的可用属性名 :return: """ n = len(label) # 判断当前结点的数据是否属于同一类，如果是，直接标记为叶子结点并返回 one_class = True this_data_index = current_node.data_index for i in this_data_index: for j in this_data_index: if label[i] != label[j]: one_class = False break if not one_class: break if one_class: current_node.judge = label[this_data_index[0]] return rest_title = current_node.rest_attribute # 候选属性 if len(rest_title) == 0: # 如果候选属性为空，则是个叶子结点。需要选择最多的那个类作为该结点的类 label_count = {} temp_data = current_node.data_index for index in temp_data: if label_count.__contains__(label[index]): label_count[label[index]] += 1 else: label_count[label[index]] = 1 final_label = max(label_count) current_node.judge = final_label return title_gain = {} # 记录每个属性的信息增益 title_split_value = {} # 记录每个属性的分隔值，如果是连续属性则为分隔值，如果是离散属性则为None for title in rest_title: attr_values = [] current_label = [] for index in current_node.data_index: this_data = data[index] attr_values.append(this_data[title]) current_label.append(label[index]) temp_data = data[0] this_gain, this_split_value = gain(attr_values, current_label, is_number(temp_data[title])) # 如果属性值为数字，则认为是连续的 title_gain[title] = this_gain title_split_value[title] = this_split_value best_attr = max(title_gain, key=title_gain.get) # 信息增益最大的属性名 current_node.attribute_name = best_attr current_node.split = title_split_value[best_attr] rest_title.remove(best_attr) a_data = data[0] if is_number(a_data[best_attr]): # 如果是该属性的值为连续数值 split_value = title_split_val

评论收藏

内容反馈

版权申诉