#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 23 13:21:21 2023
@author: yaoji
"""
# ID3决策树分类.py
from typing import Optional
from numpy import ndarray, unique, array
from numpy.random import choice
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
import pandas as pd
from 决策树工具函数 import entropy, purity, majority, \
splitDataset_forDiscreteFeature,\
biPartitionDataset_forContinuousFeature
from graphviz import Digraph
from typing import List
class ID3decisionTree:
"""ID3决策树分类"""
def __init__(self,
minSamplesLeaf: int = 1, # 超参数:叶结点的最少样本数量
maxDepth: int = 7, # 超参数:最大树深
maxPruity: float = 1., # 超参数:叶结点的最大纯度
maxFeatures: Optional[int] = None, # 超参数:最大特征数
α: float = 0., # 超参数:代价复杂度剪枝的惩罚参数
):
assert type(minSamplesLeaf)==int and minSamplesLeaf>0, '叶结点的最少样本数量minSamplesLeaf应为正整数'
assert type(maxDepth)==int and maxDepth>0, '最大树深maxDepth应为正整数'
assert 0<maxPruity<=1, '叶结点的最大纯度maxDepth的取值范围应为(0, 1]'
assert (maxFeatures is None) or (type(maxFeatures)==int and maxFeatures>0), '最大特征数maxFeatures应为正整数,或不给定maxFeatures'
assert α>=0, '代价复杂度剪枝的惩罚参数α应为非负数'
self.minSamplesLeaf = minSamplesLeaf # 超参数:叶结点的最少样本数量
self.maxDepth = maxDepth # 超参数:最大树深
self.maxPruity = maxPruity # 超参数:叶结点的最大纯度
self.maxFeatures = maxFeatures # 超参数:最大特征数
self.α = α # 超参数:代价复杂度剪枝的惩罚参数
self.M = None # 输入特征向量的维数
self.tree = None # 字典:树
self.indexContinuousFeatures_ = None # 数组索引:连续值特征的序号
self.feature_names = feature_names # 特征名称
self.info_gain_dict = {} # 信息增益字典
def fit(self, X__: ndarray, y_: ndarray,
        indexContinuousFeatures_=(),  # array indices of the continuous-valued features
        ):
    """Train: grow the ID3 decision tree from sample matrix X__ and label vector y_."""
    # Input validation (assert messages kept identical to the project's wording).
    assert type(X__)==ndarray and X__.ndim==2, '输入训练样本矩阵X__应为2维ndarray'
    assert type(y_)==ndarray and y_.ndim==1, '输入训练样本标签y_应为1维ndarray'
    assert len(X__)==len(y_), '输入训练样本数量应等于标签数量'
    # Remember which feature columns are continuous-valued.
    self.indexContinuousFeatures_ = tuple(indexContinuousFeatures_)
    # Dimensionality of the input feature vectors.
    self.M = X__.shape[1]
    if self.indexContinuousFeatures_:
        print(f'给定第{self.indexContinuousFeatures_}特征为连续值特征')
    if self.maxFeatures is None:
        # No feature budget given: default to using every feature.
        self.maxFeatures = self.M
    assert self.maxFeatures<=self.M, f'最大特征数maxFeatures不应大于输入特征向量的维数{self.M},当前 maxFeatures = {self.maxFeatures}'
    # Every feature index starts out as a split candidate.
    all_feature_indices = array(range(self.M))
    # Grow the tree recursively and keep it on the estimator.
    self.tree = self.createTree(X__, y_, indexFeatureCandidates_=all_feature_indices)
    return self
def createTree(self,
X__: ndarray,
y_: ndarray,
indexFeatureCandidates_: ndarray, # 数组索引:所有候选的特征的序号
depth: int = 0, # 当前树深
) -> dict:
"""递归地生成ID3分类树"""
'''
结点的结构,字典:
{
'class label' : 结点的类别标签,
'number of samples' : 结点的样本数量,
'entropy' : 结点的熵值,
'purity': 结点的纯度,
'index of splitting feature' : 划分特征的序号,
'partition point' : 划分点,
'child nodes' : {'划分特征的取值1' : 子结点,
'划分特征的取值2' : 子结点,
...
}
}
注:
当结点的划分特征为离散值特征时,无'partition point'键值对;
当结点的划分特征为离散值特征时,'划分特征的取值1'、'划分特征的取值2'、'划分特征的取值3'、……等,分别为划分特征的各个离散取值;
当结点的划分特征为连续值特征时,'划分特征的取值1'和'划分特征的取值2'分别为'+'和'-',分别指示划分特征取值大于、不大于划分点的子结点;
当结点为叶结点时,无'child nodes'、'index of splitting feature'、'partition point'等键值对。
'''
N = len(X__) # 结点样本数量
p = purity(y_) # 结点纯度
"""创建结点"""
node = {'class label': majority(y_), # 记录:结点类别
'number of samples': N, # 记录:结点样本数量
'entropy': entropy(y_), # 记录:结点熵值
'purity': p, # 记录:结点纯度
'class': y_[0] # 记录:结点类别信息
}
if (len(set(y_))==1 or # 若该结点是纯的(所有类别标签都相同),则返回叶结点
len(indexFeatureCandidates_)==0 or # 若所有特征已被用于划分,则返回叶结点
depth>=self.maxDepth or # 若达到最大树深,则返回叶结点
N<=self.minSamplesLeaf or # 若达到叶结点的最少样本数量,则返回叶结点
p>=self.maxPruity or # 若达到叶结点的最大纯度,则返回叶结点
len(unique(X__, axis=0))==1 # 若所有样本特征向量都相同,则返回叶结点
):
return node
"""选择最优的划分特征"""
result = self.chooseBestFeature(X__, y_, indexFeatureCandidates_)
if type(result)==tuple:
# 若result是一个元组,则选择了一个连续值特征序号和对应的最优划分点
mBest, tBest = result[0], result[1]
else:
# 若result是一个数,则选择了一个离散值特征序号
mBest = result
"""创建子结点"""
node['index of splitting feature'] = mBest # 记录:最优划分特征的序号
depth += 1 # 树深+1
if mBest in self.indexContinuousFeatures_:
# 若第mBest特征是连续值特征,则按最优划分点tBest,二分样本集
node['partition point'] = tBest # 记录:最优划分点
node['child nodes'] = {} # 初始化子结点
Xhigher__, Xlower__, yHigher_, yLower_ = \
biPartitionDataset_forContinuousFeature(X__=X__, y_=y_, m=mBest, t=tBest) # 二分样本集
node['child nodes']['+'] = \
self.createTree(Xhigher__, yHigher_, indexFeatureCandidates_, depth) # 创建第mBest特征取值大于tBest的子结点
node['child nodes']['-'] = \
self.createTree(Xlower__, yLower_, indexFeatureCandidates_, depth) # 创建第mBest特征取值不大于tBest的子结点
else:
# 若第mBest特征是离散值特征,则按第mBest特征的所有离散取值划分样本集