随机森林（RandomForest）算法Python代码实现_随机森林python代码资源-CSDN文库

共3个文件

py：2个

ipynb：1个

随机森林

python

numpy

Bagging

174 浏览量 2024-11-13 18:10:27 上传评论收藏 6KB ZIP 举报

资源推荐

资源详情

资源评论

收起资源包目录

random_forest.zip （3个子文件）

utils.py 738B

random_forest.ipynb 12KB

cart.py 7KB

import numpy as np from utils import feature_split, calculate_gini ### 定义树结点 class TreeNode(): def __init__(self, feature_i=None, threshold=None, leaf_value=None, left_branch=None, right_branch=None): # 特征索引 self.feature_i = feature_i # 特征划分阈值 self.threshold = threshold # 叶子节点取值 self.leaf_value = leaf_value # 左子树 self.left_branch = left_branch # 右子树 self.right_branch = right_branch ### 定义二叉决策树 class BinaryDecisionTree(object): ### 决策树初始参数 def __init__(self, min_samples_split=2, min_gini_impurity=999, max_depth=float("inf"), loss=None): # 根结点 self.root = None # 节点最小分裂样本数 self.min_samples_split = min_samples_split # 节点初始化基尼不纯度 self.min_gini_impurity = min_gini_impurity # 树最大深度 self.max_depth = max_depth # 基尼不纯度计算函数 self.gini_impurity_calculation = None # 叶子节点值预测函数 self._leaf_value_calculation = None # 损失函数 self.loss = loss ### 决策树拟合函数 def fit(self, X, y, loss=None): # 递归构建决策树 self.root = self._build_tree(X, y) self.loss = None ### 决策树构建函数 def _build_tree(self, X, y, current_depth=0): # 初始化最小基尼不纯度 init_gini_impurity = 999 # 初始化最佳特征索引和阈值 best_criteria = None # 初始化数据子集 best_sets = None if len(np.shape(y)) == 1: y = np.expand_dims(y, axis=1) # 合并输入和标签 Xy = np.concatenate((X, y), axis=1) # 获取样本数和特征数 n_samples, n_features = X.shape # 设定决策树构建条件 # 训练样本数量大于节点最小分裂样本数且当前树深度小于最大深度 if n_samples >= self.min_samples_split and current_depth <= self.max_depth: # 遍历计算每个特征的基尼不纯度 for feature_i in range(n_features): # 获取第i特征的所有取值 feature_values = np.expand_dims(X[:, feature_i], axis=1) # 获取第i个特征的唯一取值 unique_values = np.unique(feature_values) # 遍历取值并寻找最佳特征分裂阈值 for threshold in unique_values: # 特征节点二叉分裂 Xy1, Xy2 = feature_split(Xy, feature_i, threshold) # 如果分裂后的子集大小都不为0 if len(Xy1) > 0 and len(Xy2) > 0: # 获取两个子集的标签值 y1 = Xy1[:, n_features:] y2 = Xy2[:, n_features:] # 计算基尼不纯度 impurity = self.impurity_calculation(y, y1, y2) # 获取最小基尼不纯度 # 最佳特征索引和分裂阈值 if impurity < init_gini_impurity: init_gini_impurity = impurity best_criteria = {"feature_i": feature_i, "threshold": threshold} best_sets = { "leftX": Xy1[:, :n_features], "lefty": Xy1[:, n_features:], "rightX": Xy2[:, :n_features], "righty": Xy2[:, n_features:] } # 如果计算的最小不纯度小于设定的最小不纯度 if init_gini_impurity < self.min_gini_impurity: # 分别构建左右子树 left_branch = self._build_tree(best_sets["leftX"], best_sets["lefty"], current_depth + 1) right_branch = self._build_tree(best_sets["rightX"], best_sets["righty"], current_depth + 1) return TreeNode(feature_i=best_criteria["feature_i"], threshold=best_criteria["threshold"], left_branch=left_branch, right_branch=right_branch) # 计算叶子计算取值 leaf_value = self._leaf_value_calculation(y) return TreeNode(leaf_value=leaf_value) ### 定义二叉树值预测函数 def predict_value(self, x, tree=None): if tree is None: tree = self.root # 如果叶子节点已有值，则直接返回已有值 if tree.leaf_value is not None: return tree.leaf_value # 选择特征并获取特征值 feature_value = x[tree.feature_i] # 判断落入左子树还是右子树 branch = tree.right_branch if isinstance(feature_value, int) or isinstance(feature_value, float): if feature_value >= tree.threshold: branch = tree.left_branch elif feature_value == tree.threshold: branch = tree.right_branch # 测试子集 return self.predict_value(x, branch) ### 数据集预测函数 def predict(self, X): y_pred = [self.predict_value(sample) for sample in X] return y_pred class ClassificationTree(BinaryDecisionTree): ### 定义基尼不纯度计算过程 def _calculate_gini_impurity(self, y, y1, y2): p = len(y1) / len(y) gini = calculate_gini(y) gini_impurity = p * calculate_gini(y1) + (1-p) * calculate_gini(y2) return gini_impurity ### 多数投票 def _majority_vote(self, y): most_common = None max_count = 0 for label in np.unique(y): # 统计多数 count = len(y[y == label]) if count > max_count: most_common = label max_count = count return most_common # 分类树拟合 def fit(self, X, y): self.impurity_calculation = self._calculate_gini_impurity self._leaf_value_calculation = self._majority_vote super(ClassificationTree, self).fit(X, y) ### CART回归树 class RegressionTree(BinaryDecisionTree): # 计算方差减少量 def _calculate_variance_reduction(self, y, y1, y2): var_tot = np.var(y, axis=0) var_y1 = np.var(y1, axis=0) var_y2 = np.var(y2, axis=0) frac_1 = len(y1) / len(y) frac_2 = len(y2) / len(y) # 计算方差减少量 variance_reduction = var_tot - (frac_1 * var_y1 + frac_2 * var_y2) return sum(variance_reduction) # 节点值取平均 def _mean_of_y(self, y): value = np.mean(y, axis=0) return value if len(value) > 1 else value[0] # 回归树拟合 def fit(self, X, y): self.impurity_calculation = self._calculate_variance_reduction self._leaf_value_calculation = self._mean_of_y super(RegressionTree, self).fit(X, y)

评论收藏

内容反馈