function tree = BuildC45Tree(features, targets, inc_node, discrete_dim, maxNbin)
% Input:
%   features:     Ni*L matrix, Ni feature dimensions and L samples (one column per sample)
%   targets:      1*L vector of class labels; Uc = unique(targets) are the classes
%   inc_node:     allowed number of incorrectly classified samples; a node with fewer
%                 than inc_node samples is not split any further
%   discrete_dim: 1*Ni vector, number of discrete values in each dimension
%                 (0 marks a continuous feature)
%   maxNbin:      max(discrete_dim), maximum number of bins over all dimensions
% Output:
%   tree:         C4.5 decision tree (struct with fields dim, split_loc and child)
%% Step 0-Get the size
[Ni, L] = size(features);
Uc = unique(targets);
% Default values: dim == 0 and split_loc == inf mark a node that has not been split
tree.dim = 0;
tree.split_loc = inf;
if isempty(features),
return;
end
%% Step 1-Stop condition: too few samples, a single sample, or a single class
if ((inc_node > L) || (L == 1) || (length(Uc) == 1)),
% Make this node a leaf: tree.dim stays 0 and tree.child holds the majority class label
H = zeros(1, length(Uc));
for c = 1:length(Uc),
H(c) = sum(targets == Uc(c));
end
[m, largest] = max(H);
tree.child = Uc(largest);
return;
end
%% Step 2-Otherwise: use the C4.5 criterion to choose the best feature to split on
% 2-1 Compute the entropy of the class distribution at this node
Pnode = zeros(1, length(Uc));
for i = 1:length(Uc),
Pnode(i) = length(find(targets == Uc(i))) / L;
end
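% Inode is the entropy of the class distribution at this node,
% Inode = -sum_i Pnode(i)*log2(Pnode(i)); every Pnode(i) > 0 since Uc = unique(targets).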
Inode = -sum(Pnode.*log(Pnode)/log(2));
% 2-2 For each dimension, compute the splitting criterion: the gain ratio for a
%     discrete feature, or the score of the best binary split for a continuous feature
delta_Ib = zeros(1, Ni);
split_loc = ones(1, Ni)*inf;
for i = 1:Ni,
data = features(i,:);
Nbins = length(unique(data));
if (discrete_dim(i)),
% This is a discrete feature: build the class-by-bin count table using the
% values actually taken by this feature in this node (not assumed to be 1..Nbins)
bins = unique(data);
P = zeros(length(Uc), Nbins);
for j = 1:length(Uc),
for k = 1:Nbins,
indices = find((targets == Uc(j)) & (features(i,:) == bins(k)));
P(j,k) = length(indices);
end
end
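% P(j,k) now counts the samples of class Uc(j) whose i-th feature equals bins(k).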
Pk = sum(P);
P = P/L;
Pk = Pk/sum(Pk);
info = sum(-P.*log(eps+P)/log(2));
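% Gain-ratio style score for this discrete feature: the numerator is Inode minus the
% weighted sum of the per-bin terms in "info" (computed from the frequencies in P),
% and the denominator is the split information -sum_k Pk(k)*log2(Pk(k)).
% Larger values indicate a more informative split.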
delta_Ib(i) = (Inode - sum(Pk.*info)) / (-sum(Pk.*log(eps+Pk)/log(2)));
else
% This is a continuous feature
P = zeros(length(Uc), 2);
% Sort the features
[sorted_data, indices] = sort(data);
sorted_targets = targets(indices);
% Calculate the information for each possible split
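% Each position j splits the sorted samples into the first j and the remaining L-j;
% P(:,1) and P(:,2) hold the class counts on the left and right side of that split.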
I = zeros(1, L-1);
for j = 1:L-1,
for k =1:length(Uc),
P(k,1) = length(find(sorted_targets(1:j) == Uc(k)));
P(k,2) = length(find(sorted_targets(j+1:end) == Uc(k)));
end
Ps = sum(P)/L;
P = P/L;
info = sum(-P.*log(eps+P)/log(2));
I(j) = Inode - sum(info.*Ps);
end
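% Keep the best binary split for this continuous feature: its score goes into
% delta_Ib(i) and the corresponding threshold (an observed feature value) is
% stored in split_loc(i) for use when the node is actually split below.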
[delta_Ib(i), s] = max(I);
split_loc(i) = sorted_data(s);
end
end
% 2-3 Find the dimension maximizing delta_Ib (the best feature to split on)
[m, dim] = max(delta_Ib);
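% All dimensions are passed on to the children: C4.5 may reuse a feature deeper in the tree.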
dims = 1:Ni;
tree.dim = dim;
% 2-4 Split along the 'dim' dimension
Nf = unique(features(dim,:));
Nbins = length(Nf);
if (discrete_dim(dim)),
%Discrete feature
for i = 1:Nbins,
indices = find(features(dim, :) == Nf(i));
tree.child(i) = BuildC45Tree(features(dims, indices), targets(indices), inc_node, discrete_dim(dims), maxNbin);
end
else
% Continuous feature: binary split at the chosen threshold
tree.split_loc = split_loc(dim);
indices1 = find(features(dim,:) <= split_loc(dim));
indices2 = find(features(dim,:) > split_loc(dim));
tree.child(1) = BuildC45Tree(features(dims, indices1), targets(indices1), inc_node, discrete_dim(dims), maxNbin);
tree.child(2) = BuildC45Tree(features(dims, indices2), targets(indices2), inc_node, discrete_dim(dims), maxNbin);
end
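The lines below are a minimal usage sketch, not part of the original package: the data, variable names and parameter values are illustrative assumptions (the package's own main.m is the authoritative driver). They assume integer class labels, discrete features coded by their discrete values, and discrete_dim(i) = 0 marking a continuous dimension, as described in the header comments above.

% Illustrative call (hypothetical data; put this in a separate script, not in BuildC45Tree.m):
features = [5.1 4.9 6.3 5.8 6.0 5.0;   % dimension 1: continuous measurement
            1   2   2   1   2   1 ];   % dimension 2: discrete, values 1..2
targets  = [1 1 2 2 2 1];              % class labels of the 6 samples
discrete_dim = [0 2];                  % 0 = continuous, 2 = two discrete values
inc_node = 1;                          % do not split nodes with fewer samples than this
tree = BuildC45Tree(features, targets, inc_node, discrete_dim, max(discrete_dim));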
Computer vision - decision tree learning, MATLAB source code.

Compared with other data mining algorithms, decision trees have several advantages:
(1) Easy to understand and interpret: the meaning of a decision tree is readily grasped.
(2) Little data preparation is required, whereas other techniques often need data normalization.
(3) They handle both numerical and categorical data, while many other techniques handle only one type; for example, association rules work only with categorical data and neural networks only with numerical data.
(4) They are white-box models: the output is easy to explain from the structure of the model, whereas a neural network is a black-box model whose output is hard to interpret.
(5) Model performance can be validated on a test set, so the stability of the model can be assessed.
(6) Robustness: they handle noisy data well.
(7) They cope well with large-scale data.

Disadvantages:
(1) Training an optimal decision tree is an NP-complete problem, so in practice training relies on heuristic search (e.g., greedy algorithms) that reaches only a local optimum; such algorithms cannot produce the optimal tree.
(2) An overly complex tree predicts data outside the training set poorly; this is called overfitting, and pruning mechanisms can avoid it.
(3) Some problems, such as XOR, are not solved well by decision trees; the tree becomes very large when tackling them.
Package contents: 4、决策树学习.zip (8 files), top-level folder 4、决策树学习
main.m 1KB
ID3_Results.png 6KB
flower_test.png 253KB
BuildC45Tree.m 3KB
UseID3Tree.m 1KB
UseC45Tree.m 1KB
flower_mask.png 1KB
C45_Results.png 6KB