% Data preprocessing: load the watermelon 2.0 CSV into the workspace.
% NOTE(review): path is machine-specific; `uiopen` imports the file as the
% workspace variable `watermelon2` (a cell array, header row included).
uiopen('C:\Users\wscyx\Desktop\机器学习与模式识别\watermelon2.csv',1)
size_data = size(watermelon2); % watermelon2 is the imported workspace variable
dataset = watermelon2(2:size_data(1),:); % data rows only (header row stripped)
labels = watermelon2(1,1:size_data(2)-1); % attribute names (class column excluded)
% Build the decision tree with ID3, flatten it for plotting, then render it.
mytree = ID3(dataset,labels);
[nodeids,nodevalue,branchvalue] = print_tree(mytree);
%tree_plot(nodeids(1:4),nodevalue(1:4),branchvalue(1:4));% plot level 1 only
%tree_plot(nodeids(1:9),nodevalue(1:9),branchvalue(1:9));% plot levels 1-2
%tree_plot(nodeids(1:11),nodevalue(1:11),branchvalue(1:11));% plot levels 1-3
%tree_plot(nodeids(1:13),nodevalue(1:13),branchvalue(1:12));% plot levels 1-4 (full)
tree_plot(nodeids,nodevalue,branchvalue);% plot the complete tree
function subDataset = splitDataset(dataset,axis,value)
% Return the samples whose attribute in column `axis` equals `value`,
% with that attribute column removed from the result.
%   dataset : cell array, one sample per row
%   axis    : column index of the attribute to split on
%   value   : attribute value to match (char/string)
subDataset = {};
numRows = size(dataset,1);
numCols = size(dataset,2);
keepCols = [1:axis-1, axis+1:numCols]; % every column except the split one
for r = 1:numRows
    if string(dataset(r,axis)) == string(value)
        subDataset = [subDataset; dataset(r,keepCols)]; %#ok<AGROW>
    end
end
end
function myTree = ID3(dataset,labels)
% ID3: recursively build a decision tree.
%   dataset : cell array, one sample per row; last column is the class label
%   labels  : cell array of attribute names, one per non-class column
% Returns a containers.Map for an internal node (attribute name -> Map of
% branch value -> subtree), or a char vector for a leaf (class label).
if isempty(dataset)
    error('必须提供数据!')
end
size_data = size(dataset);
if (size_data(2)-1)~=length(labels)
    error('属性数量与数据集不一致!')
end
classList = dataset(:,size_data(2)); % class label of every sample
% All samples share one class: emit a leaf with that class.
if length(unique(classList))==1
    myTree = char(classList(1));
    return
end
% Attribute set exhausted: fall back to the majority class.
% (Fixes the original, which arbitrarily returned the first sample's class.)
if size_data(2) == 1
    myTree = majorityClass(classList);
    return
end
bestFeature = chooseFeature(dataset);
bestFeatureLabel = char(labels(bestFeature));
myTree = containers.Map;
leaf = containers.Map;
featValues = dataset(:,bestFeature);
uniqueVals = unique(featValues);
labels=[labels(1:bestFeature-1) labels(bestFeature+1:length(labels))]; % drop the chosen attribute
% One branch per observed value of the chosen attribute.
for i=1:length(uniqueVals)
    subLabels = labels(:)';
    value = char(uniqueVals(i));
    subdata = splitDataset(dataset,bestFeature,value);
    leaf(value) = ID3(subdata,subLabels);
end
myTree(bestFeatureLabel) = leaf; % loop-invariant: assign once, after the loop
end

function cls = majorityClass(classList)
% Most frequent class label in classList (cell array of char vectors).
[classes,~,idx] = unique(string(classList));
counts = accumarray(idx,1);
[~,k] = max(counts);
cls = char(classes(k));
end
function shannonEnt = calShannonEnt(dataset)
% Shannon entropy (base 2) of the class labels, i.e. the last column of dataset.
numEntries = size(dataset,1);
classCol = string(dataset(:,size(dataset,2)));
% Group identical labels and count each group in one shot.
[~,~,grp] = unique(classCol);
counts = accumarray(grp,1);
probs = counts ./ numEntries;
% log(p)/log(2) kept (rather than log2) to match the original bit-for-bit.
shannonEnt = -sum(probs .* (log(probs)./log(2)));
end
function bestFeature=chooseFeature(dataset,~)
% Pick the feature (column index) whose split minimises the conditional
% entropy of the class labels — equivalently, maximises information gain.
%   dataset : cell array; last column is the class label
% Returns the 1-based column index of the best feature.
data_size = size(dataset);
numFeatures = data_size(2) - 1;
% Inf, not 2.0: class entropy exceeds 2 bits once there are more than 4
% classes, which would have left bestFeature at 0 and crashed the caller.
minEntropy = Inf;
bestFeature = 0;
for i = 1:numFeatures
    uniqueVals = unique(dataset(:,i));
    newEntropy = 0.0;
    for j=1:length(uniqueVals)
        value = uniqueVals(j);
        subDataset = splitDataset(dataset,i,value);
        % Weight each branch's entropy by its share of the samples.
        size_sub = size(subDataset);
        prob = size_sub(1)/data_size(1);
        newEntropy = newEntropy + prob*calShannonEnt(subDataset);
    end
    if newEntropy<minEntropy
        minEntropy = newEntropy;
        bestFeature = i;
    end
end
end
function tree_plot(p,nodevalue,branchvalue)
% Render the decision tree given its treeplot parent vector p, labelling
% nodes with nodevalue and edges with branchvalue.
% See treeplot: https://ww2.mathworks.cn/help/matlab/ref/treeplot.html
[x,y,h] = treelayout(p); % x,y: node coordinates in [0,1]; h: tree depth
f = find(p~=0); % indices of non-root nodes
pp = p(f); % their parent indices
% Interleave child/parent coordinates with NaN separators so a single plot
% call draws every edge as a disconnected segment.
X = [x(f); x(pp); NaN(size(f))];
Y = [y(f); y(pp); NaN(size(f))];
X = X(:);
Y = Y(:);
n = length(p);
if n<500
hold on;
plot(x,y,'yo',X,Y,'p-')
nodesize = length(x);
% The two loops below annotate node names and branch names respectively.
for i=1:nodesize
text(x(i)+0.01,y(i),nodevalue{1,i});
end
for i=2:nodesize
%text(x(i)-0.02,y(i)+0.01,branchvalue{1,i-1})
% Edge of node i starts at index 3*i-5 in the NaN-separated X/Y arrays
% (each edge occupies 3 slots; node 1 is the root and has no edge).
% The label sits at the segment midpoint, nudged left by the text width.
j = 3*i-5;
text((X(j)+X(j+1))/2-length(char(branchvalue{1,i-1}))/200,(Y(j)+Y(j+1))/2,branchvalue{1,i-1})
end
hold off
else
plot(X,Y,'r-');
end
xlabel(['height = ' int2str(h)]);
axis([0 1 0 1]);
end
function [nodeids_,nodevalue_,branchvalue_] = print_tree(tree)
% Breadth-first traversal of the decision tree built by ID3.
% Returns the three arrays that drive tree_plot:
%   nodeids_     parent index of each node (treeplot parent vector; root = 0)
%   nodevalue_   display text per node (attribute name or class label)
%   branchvalue_ display text per edge (attribute value), in node order
nodeids(1) = 0;
nodeid = 0;
nodevalue={};
branchvalue={};
queue = {tree} ;
while ~isempty(queue) % process nodes level by level until the queue drains
node = queue{1};
queue(1) = [];
if string(class(node))~='containers.Map' % leaf: node is a class-label char
nodeid = nodeid+1;
nodevalue = [nodevalue,{node}];
elseif length(node.keys)==1 % internal node: Map with a single attribute key
nodevalue = [nodevalue,node.keys];
node_info = node(char(node.keys)); % Map of branch value -> child subtree
nodeid = nodeid+1;
branchvalue = [branchvalue,node_info.keys];
% Record this node as the parent of each of its children.
for i=1:length(node_info.keys)
nodeids = [nodeids,nodeid];
end
end
if string(class(node))=='containers.Map'
% Enqueue all children so the traversal stays breadth-first.
keys = node.keys();
for i = 1:length(keys)
key = keys{i};
queue=[queue,{node(key)}];
end
end
nodeids_=nodeids;
nodevalue_=nodevalue;
branchvalue_ = branchvalue;
end
end
% ---------------------------------------------------------------------------
% NOTE(review): the lines below are residue from the download page this file
% was scraped from — not MATLAB code. They are commented out (content kept
% verbatim) so the file parses.
% watermelon_决策树_西瓜数据集_西瓜数据_
% 版权申诉
% 5星 · 超过95%的资源 70 浏览量
% 2021-10-03
% 01:27:15
% 上传
% 评论 3
% 收藏 3KB ZIP 举报
% 慕酒
% - 粉丝: 48
% - 资源: 4823
% 最新资源
% - python-leetcode面试题解之第157题用Read4读取N个字符-题解.zip
% - python-leetcode面试题解之第156题上下翻转二叉树-题解.zip
% - python-leetcode面试题解之第155题最小栈-题解.zip
% - python-leetcode面试题解之第153题寻找旋转排序数组中的最小值-题解.zip
% - python-leetcode面试题解之第152题乘积最大子数组-题解.zip
% - python-leetcode面试题解之第151题反转字符串中的单词-题解.zip
% - python-leetcode面试题解之第150题逆波兰表达式求值-题解.zip
% - python-leetcode面试题解之第149题直线上最多的点数-题解.zip
% - python-leetcode面试题解之第148题排序链表-题解.zip
% - python-leetcode面试题解之第147题对链表进行插入排序-题解.zip
% 资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
% 点击此处反馈
% - 1
% - 2
% - 3
% 前往页