function [sample,sampleLabel]=undersampling(data,Label,ClassType,C,AttVector)
% Implement the under-sampling algorithm.
% It changes the training data distribution by deleting some
% lower-cost training examples until the appearances of different
% training examples are proportional to their costs. A routine
% similar to that used in [1] is employed, which removes redundant
% examples first and then removes borderline examples; the latter
% are detected using Tomek links [2].
%
%Usage:
% [sample,sampleLabel]=undersampling(data,Label,ClassType,C,AttVector)
%
% sample: new training set after under-sampling to build cost-sensitive NN
%         format - row indexes attributes and column indexes instances
% sampleLabel: class labels for instances in new training set.
%         format - row vector
% data: original training set.
%         format - row indexes attributes and column indexes instances
% Label: class labels for instances in original training set
%         format - row vector
% ClassType: class type
% C: cost vector. the ith entry is the cost of misclassifying ith class
%    instance, without considering the concrete class the instance has
%    been wrongly assigned to.
% AttVector: attribute vector, 1 means the corresponding attribute
%    is nominal and 0 means numeric.
%
% Refer [1]:
% M. Kubat and S. Matwin, "Addressing the curse of imbalanced training
% sets: one-sided selection," in Proceedings of the 14th International
% Conference on Machine Learning, Nashville, TN, pp.179-186, 1997.
% Refer [2]:
% I. Tomek, "Two modifications of CNN," IEEE Transactions on Systems, Man
% and Cybernetics, vol.6, no.6, pp.769-772, 1976.

%check parameters
NumClass=length(ClassType);
if length(C)~=NumClass
    error('class number is not consistent.')
end
% BUG FIX: the original compared the scalar size(data,2) with the 1x2
% vector size(Label); that vector comparison silently passes when data
% has a single instance. Compare scalar to scalar instead.
if size(data,2)~=length(Label)
    error('instance numbers in data and Label are not consistent.')
end
if size(data,1)~=length(AttVector)
    error('attribute numbers in data and AttVector are not consistent.')
end

%compute class distribution
ClassData=cell(1,NumClass);  % ClassData{i}: instances (columns) of class i
ClassD=zeros(1,NumClass);    % ClassD(i): number of class-i instances
for i=1:NumClass
    id=find(Label==ClassType(i));
    ClassData{i}=data(:,id);
    ClassD(i)=length(id);
end

%compute the target (cost-proportionate) class distribution: the class
%with the largest cost/size ratio is kept whole, every other class is
%shrunk in proportion to its cost
cn=C./ClassD;
[tmp,baseClassID]=max(cn);
newClassD=floor(C/C(baseClassID)*ClassD(baseClassID));

%process classes in ascending order of cost
[tmp,ascendC]=sort(C);

%prepare the Value Difference Metric used in the distance function
attribute=VDM(data,Label,ClassType,AttVector);

%sampling
for i=ascendC
    if newClassD(i)<ClassD(i)
        D=[];   % retained instances: all other classes + kept class-i ones
        DL=[];  % labels aligned with the columns of D
        K=[];   % candidate class-i instances still to be examined
        % put instances of the other classes into D
        for j=1:NumClass
            if j~=i
                D=[D ClassData{j}];
                l=ones(1,ClassD(j)).*ClassType(j);
                DL=[DL l];
            end
        end
        %break ClassData{i} into 2 parts: id1 seeds D, id2 goes to K
        n=floor(newClassD(i)/2);
        id=randperm(ClassD(i));
        id1=id(1:n);
        id2=id(n+1:end);
        % put Ni*/2 class i instances into D
        D=[D ClassData{i}(:,id1)];
        l=ones(1,n).*ClassType(i);
        DL=[DL l];
        % put the remaining class i instances into K
        K=ClassData{i}(:,id2);
        K_flag=zeros(1,length(id2));% 0 unchecked, 1 moved to D, -1 deleted
        numToDel=ClassD(i)-newClassD(i);  % renamed: 'diff' shadows a built-in
        NumDelIns=0;
        %phase 1: delete redundant instances, i.e. those the 1-NN rule
        %built on D already classifies correctly
        while NumDelIns<numToDel && any(K_flag==0)
            %randomly pick an unchecked instance to check
            id=find(K_flag==0);
            % BUG FIX: ceil(rand*n) is uniform over 1..n, whereas the
            % original round(rand*(n-1))+1 halves the end-point probability
            rn=ceil(rand(1,1)*length(id));
            id=id(rn);
            instance=K(:,id);
            target=ClassType(i);
            % calculate distance used for classification with 1-NN rule
            d=dist_nominal(instance,D,attribute,AttVector);
            % 1-NN
            [tmp,mind]=min(d);
            if target==DL(mind)  %correctly classified -> redundant, delete
                K_flag(id)= -1;
                NumDelIns=NumDelIns+1;
            else                 %keep: move to D
                K_flag(id)= 1;
            end
        end
        %if enough instances have been deleted, merge the remaining into D
        if NumDelIns==numToDel
            id=find(K_flag~= -1);
            D=[D K(:,id)];
            l=ones(1,length(id))*ClassType(i);
            DL=[DL l];
        else
            %merge unredundant instances into D
            id=find(K_flag==1);
            D=[D K(:,id)];
            l=ones(1,length(id))*ClassType(i);
            DL=[DL l];
            %phase 2: delete borderline class-i examples found via Tomek links
            idClassi=find(DL==ClassType(i));
            while NumDelIns<numToDel && ~isempty(idClassi)
                %randomly pick up an instance from the i-th class
                rn=ceil(rand(1,1)*length(idClassi));
                id=idClassi(rn);
                X=D(:,id);
                target=ClassType(i);
                % calculate distances to identify Tomek links
                d=dist_nominal(X,D,attribute,AttVector);
                d(id)=inf;               % never match an instance to itself
                iid=find(isnan(DL)==1);  % ignore already-deleted columns
                d(iid)=nan;
                [tmp,NearX]=min(d);
                if target~=DL(NearX)
                    % X's nearest neighbour Y has another class; if X is
                    % also Y's nearest neighbour, (X,Y) form a Tomek link
                    Y=D(:,NearX);
                    d=dist_nominal(Y,D,attribute,AttVector);
                    d(NearX)=Inf;
                    iid=find(isnan(DL)==1);
                    d(iid)=nan;
                    [tmp,NearY]=min(d);
                    if NearY==id  %Tomek link -> X is borderline, delete it
                        DL(id)=NaN;
                        NumDelIns=NumDelIns+1;
                    end
                end
                idClassi=setdiff(idClassi,id);
            end%while
            %compact out the columns marked deleted (label==NaN)
            id=find(isnan(DL)==0);
            D=D(:,id);
            DL=DL(id);
            %phase 3: if it still needs to delete some instances, randomly
            %delete until the requirement is met
            if isempty(idClassi) && NumDelIns<numToDel
                idClassi=find(DL==ClassType(i));
                % BUG FIX: cap the request so indexing cannot run past the
                % number of remaining class-i instances
                nDel=min(numToDel-NumDelIns,length(idClassi));
                id=randperm(length(idClassi));
                id=idClassi(id(1:nDel));
                id=setdiff(1:length(DL),id);
                D=D(:,id);
                DL=DL(id);
            end
        end%if-else
        %update the i-th class after under-sampling
        id=find(DL==ClassType(i));
        ClassData{i}=D(:,id);
    end%if
end%for

%assemble the result from the (possibly reduced) classes.
% BUG FIX: the original returned the loop-local D/DL, which are undefined
% (and the function errors) when no class required under-sampling.
sample=[];
sampleLabel=[];
for i=1:NumClass
    sample=[sample ClassData{i}];
    sampleLabel=[sampleLabel ones(1,size(ClassData{i},2)).*ClassType(i)];
end
没有合适的资源?快使用搜索试试~ 我知道了~
Smote的matlab代码
共29个文件
m:19个
gif:3个
jpg:2个
5星 · 超过95%的资源 需积分: 50 260 下载量 154 浏览量
2015-10-29
21:59:55
上传
评论 4
收藏 37KB RAR 举报
温馨提示
关于MATLAB的smote的代码,参考里面的sample_HardEnsemble_SoftEnsemble来做自己的案子!
资源推荐
资源详情
资源评论
收起资源包目录
SMOTE.rar (29个子文件)
SMOTE
ReadMe.files
image004.jpg 1KB
image006.gif 4KB
editdata.mso 330B
Thumbs.db 6KB
filelist.xml 324B
image005.jpg 1KB
image002.gif 113B
image001.gif 978B
ReadMe.htm 81KB
CSNN
UnderSampling.m 7KB
echocardiogram.mat 3KB
sample_UnderSampling.m 1KB
sample_OverSampling.m 1KB
sample_HardEnsemble_SoftEnsemble.m 3KB
ThresholdMovNN.m 2KB
sample_SmoteOverSampling.m 1KB
HardEnsemble.m 2KB
Utilities
LabelFormatConvertion.m 2KB
normalize.m 182B
dist_nominal.m 2KB
Locate.m 585B
CostMatrix.m 3KB
NNoutputFormat.m 1KB
VDM.m 3KB
sample_ThresholdMovNN.m 1KB
SMOTE.m 5KB
OverSampling.m 2KB
SoftEnsemble.m 1KB
SmoteOverSampling.m 3KB
共 29 条
- 1
木虫爬爬
- 粉丝: 3
- 资源: 6
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功
- 1
- 2
前往页