function D = C4_5(train_features, train_targets, inc_node, region)
% Classify using Quinlan's C4.5 algorithm
% Inputs:
%   train_features  - Train features
%   train_targets   - Train targets
%   inc_node        - Percentage of incorrectly assigned samples at a node
%   region          - Decision region vector: [-x x -y y number_of_points]
% Outputs:
%   D               - Decision surface
%
%NOTE: In this implementation it is assumed that a feature vector with fewer than 10 unique values
%(the parameter Nu) is discrete, and will be treated as such. Other vectors will be treated as continuous.

[Ni, M]     = size(train_features);
inc_node    = inc_node*M/100;
Nu          = 10;

%For the decision region
N           = region(5);
mx          = ones(N,1) * linspace(region(1), region(2), N);
my          = linspace(region(3), region(4), N)' * ones(1,N);
flatxy      = [mx(:), my(:)]';

%Preprocessing
%[f, t, UW, m]  = PCA(train_features, train_targets, Ni, region);
%train_features = UW * (train_features - m*ones(1,M));
%flatxy         = UW * (flatxy - m*ones(1,N^2));

%Find which of the input features are discrete, and discretize the corresponding
%dimension on the decision region
discrete_dim = zeros(1,Ni);
for i = 1:Ni,
    Nb = length(unique(train_features(i,:)));
    if (Nb <= Nu),
        %This is a discrete feature
        discrete_dim(i)  = Nb;
        [H, flatxy(i,:)] = high_histogram(flatxy(i,:), Nb);
    end
end

%Build the tree recursively
disp('Building tree')
tree = make_tree(train_features, train_targets, inc_node, discrete_dim, max(discrete_dim), 0);

%Make the decision region according to the tree
disp('Building decision surface using the tree')
targets = use_tree(flatxy, 1:N^2, tree, discrete_dim, unique(train_targets));

D = reshape(targets, N, N);
%END

function targets = use_tree(features, indices, tree, discrete_dim, Uc)
%Classify recursively using a tree

targets = zeros(1, size(features,2));

if (tree.dim == 0)
    %Reached the end of the tree (a leaf): assign its class to all indices
    targets(indices) = tree.child;
    return
end

%This is not the last level of the tree, so:
%First, find the dimension we are to work on
dim  = tree.dim;
dims = 1:size(features,1);

%And classify according to it
if (discrete_dim(dim) == 0),
    %Continuous feature: send samples below/above the split location down the two children
    in      = indices(find(features(dim, indices) <= tree.split_loc));
    targets = targets + use_tree(features(dims, :), in, tree.child(1), discrete_dim(dims), Uc);
    in      = indices(find(features(dim, indices) >  tree.split_loc));
    targets = targets + use_tree(features(dims, :), in, tree.child(2), discrete_dim(dims), Uc);
else
    %Discrete feature: one child per unique feature value
    Uf = unique(features(dim,:));
    for i = 1:length(Uf),
        in      = indices(find(features(dim, indices) == Uf(i)));
        targets = targets + use_tree(features(dims, :), in, tree.child(i), discrete_dim(dims), Uc);
    end
end
%END use_tree

function tree = make_tree(features, targets, inc_node, discrete_dim, maxNbin, base)
%Build a tree recursively

[Ni, L]        = size(features);
Uc             = unique(targets);
tree.dim       = 0;
%tree.child(1:maxNbin) = zeros(1,maxNbin);
tree.split_loc = inf;

if isempty(features),
    return
end

%When to stop: if the dimension is one or the number of examples is small,
%make a leaf labeled with the majority class
if ((inc_node > L) | (L == 1) | (length(Uc) == 1)),
    H            = hist(targets, length(Uc));
    [m, largest] = max(H);
    tree.child   = Uc(largest);
    return
end

%Compute the node's I (entropy of the class labels at this node)
for i = 1:length(Uc),
    Pnode(i) = length(find(targets == Uc(i))) / L;
end
Inode = -sum(Pnode.*log(Pnode)/log(2));

%For each dimension, compute the gain ratio impurity
%This is done separately for discrete and continuous features
delta_Ib  = zeros(1, Ni);
split_loc = ones(1, Ni)*inf;

for i = 1:Ni,
    data  = features(i,:);
    Nbins = length(unique(data));
    if (discrete_dim(i)),
        %This is a discrete feature
        P = zeros(length(Uc), Nbins);
        for j = 1:length(Uc),
            for k = 1:Nbins,
                indices = find((targets == Uc(j)) & (features(i,:) == k));
                P(j,k)  = length(indices);
            end
        end
        Pk          = sum(P);
        P           = P/L;
        Pk          = Pk/sum(Pk);
        info        = sum(-P.*log(eps+P)/log(2));
        delta_Ib(i) = (Inode - sum(Pk.*info)) / (-sum(Pk.*log(eps+Pk)/log(2)));
    else
        %This is a continuous feature
        P = zeros(length(Uc), 2);

        %Sort the features
        [sorted_data, indices] = sort(data);
        sorted_targets = targets(indices);

        %Calculate the information for each possible split
        I = zeros(1, L-1);
        for j = 1:L-1,
            for k = 1:length(Uc),
                P(k,1) = length(find(sorted_targets(1:j) == Uc(k)));
                %(The listing breaks off here in the source preview.)
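A minimal usage sketch follows. The sample data, class labels, and the 5% inc_node value are illustrative assumptions, not part of the original listing; it assumes C4_5.m and its helper functions (e.g. high_histogram) are on the MATLAB path. With continuous Gaussian features, every dimension has more than Nu unique values, so only the continuous branch of the code is exercised.

%Illustrative call with hypothetical two-class, two-feature data
train_features = [randn(2,50)-1, randn(2,50)+1];   %2 features x 100 samples
train_targets  = [zeros(1,50), ones(1,50)];        %class labels 0 and 1
inc_node       = 5;                                %percentage of incorrectly assigned samples at a node
region         = [-3 3 -3 3 100];                  %[-x x -y y number_of_points]
D = C4_5(train_features, train_targets, inc_node, region);
imagesc(linspace(-3,3,100), linspace(-3,3,100), D); axis xy   %view the decision surface

The region vector defines the grid on which the decision surface D (an N-by-N matrix of class labels) is evaluated, so the last entry trades resolution against runtime.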