%% 1. test_example_NN.m
function test_example_NN
load mnist_uint8;
train_x = double(train_x) / 255;
test_x = double(test_x) / 255;
train_y = double(train_y);
test_y = double(test_y);
% normalize
[train_x, mu, sigma] = zscore(train_x);
test_x = normalize(test_x, mu, sigma);
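% Note: zscore and normalize here are the toolbox's own utility functions:
% zscore returns the training-set mean and standard deviation along with
% the scaled data, and normalize applies those same statistics to the test
% set, so both sets are scaled using training-data statistics only.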
%% ex1 vanilla neural net
rand('state',0)
nn = nnsetup([784 100 10]);
opts.numepochs = 1; % Number of full sweeps through data
opts.batchsize = 100; % Take a mean gradient step over this many samples
[nn, L] = nntrain(nn, train_x, train_y, opts);
[er, bad] = nntest(nn, test_x, test_y);
assert(er < 0.08, 'Too big error');
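% nntest runs a forward pass, takes the arg-max of the output layer as the
% predicted class, and returns er (the misclassification rate) and bad
% (the indices of the misclassified test samples).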
%% ex2 neural net with L2 weight decay
rand('state',0)
nn = nnsetup([784 100 10]);
nn.weightPenaltyL2 = 1e-4; % L2 weight decay
opts.numepochs = 1; % Number of full sweeps through data
opts.batchsize = 100; % Take a mean gradient step over this many samples
nn = nntrain(nn, train_x, train_y, opts);
[er, bad] = nntest(nn, test_x, test_y);
assert(er < 0.1, 'Too big error');
%% ex3 neural net with dropout
rand('state',0)
nn = nnsetup([784 100 10]);
nn.dropoutFraction = 0.5; % Dropout fraction
opts.numepochs = 1; % Number of full sweeps through data
opts.batchsize = 100; % Take a mean gradient step over this many samples
nn = nntrain(nn, train_x, train_y, opts);
[er, bad] = nntest(nn, test_x, test_y);
assert(er < 0.1, 'Too big error');
%% ex4 neural net with sigmoid activation function
rand('state',0)
nn = nnsetup([784 100 10]);
nn.activation_function = 'sigm'; % Sigmoid activation function
nn.learningRate = 1; % Sigmoid units require a lower learning rate
opts.numepochs = 1; % Number of full sweeps through data
opts.batchsize = 100; % Take a mean gradient step over this many samples
nn = nntrain(nn, train_x, train_y, opts);
[er, bad] = nntest(nn, test_x, test_y);
assert(er < 0.1, 'Too big error');
%% ex5 plotting functionality
rand('state',0)
nn = nnsetup([784 20 10]);
opts.numepochs = 5; % Number of full sweeps through data
nn.output = 'softmax'; % use softmax output
opts.batchsize = 1000; % Take a mean gradient step over this many samples
opts.plot = 1; % enable plotting
nn = nntrain(nn, train_x, train_y, opts);
[er, bad] = nntest(nn, test_x, test_y);
assert(er < 0.1, 'Too big error');
%% ex6 neural net with sigmoid activation and plotting of validation and training error
% split training data into training and validation data
vx = train_x(1:10000,:);
tx = train_x(10001:end,:);
vy = train_y(1:10000,:);
ty = train_y(10001:end,:);
rand('state',0)
nn = nnsetup([784 20 10]);
nn.output = 'softmax'; % use softmax output
opts.numepochs = 5; % Number of full sweeps through data
opts.batchsize = 1000; % Take a mean gradient step over this many samples
opts.plot = 1; % enable plotting
nn = nntrain(nn, tx, ty, opts, vx, vy); % nntrain takes validation set as last two arguments (optionally)
[er, bad] = nntest(nn, test_x, test_y);
assert(er < 0.1, 'Too big error');
%% 2. nnsetup.m
function nn = nnsetup(architecture)
%NNSETUP creates a feedforward backpropagation neural network
% nn = nnsetup(architecture) returns a neural network structure with
% n = numel(architecture) layers, architecture being an n x 1 vector of
% layer sizes, e.g. [784 100 10]
nn.size = architecture;
nn.n = numel(nn.size);
nn.activation_function = 'tanh_opt'; % Activation function of hidden layers: 'sigm' (sigmoid) or 'tanh_opt' (optimal tanh)
nn.learningRate = 2; % Learning rate. Note: typically needs to be lower when using 'sigm' activation and non-normalized inputs
nn.momentum = 0.5; % Momentum
nn.scaling_learningRate = 1; % Scaling factor for the learning rate (applied each epoch)
nn.weightPenaltyL2 = 0; % L2 regularization
nn.nonSparsityPenalty = 0; % Non-sparsity penalty
nn.sparsityTarget = 0.05; % Sparsity target
nn.inputZeroMaskedFraction = 0; % Used for denoising autoencoders
nn.dropoutFraction = 0; % Dropout level (http://www.cs.toronto.edu/~hinton/absps/dropout.pdf)
nn.testing = 0; % Internal variable. nntest sets this to one.
nn.output = 'sigm'; % Output unit: 'sigm' (=logistic), 'softmax' or 'linear'

for i = 2 : nn.n
% weights and weight momentum
nn.W{i - 1} = (rand(nn.size(i), nn.size(i - 1)+1) - 0.5) * 2 * 4 * sqrt(6 / (nn.size(i) + nn.size(i - 1)));
nn.vW{i - 1} = zeros(size(nn.W{i - 1}));
% average activations (for use with sparsity)
nn.p{i} = zeros(1, nn.size(i));
end
end
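% The loop above draws each W{i-1} uniformly from the range
% +/- 4*sqrt(6/(fan_in + fan_out)) (a Glorot-style interval, scaled by 4
% for sigmoid-like units), with one extra column for the bias. A quick
% check of the bound for the first layer of a [784 100 10] net:
%
%   r = 4 * sqrt(6 / (100 + 784));   % r is about 0.33
%
% so first-layer weights start uniformly in roughly [-0.33, 0.33].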
%% 3. nntrain.m
function [nn, L] = nntrain(nn, train_x, train_y, opts, val_x, val_y)
%NNTRAIN trains a neural net
% [nn, L] = nntrain(nn, train_x, train_y, opts) trains the neural network
% nn with input train_x and target train_y for opts.numepochs epochs, with
% minibatches of size opts.batchsize. Returns a neural network nn with
% updated activations, errors, weights and biases (nn.a, nn.e, nn.W, nn.b)
% and L, the sum squared error for each training minibatch.
assert(isfloat(train_x), 'train_x must be a float');
assert(nargin == 4 || nargin == 6, 'number of input arguments must be 4 or 6')
loss.train.e = [];
loss.train.e_frac = [];
loss.val.e = [];
loss.val.e_frac = [];
opts.validation = 0;
if nargin == 6
opts.validation = 1;
end
fhandle = [];
if isfield(opts,'plot') && opts.plot == 1
fhandle = figure();
end
m = size(train_x, 1);
batchsize = opts.batchsize;
numepochs = opts.numepochs;
numbatches = m / batchsize;
assert(rem(numbatches, 1) == 0, 'numbatches must be an integer');
L = zeros(numepochs*numbatches,1);
n = 1;
for i = 1 : numepochs
tic;
kk = randperm(m);
for l = 1 : numbatches
batch_x = train_x(kk((l - 1) * batchsize + 1 : l * batchsize), :);
%Add noise to input (for use in denoising autoencoder)
if(nn.inputZeroMaskedFraction ~= 0)
batch_x = batch_x.*(rand(size(batch_x))>nn.inputZeroMaskedFraction);
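% Each input component is kept with probability (1 - inputZeroMaskedFraction)
% and zeroed otherwise, corrupting the input the same way a denoising
% autoencoder does.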
end
batch_y = train_y(kk((l - 1) * batchsize + 1 : l * batchsize), :);
nn = nnff(nn, batch_x, batch_y);
nn = nnbp(nn);
nn = nnapplygrads(nn);
L(n) = nn.L;
n = n + 1;
end
t = toc;
if opts.validation == 1
loss = nneval(nn, loss, train_x, train_y, val_x, val_y);
str_perf = sprintf('; Full-batch train mse = %f, val mse = %f', loss.train.e(end), loss.val.e(end));
else
loss = nneval(nn, loss, train_x, train_y);
str_perf = sprintf('; Full-batch train err = %f', loss.train.e(end));
end

if ishandle(fhandle)
nnupdatefigures(nn, fhandle, loss, opts, i);
end
disp(['epoch ' num2str(i) '/' num2str(opts.numepochs) '. Took ' num2str(t) ' seconds. Mini-batch mean squared error on training set is ' num2str(mean(L((n-numbatches):(n-1)))) str_perf]);
nn.learningRate = nn.learningRate * nn.scaling_learningRate;
end
end
%% 4. nnff.m
function nn = nnff(nn, x, y)
%NNFF performs a feedforward pass
% nn = nnff(nn, x, y) returns a neural network structure with updated
% layer activations, error and loss (nn.a, nn.e and nn.L)
n = nn.n;
m = size(x, 1);
x = [ones(m,1) x];
nn.a{1} = x;
% feedforward pass
for i = 2 : n-1
switch nn.activation_function
case 'sigm'
% Calculate the unit's outputs (including the bias term)
nn.a{i} = sigm(nn.a{i - 1} * nn.W{i - 1}');
case 'tanh_opt'
nn.a{i} = tanh_opt(nn.a{i - 1} * nn.W{i - 1}');
end

% dropout
if(nn.dropoutFraction > 0)
if(nn.testing)
nn.a{i} = nn.a{i}.*(1 - nn.dropoutFraction);
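% At test time no units are dropped; activations are instead scaled by
% (1 - dropoutFraction) so their expected magnitude matches training,
% where each unit survived with exactly that probability.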
else
nn.dropOutMask{i} = (rand(size(nn.a{i}))>nn.dropoutFraction);
nn.a{i} = nn.a{i}.*nn.dropOutMask{i};
end
end

% calculate running exponential activations for use with sparsity
if(nn.nonSparsityPenalty>0)
nn.p{i} = 0.99 * nn.p{i} + 0.01 * mean(nn.a{i}, 1);
end

% Add the bias term
nn.a{i} = [ones(m,1) nn.a{i}];
end

switch nn.output
case 'sigm'
nn.a{n} = sigm(nn.a{n - 1} * nn.W{n - 1}');
case 'linear'
nn.a{n} = nn.a{n - 1} * nn.W{n - 1}';
case 'softmax'
nn.a{n} = nn.a{n - 1} * nn.W{n - 1}';
nn.a{n} = exp(bsxfun(@minus, nn.a{n}, max(nn.a{n},[],2)));
nn.a{n} = bsxfun(@rdivide, nn.a{n}, sum(nn.a{n}, 2));
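% Subtracting each row's maximum before exponentiating leaves the result
% unchanged (softmax is shift-invariant) but keeps exp from overflowing
% for large activations.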
end

% error and loss
nn.e = y - nn.a{n};

switch nn.output
case {'sigm', 'linear'}
nn.L = 1/2 * sum(sum(nn.e .^ 2)) / m;
case 'softmax'
nn.L = -sum(sum(y .* log(nn.a{n}))) / m;
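% With softmax outputs this is the cross-entropy loss; its gradient with
% respect to the pre-softmax activations is simply (a - y), which is why
% nnbp uses d{n} = -nn.e directly in the softmax case.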
end
end
%% 5. nnbp.m
function nn = nnbp(nn)
%NNBP performs backpropagation
% nn = nnbp(nn) returns a neural network structure with updated weights
n = nn.n;
sparsityError = 0;
switch nn.output
case'sigm'
d{n} = - nn.e .* (nn.a{n} .* (1 - nn.a{n}));
case{'softmax','linear'}
d{n} = - nn.e;
end

for i = (n - 1) : -1 : 2
% Derivative of the activation function
switch nn.activation_function
case'sigm'
d_act = nn.a{i} .* (1 - nn.a{i});
case'tanh_opt'
d_act = 1.7159 * 2/3 * (1 - 1/(1.7159)^2 * nn.a{i}.^2);
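% tanh_opt is f(x) = 1.7159 * tanh(2/3 * x) (LeCun's scaled tanh), so
% f'(x) = 1.7159 * (2/3) * (1 - tanh(2/3 * x).^2)
%       = 1.7159 * (2/3) * (1 - (f(x) / 1.7159).^2),
% which is the expression above written in terms of the stored activation.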
end

if(nn.nonSparsityPenalty>0)
pi = repmat(nn.p{i}, size(nn.a{i}, 1), 1);
sparsityError = [zeros(size(nn.a{i},1),1) nn.nonSparsityPenalty * (-nn.sparsityTarget ./ pi + (1 - nn.sparsityTarget) ./ (1 - pi))];
end

% Backpropagate first derivatives
if i+1==n % in this case d{n} has no bias term to be removed
d{i} = (d{i + 1} * nn.W{i} + sparsityError) .* d_act; % Bishop (5.56)
else % in this case the bias term has to be removed from d{i+1}
d{i} = (d{i + 1}(:,2:end) * nn.W{i} + sparsityError) .* d_act;
end

if(nn.dropoutFraction>0)
d{i} = d{i} .* [ones(size(d{i},1),1) nn.dropOutMask{i}];
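% The forward-pass dropout mask is reapplied so the deltas of dropped
% units are zeroed; the prepended column of ones leaves the bias unit's
% delta untouched.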
end
end

for i = 1 : (n - 1)
if i+1==n
nn.dW{i} = (d{i + 1}' * nn.a{i}) / size(d{i + 1}, 1);
else
nn.dW{i} = (d{i + 1}(:,2:end)' * nn.a{i}) / size(d{i + 1}, 1);
end
end
end
%% 6. nnapplygrads.m
function nn = nnapplygrads(nn)
%NNAPPLYGRADS updates weights and biases with calculated gradients
% nn = nnapplygrads(nn) returns a neural network structure with updated
% weights and biases

for i = 1 : (nn.n - 1)
if(nn.weightPenaltyL2>0)
dW = nn.dW{i} + nn.weightPenaltyL2 * [zeros(size(nn.W{i},1),1) nn.W{i}(:,2:end)];
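% The first column of W{i} holds the bias weights and is replaced by
% zeros here: L2 weight decay is applied to the weights only, never to
% the biases.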
else
dW = nn.dW{i};
end
dW = nn.learningRate * dW;
if(nn.momentum>0)
nn.vW{i} = nn.momentum*nn.vW{i} + dW;
dW = nn.vW{i};
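% Classical momentum: vW accumulates a decaying sum of past steps,
% vW <- momentum * vW + learningRate * dW, and the accumulated step is
% what is actually subtracted from the weights below.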
end

nn.W{i} = nn.W{i} - dW;
end
end
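%% Minimal end-to-end smoke test (a sketch, not part of the toolbox; it
% assumes the functions above plus the toolbox helpers sigm, tanh_opt and
% nneval are on the MATLAB path):
x = double(rand(200, 784) > 0.5);                       % fake binary "images"
y = full(sparse(1:200, randi(10, 200, 1), 1, 200, 10)); % random one-hot targets
nn = nnsetup([784 30 10]);
opts.numepochs = 1;
opts.batchsize = 20;               % 200/20 = 10 whole minibatches
nn = nntrain(nn, x, y, opts);
assert(all(isfinite(nn.W{1}(:))), 'weights diverged');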