"""Multi-layer Perceptron
"""
# Author: Issam H. Laradji <issam.laradji@gmail.com>
# License: BSD 3 clause

import warnings
from abc import ABCMeta, abstractmethod

import numpy as np
from scipy.optimize import fmin_l_bfgs_b

from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
from sklearn.base import TransformerMixin
from sklearn.externals import six
from sklearn.preprocessing import LabelBinarizer
from sklearn.utils import gen_batches, check_random_state
from sklearn.utils import shuffle
from sklearn.utils import ConvergenceWarning
from sklearn.utils.extmath import safe_sparse_dot

from base import logistic, softmax, binary_KL_divergence
from base import ACTIVATIONS, DERIVATIVES, LOSS_FUNCTIONS


def _pack(layers_coef_, layers_intercept_):
"""Pack the parameters into a single vector."""
return np.hstack([l.ravel() for l in layers_coef_ + layers_intercept_])


class BaseMultilayerPerceptron(six.with_metaclass(ABCMeta, BaseEstimator)):
    """Base class for MLP classification and regression.

    Warning: This class should not be used directly.
    Use derived classes instead.
    """
@abstractmethod
def __init__(self, hidden_layer_sizes, activation, algorithm,
alpha, batch_size, learning_rate, learning_rate_init, power_t,
max_iter, loss, shuffle, beta, sparsity_param, random_state, tol, verbose,
warm_start):
self.activation = activation
self.algorithm = algorithm
self.alpha = alpha
self.beta = beta
self.sparsity_param = sparsity_param
self.batch_size = batch_size
self.learning_rate = learning_rate
self.learning_rate_init = learning_rate_init
self.power_t = power_t
self.max_iter = max_iter
self.loss = loss
self.hidden_layer_sizes = hidden_layer_sizes
self.shuffle = shuffle
self.random_state = random_state
self.tol = tol
self.verbose = verbose
self.warm_start = warm_start
self.layers_coef_ = None
self.layers_intercept_ = None
self.cost_ = None
self.n_iter_ = None
self.learning_rate_ = None
self.classes_ = None
# iteration count for learning rate schedule
# must not be int (e.g. if ``learning_rate=='optimal'``)
self.t_ = None

    def _unpack(self, packed_parameters):
"""Extract the coefficients and intercepts from packed_parameters."""
for i in range(self.n_layers_ - 1):
start, end, shape = self._coef_indptr[i]
self.layers_coef_[i] = np.reshape(packed_parameters[start:end],
shape)
start, end = self._intercept_indptr[i]
self.layers_intercept_[i] = packed_parameters[start:end]

    def _forward_pass(self, activations, with_output_activation=True):
        """Perform a forward pass on the network by computing the values
        of the neurons in the hidden layers and the output layer.

        Parameters
        ----------
        activations : list, length = n_layers
            The ith element of the list holds the values of the ith layer.

        with_output_activation : bool, default True
            If True, the output passes through the output activation
            function, which is either the softmax function or the
            logistic function.
        """
# Iterate over the hidden layers
for i in range(self.n_layers_ - 1):
activations[i + 1] = safe_sparse_dot(activations[i],
self.layers_coef_[i])
activations[i + 1] += self.layers_intercept_[i]
# For the hidden layers
if i + 1 != self.n_layers_ - 1:
hidden_activation = ACTIVATIONS[self.activation]
activations[i + 1] = hidden_activation(activations[i + 1])
# For the last layer
if with_output_activation:
output_activation = ACTIVATIONS[self.out_activation_]
activations[i + 1] = output_activation(activations[i + 1])
return activations

    def _compute_cost_grad(self, layer, n_samples, activations, deltas,
coef_grads, intercept_grads):
"""Compute the cost gradient for the layer."""
coef_grads[layer] = safe_sparse_dot(activations[layer].T,
deltas[layer]) / n_samples
coef_grads[layer] += (self.alpha * self.layers_coef_[layer])
intercept_grads[layer] = np.mean(deltas[layer], 0)
return coef_grads, intercept_grads

    def _cost_grad_lbfgs(self, packed_coef_inter, X, y, activations, deltas,
coef_grads, intercept_grads):
"""Compute the MLP cost function and its corresponding derivatives
with respect to the different parameters given in the initialization.
Parameters
----------
packed_parameters : array-like
A vector comprising the flattened coefficients and intercepts.
X : {array-like, sparse matrix}, shape (n_samples, n_features)
The input data.
y : array-like, shape (n_samples,)
The target values.
activations: list, length = n_layers - 1
The ith index of the list holds the values of the ith layer.
deltas : list, length = n_layers - 1
The ith index of the list holds the difference between the
activations of the i + 1 layer and the backpropagated error.
coef_grad : list, length = n_layers - 1
The ith index contains the amount of change used to update the
coefficient parameters of the ith layer in an iteration.
intercept_grads : list, length = n_layers - 1
The ith index contains the amount of change used to update the
intercept parameters of the ith layer in an iteration.
Returns
-------
cost : float
grad : array-like, shape (number of nodes of all layers,)
"""
self._unpack(packed_coef_inter)
cost, coef_grads, intercept_grads = self._backprop(X, y, activations,
deltas, coef_grads,
intercept_grads)
self.n_iter_ += 1
grad = _pack(coef_grads, intercept_grads)
return cost, grad

    def _backprop(self, X, y, activations, deltas, coef_grads,
intercept_grads):
"""Compute the MLP cost function and its corresponding derivatives
with respect to each parameter: weights and bias vectors.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
The input data.
y : array-like, shape (n_samples,)
The target values.
activations: list, length = n_layers - 1
The ith index of the list holds the values of the ith layer.
deltas : list, length = n_layers - 1
The ith index of the list holds the difference between the
activations of the i + 1 layer and the backpropagated error.
coef_grad : list, length = n_layers - 1
The ith index contains the amount of change used to update the
coefficient parameters of the ith layer in an iteration.
intercept_grads : list, length = n_layers - 1
The ith index contains the amount of change used to update the
intercept parameters of the ith layer in an iteration.
Returns
-------
cost : float
"""
n_samples = X.shape[0]
# Step (1/3): Forward propagate
activations = self._forward_pass(activations)
# Step (2/3): Get cost
cost = LOSS_FUNCTIONS[self.loss](y, activations[-1])
# Add L2 regularization term to cost
values = np.sum(np.ar