Source code for dl85.supervised.classifiers.boosting

import os
import sys
import time
from copy import deepcopy
import cvxpy as cp
import numpy as np

from sklearn.base import ClassifierMixin
from sklearn.exceptions import NotFittedError
from sklearn.utils.multiclass import unique_labels
from sklearn.base import BaseEstimator
from sklearn.metrics import accuracy_score, roc_auc_score

from .classifier import DL85Classifier
from ...errors.errors import SearchFailedError, TreeNotFoundError
from .utils.matrix import *

MODEL_LP_RATSCH = 1  # Ratsch's model: the regulator lies in ]0; 1]
MODEL_LP_DEMIRIZ = 2  # Demiriz's model: the regulator lies in ]1/n_instances; +\infty[
MODEL_QP_MDBOOST = 3
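
# A rough summary of the dual problems that the three models above correspond to,
# as inferred from the cvxpy code in the compute_dual_* methods below (a sketch,
# not a formal restatement of the original papers). Here u holds the per-instance
# sample weights, H is the {-1, +1} matrix in which column j records whether tree j
# classifies each instance correctly, and C is the `regulator`:
#
#   MODEL_LP_RATSCH  :  min  r        s.t.  H^T u <= r,  0 <= u <= C (when C > 0),  sum(u) = 1
#   MODEL_LP_DEMIRIZ :  max  sum(u)   s.t.  H^T u <= 1,  0 <= u <= C
#   MODEL_QP_MDBOOST :  min  r + (u - 1)^T A^{-1} (u - 1) / (2C)   s.t.  H^T u <= r
#
# The tree weights of the ensemble are recovered from the dual values of the
# H^T u constraints after each solve.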


class DL85Booster(BaseEstimator, ClassifierMixin):
    """ A boosting classifier whose base learners are optimal binary decision trees.

    Parameters
    ----------
    base_estimator : classifier, default=None
        The base classifier to boost
    max_depth : int, default=1
        Maximum depth of the tree to be found
    min_sup : int, default=1
        Minimum number of examples per leaf
    max_iterations : int, default=0
        The maximum number of iterations after which the search is stopped. The default value means "no stop on iterations"
    model : int, default=MODEL_LP_DEMIRIZ
        The column generation model to solve
    gamma : str, default=None
        Variance matrix parameter for MDBoost
    error_function : function, default=None
        User-specific error function based on transactions
    fast_error_function : function, default=None
        User-specific error function based on supports per class
    opti_gap : float, default=0.01
        Tolerance used to stop the column generation before optimality. It fixes the convergence problem of column generation approaches
    max_error : int, default=0
        Maximum allowed error. The default value stands for no bound. If no tree can be found that is strictly better, the model remains empty.
    stop_after_better : bool, default=False
        A parameter used to indicate if the search will stop after finding a tree better than max_error
    regulator : float, default=-1
        The regularization parameter of the column generation models.
    time_limit : int, default=0
        Allocated time in second(s) for the search. The default value stands for no limit. The best tree found within the time limit is stored, if this tree is better than max_error.
    verbose : bool, default=False
        A parameter used to switch on/off the print of what happens during the search
    desc : bool, default=False
        A parameter used to indicate if the sorting of the items is done in descending order of information gain
    asc : bool, default=False
        A parameter used to indicate if the sorting of the items is done in ascending order of information gain
    repeat_sort : bool, default=False
        A parameter used to indicate whether the sorting of items is done at each level of the lattice or only before the search
    quiet : bool, default=True
        Whether to print or not the column generation details
    print_output : bool, default=False
        A parameter used to indicate if the search output will be printed or not

    Attributes
    ----------
    estimators_ : list
        The list of estimators in the final ensemble.
    estimator_weights_ : list
        The weight of each estimator.
    n_estimators_ : int
        Total number of estimators
    n_iterations_ : int
        Total number of iterations needed to find the optimal ensemble.
    objective_ : float
        The objective value reached by the ensemble.
    accuracy_ : float
        Accuracy of the found ensemble on the training set
    margins_ : list
        The list of margins of the found ensemble on the training set
    margins_norm_ : list
        Same values as above but normalized. Each value is between -1 and 1.
    duration_ : float
        Runtime of the optimal forest search
    optimal_ : bool
        Whether the ensemble is optimal or not
    classes_ : ndarray, shape (n_classes,)
        The classes seen at :meth:`fit`.
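
    Examples
    --------
    A minimal usage sketch on synthetic binary data (the dataset, depth and
    regulator below are illustrative only; fitting requires the compiled DL8.5
    base learner and the GUROBI solver used by the dual problems, hence the
    skipped doctest):

    >>> import numpy as np
    >>> from dl85.supervised.classifiers.boosting import DL85Booster, MODEL_LP_DEMIRIZ
    >>> rng = np.random.RandomState(0)
    >>> X = rng.randint(0, 2, size=(100, 10))
    >>> y = rng.randint(0, 2, size=100)
    >>> clf = DL85Booster(max_depth=1, model=MODEL_LP_DEMIRIZ, regulator=0.5)
    >>> clf.fit(X, y)  # doctest: +SKIP
    >>> clf.predict(X[:5])  # doctest: +SKIP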
""" def __init__( self, base_estimator=None, max_depth=1, min_sup=1, max_iterations=0, model=MODEL_LP_DEMIRIZ, gamma=None, error_function=None, fast_error_function=None, opti_gap=0.01, max_error=0, regulator=-1, stop_after_better=False, time_limit=0, verbose=False, desc=False, asc=False, repeat_sort=False, print_output=False, quiet=True): self.clf_params = dict(locals()) del self.clf_params["self"] del self.clf_params["regulator"] del self.clf_params["base_estimator"] del self.clf_params["max_iterations"] del self.clf_params["model"] del self.clf_params["gamma"] del self.clf_params["opti_gap"] self.base_estimator = base_estimator self.max_depth = max_depth self.min_sup = min_sup self.max_iterations = max_iterations self.error_function = error_function self.fast_error_function = fast_error_function self.max_error = max_error self.stop_after_better = stop_after_better self.time_limit = time_limit self.verbose = verbose self.desc = desc self.asc = asc self.repeat_sort = repeat_sort self.print_output = print_output self.regulator = regulator self.quiet = quiet self.model = model self.gamma = gamma self.opti_gap = opti_gap self.n_instances = None self.A_inv = None self.optimal_ = False # whether the ensemble is optimal self.estimators_, self.estimator_weights_ = [], [] self.accuracy_ = self.duration_ = self.n_estimators_ = self.n_iterations_ = 0 self.margins_ = [] # the ensemble margin on training instances self.margins_norm_ = [] # the normalized margins self.classes_ = [] self.objective_ = None # the optimal value reached by the ensemble def fit(self, X, y=None, X_test=None, y_test=None, iter_file=None): if y is None: raise ValueError("The \"y\" value is compulsory for boosting.") start_time = time.perf_counter() # initialize variables self.n_instances, _ = X.shape sample_weights = np.array([1/self.n_instances] * self.n_instances) predictions, r, self.n_iterations_, constant = None, None, 1, 0.0001 if self.model == MODEL_QP_MDBOOST: # Define the inverse of the A matrix if self.gamma is None: A = np.full((self.n_instances, self.n_instances), -1/(self.n_instances - 1), dtype=np.float64) np.fill_diagonal(A, 1) A = np.add(A, np.dot(np.eye(self.n_instances), constant)) # regularize A to make A^-1 sure it is really PSD else: self.gamma = 1 / self.n_instances if self.gamma == 'auto' else 1 / (self.n_features * X.var()) if self.gamma == 'scale' else 1 / X.var() if self.gamma == 'nscale' else 1 / self.n_instances A = np.identity(self.n_instances, dtype=np.float64) for i in range(self.n_instances): for j in range(self.n_instances): if i != j: A[i, j] = np.exp(-self.gamma * np.linalg.norm(np.subtract(X[i, :], X[j, :]))**2) if not self.quiet: print("Matrix A", A) print("is pos def: ", is_pd(A)) print("is psd: ", is_psd(A)) # invert the matrix A self.A_inv = np.linalg.pinv(A) # find the nearest pos def matrix if A_inv is not if pos_def(self.A_inv) is False: self.A_inv = nearest_pd(self.A_inv) if not self.quiet: print("Matrix A_inv", self.A_inv) print("is pos def", is_pd(self.A_inv)) print("is psd", is_psd(self.A_inv)) if not self.quiet: print() self.classes_ = unique_labels(y) # if a test set is provided, some metrics are store after each boosting iteration if X_test is not None and y_test is not None: iter_file_name = (iter_file if iter_file is not None else "iter_file") + ".csv" acc_stream = open(iter_file_name, "w") acc_stream.write("objective,train_acc,test_acc,train_auc,test_auc,n_iter,n_trees,min_margin,avg_margin,var_margin\n") acc_stream.flush() # run the boosting loop for a fixed number of 

        # run the boosting loop; it is bounded by max_iterations when that parameter is specified
        while (self.max_iterations > 0 and self.n_iterations_ <= self.max_iterations) or self.max_iterations <= 0:
            if not self.quiet:
                print("n_iter", self.n_iterations_)

            # initialize the base classifier. DL8.5 can be replaced by another optimal classifier
            clf = DL85Classifier(**self.clf_params) if self.base_estimator is None else self.base_estimator

            # fit the model with the current sample weights. The prints of the learning are disabled.
            old_stdout = sys.stdout
            sys.stdout = open(os.devnull, "w")
            clf.fit(X, y, sample_weight=sample_weights.tolist())
            sys.stdout = old_stdout

            # print the tree expression of the estimator if it has one
            if not self.quiet:
                print("A new tree has been learnt based on the previously found sample weights")
                if hasattr(clf, "tree_") and isinstance(clf.tree_, dict):
                    print(clf.tree_)

            # compute the prediction of the new estimator: 1 if correct else -1
            try:
                pred = np.array([-1 if p != y[i] else 1 for i, p in enumerate(clf.predict(X))])
            except (NotFittedError, SearchFailedError, TreeNotFoundError) as error:
                if not self.quiet:
                    print("Problem during the search so we stop")
                break

            if not self.quiet:
                print("correct predictions - incorrect predictions =", pred.sum())
                print("np.dot(predictions, sample_weights) =", pred @ sample_weights)

            # check whether the optimality condition is met
            if self.n_iterations_ > 1:
                # if a test set is provided, write metrics after each boosting iteration
                if X_test is not None and y_test is not None:
                    n_treess = len([i for i in self.estimator_weights_ if i != 0])
                    if n_treess > 0:
                        n_treess = str(n_treess)
                        self.n_estimators_ = len(self.estimators_)
                        train_acc = str(accuracy_score(y, self.predict(X)))
                        test_acc = str(accuracy_score(y_test, self.predict(X_test)))
                        train_auc = str(roc_auc_score(y, self.predict_proba(X)[:, 1]))
                        test_auc = str(roc_auc_score(y_test, self.predict_proba(X_test)[:, 1]))
                        acc_stream.write(str(opti) + "," + train_acc + "," + test_acc + "," + train_auc + "," + test_auc + "," + str(self.n_iterations_ - 1) + "," + n_treess + "," + str(min(self.margins_norm_)) + "," + str(np.mean(self.margins_norm_)) + "," + str(np.var(self.margins_norm_)) + "\n")
                        acc_stream.flush()

                # optimality condition; an epsilon is added to fix column generation convergence problems
                if pred @ sample_weights < r + self.opti_gap:
                    if not self.quiet:
                        print("np.dot(predictions, sample_weights):{} < r:{} + epsilon:{} ==> we cannot add the new tree. End of iterations".format(pred @ sample_weights, r, self.opti_gap))
                        print("Objective value at the end is", opti)
                    self.optimal_ = True
                    self.objective_ = opti
                    break

                # not yet optimal
                self.objective_ = opti
                if not self.quiet:
                    print("np.dot(predictions, sample_weights):{} >= r:{} + epsilon:{}. We can add the new tree.".format(pred @ sample_weights, r, self.opti_gap))

            # add the new prediction to the predictions matrix. Each column holds the predictions of one tree for all examples
            predictions = pred.reshape((-1, 1)) if predictions is None else np.concatenate((predictions, pred.reshape(-1, 1)), axis=1)
            if not self.quiet:
                print("whole predictions shape", predictions.shape)
                print("run dual...")

            # add the new estimator to the list of estimators
            self.estimators_.append(deepcopy(clf))

            # solve the dual to find the new sample weights (the best ones to reach optimality) for the next estimator to add
            if self.model == MODEL_LP_RATSCH:
                r, sample_weights, opti, self.estimator_weights_ = self.compute_dual_ratsch(predictions)
            elif self.model == MODEL_LP_DEMIRIZ:
                r, sample_weights, opti, self.estimator_weights_ = self.compute_dual_demiriz(predictions)
            elif self.model == MODEL_QP_MDBOOST:
                r, sample_weights, opti, self.estimator_weights_ = self.compute_dual_mdboost(predictions)

            # compute some metrics (margin concept) based on the AdaBoost intuition
            self.margins_ = (predictions @ np.array(self.estimator_weights_).reshape(-1, 1)).transpose().tolist()[0]
            self.margins_norm_ = (predictions @ np.array([float(i)/sum(self.estimator_weights_) for i in self.estimator_weights_]).reshape(-1, 1)).transpose().tolist()[0] if sum(self.estimator_weights_) > 0 else None

            if not self.quiet:
                print("after dual")
                print("We got", len(self.estimator_weights_), "trees with weights w:", self.estimator_weights_)
                print("Objective value at this stage is", opti)
                print("Value of r is", r)
                print("The sorted margins at this stage are", sorted(self.margins_))
                print("min margin:", min(self.margins_norm_), "\tmax margin:", max(self.margins_norm_), "\tavg margin:", np.mean(self.margins_norm_), "\tstd margin:", np.std(self.margins_norm_), "\tsum:", sum(self.margins_norm_))
                print("number of neg margins:", len([marg for marg in self.margins_norm_ if marg < 0]), "\tnumber of pos margins:", len([marg for marg in self.margins_norm_ if marg >= 0]))
                print("The new sample weights for the next iteration are", sample_weights.tolist(), "\n")

            self.n_iterations_ += 1

        # close the metrics file holding the per-iteration values
        if X_test is not None and y_test is not None:
            acc_stream.close()

        # compute the learning duration and fix the iteration counter
        self.duration_ = time.perf_counter() - start_time
        self.n_iterations_ -= 1

        # at the end, remove the useless estimators (the ones with weight == 0)
        zero_ind = [i for i, val in enumerate(self.estimator_weights_) if val == 0]
        if not self.quiet:
            print("\nall tree w", self.estimator_weights_, "\n", "zero ind", zero_ind)
        self.estimator_weights_ = np.delete(self.estimator_weights_, np.s_[zero_ind], axis=0)
        self.estimators_ = [clf for clf_id, clf in enumerate(self.estimators_) if clf_id not in zero_ind]
        predictions = np.delete(predictions, np.s_[zero_ind], axis=1)
        if not self.quiet:
            print("final pred shape", predictions.shape)

        # compute the training accuracy of the found ensemble and store it in the attribute `accuracy_`
        forest_pred_val = np.dot(predictions, np.array(self.estimator_weights_))
        train_pred_correct_or_not = np.where(forest_pred_val < 0, 0, 1)  # 1 if prediction is correct, 0 otherwise
        self.accuracy_ = sum(train_pred_correct_or_not)/len(y)

        # save the number of found estimators
        self.n_estimators_ = len(self.estimators_)

        # show each non-zero estimator weight and its tree expression if it has one
        if not self.quiet:
            for i, estimator in enumerate(sorted(zip(self.estimator_weights_, self.estimators_), key=lambda x: x[0], reverse=True)):
print("clf n_", i+1, " ==>\tweight: ", estimator[0], sep="", end="") if hasattr(estimator[1], "tree_") and isinstance(estimator[1].tree_, dict): print(" \tjson_string: ", estimator[1].tree_, sep="") else: print() if self.n_estimators_ == 0: raise NotFittedError("No tree selected") return self def compute_dual_ratsch(self, predictions): # primal is maximization r_ = cp.Variable() u_ = cp.Variable(self.n_instances) obj = cp.Minimize(r_) constr = [predictions[:, i] @ u_ <= r_ for i in range(predictions.shape[1])] constr.append(-u_ <= 0) if self.regulator > 0: constr.append(u_ <= self.regulator) constr.append(cp.sum(u_) == 1) problem = cp.Problem(obj, constr) if self.quiet: old_stdout = sys.stdout sys.stdout = open(os.devnull, "w") opti = problem.solve(solver=cp.GUROBI) if self.quiet: sys.stdout = old_stdout return r_.value, u_.value, opti, [x.dual_value.tolist() for x in problem.constraints[:predictions.shape[1]]] def compute_dual_demiriz(self, predictions): # primal is minimization u_ = cp.Variable(self.n_instances) obj = cp.Maximize(cp.sum(u_)) constr = [predictions[:, i] @ u_ <= 1 for i in range(predictions.shape[1])] constr.append(-u_ <= 0) constr.append(u_ <= self.regulator) problem = cp.Problem(obj, constr) if self.quiet: old_stdout = sys.stdout sys.stdout = open(os.devnull, "w") opti = problem.solve(solver=cp.GUROBI) if self.quiet: sys.stdout = old_stdout return 1, u_.value, opti, [x.dual_value.tolist() for x in problem.constraints[:predictions.shape[1]]] def compute_dual_mdboost(self, predictions): # primal is maximization r_ = cp.Variable() u_ = cp.Variable(self.n_instances) obj = cp.Minimize(r_ + 1/(2*self.regulator) * cp.quad_form((u_ - 1), self.A_inv)) constr = [predictions[:, i] @ u_ <= r_ for i in range(predictions.shape[1])] problem = cp.Problem(obj, constr) if self.quiet: old_stdout = sys.stdout sys.stdout = open(os.devnull, "w") opti = problem.solve(solver=cp.GUROBI) if self.quiet: sys.stdout = old_stdout return r_.value, u_.value, opti, [x.dual_value.tolist() for x in problem.constraints]

    def softmax(self, X, copy=True):
        """ Calculate the softmax function.

        The softmax function is calculated by
        np.exp(X) / np.sum(np.exp(X), axis=1)

        This will cause overflow when large values are exponentiated.
        Hence the largest value in each row is subtracted from each data
        point to prevent this.

        Parameters
        ----------
        X : array-like of float of shape (M, N)
            Argument to the logistic function.
        copy : bool, default=True
            Copy X or not.

        Returns
        -------
        out : ndarray of shape (M, N)
            Softmax function evaluated at every point in x.
        """
        if copy:
            X = np.copy(X)
        max_prob = np.max(X, axis=1).reshape((-1, 1))
        X -= max_prob
        np.exp(X, X)
        sum_prob = np.sum(X, axis=1).reshape((-1, 1))
        X /= sum_prob
        return X
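
    # Illustration of the weighted vote performed in `predict` below (a sketch,
    # not part of the API): if two trees with weights [0.7, 0.3] predict the
    # classes [1, 0] for a given instance, then
    # np.bincount([1, 0], weights=[0.7, 0.3]) gives array([0.3, 0.7]) and
    # np.argmax of that is 1, i.e. the class backed by the heavier tree wins.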

    def predict(self, X, y=None):
        # in case no tree has been found
        if self.n_estimators_ == 0:  # the fit method has not been called or the regulator is not suitable
            print(self.estimators_)
            print(self.estimator_weights_)
            raise NotFittedError("Call the fit method first or change the regulator")

        # run a prediction on each estimator
        predict_per_clf = np.asarray([clf.predict(X) for clf in self.estimators_]).transpose()

        # return the prediction based on all estimators: the class with the largest weighted vote is returned
        return np.apply_along_axis(lambda x: np.argmax(np.bincount(x, weights=self.estimator_weights_)), axis=1, arr=predict_per_clf.astype('int'))

    def predict_proba(self, X):
        classes = self.classes_[:, np.newaxis]
        pred = sum((np.array(estimator.predict(X)) == classes).T * w for estimator, w in zip(self.estimators_, self.estimator_weights_))
        pred /= sum(self.estimator_weights_)
        pred[:, 0] *= -1
        decision = pred.sum(axis=1)
        decision = np.vstack([-decision, decision]).T / 2
        return self.softmax(decision, False)

    def get_nodes_count(self):
        if self.n_estimators_ == 0:  # the fit method has not been called
            raise NotFittedError("Call the fit method first")
        return sum([clf.get_nodes_count() for clf in self.estimators_])