from sklearn.base import ClusterMixin
from sklearn.utils.validation import assert_all_finite, check_array
from sklearn.neighbors import DistanceMetric
from ..predictors.predictor import DL85Predictor
import numpy as np
class DL85Cluster(DL85Predictor, ClusterMixin):
    """An optimal binary decision tree used for clustering.

    The tree search itself is delegated to :class:`DL85Predictor`; this class
    only supplies default error and leaf-value functions when the caller does
    not provide any.

    Parameters
    ----------
    max_depth : int, default=1
        Maximum depth of the tree to be found
    min_sup : int, default=1
        Minimum number of examples per leaf
    error_function : callable, default=None
        Function called with the row indexes (``tids``) of a node and returning
        the error of that node. When None, :meth:`fit` installs
        :meth:`default_error` bound to the training data.
    max_error : int, default=0
        Maximum allowed error. Default value stands for no bound. If no tree can be found that is strictly better, the model remains empty.
    stop_after_better : bool, default=False
        A parameter used to indicate if the search will stop after finding a tree better than max_error
    time_limit : int, default=0
        Allocated time in second(s) for the search. Default value stands for no limit. The best tree found within the time limit is stored, if this tree is better than max_error.
    verbose : bool, default=False
        A parameter used to switch on/off the print of what happens during the search
    desc : bool, default=False
        A parameter used to indicate if the sorting of the items is done in descending order of information gain
    asc : bool, default=False
        A parameter used to indicate if the sorting of the items is done in ascending order of information gain
    repeat_sort : bool, default=False
        A parameter used to indicate whether the sorting of items is done at each level of the lattice or only before the search
    leaf_value_function : callable, default=None
        Function called with the row indexes (``tids``) of a node and returning
        the value attached to that node. When None, :meth:`fit` installs
        :meth:`default_leaf_value` bound to the training data.
    print_output : bool, default=False
        A parameter used to indicate if the search output will be printed or not

    Attributes
    ----------
    tree_ : str
        Outputted tree in serialized form; remains empty as long as no model is learned.
    size_ : int
        The size of the outputted tree
    depth_ : int
        Depth of the found tree
    error_ : float
        Error of the found tree
    accuracy_ : float
        Accuracy of the found tree on training set
    lattice_size_ : int
        The number of nodes explored before the optimal tree was found
    runtime_ : float
        Time of the optimal decision tree search
    timeout_ : bool
        Whether the search reached timeout or not
    classes_ : ndarray, shape (n_classes,)
        The classes seen at :meth:`fit`.
    """

    def __init__(
            self,
            max_depth=1,
            min_sup=1,
            error_function=None,
            max_error=0,
            stop_after_better=False,
            time_limit=0,
            verbose=False,
            desc=False,
            asc=False,
            repeat_sort=False,
            leaf_value_function=None,
            print_output=False):
        # This class does not expose fast_error_function; it is always
        # forwarded to the parent as None.
        super().__init__(
            max_depth=max_depth,
            min_sup=min_sup,
            error_function=error_function,
            fast_error_function=None,
            max_error=max_error,
            stop_after_better=stop_after_better,
            time_limit=time_limit,
            verbose=verbose,
            desc=desc,
            asc=asc,
            repeat_sort=repeat_sort,
            leaf_value_function=leaf_value_function,
            print_output=print_output)

    @staticmethod
    def default_error(tids, X):
        """Return the summed Euclidean distance of the selected rows to their centroid.

        Parameters
        ----------
        tids : iterable of int
            Row indexes into ``X``.
        X : array-like, shape (n_samples, n_features)
            Data the rows are taken from. The selected rows are cast to int32
            before any computation.

        Returns
        -------
        float
            Sum of the distances, rounded to 2 decimals; 0.0 when ``tids`` is
            empty.
        """
        row_ids = np.asarray(list(tids), dtype=np.intp)
        members = np.asarray(np.asarray(X)[row_ids, :], dtype='int32')
        if members.shape[0] == 0:
            # No rows means no spread; also avoids taking the mean of nothing.
            return 0.0
        centroid = members.mean(axis=0)
        # Row-wise L2 norm computed in one NumPy call instead of one
        # DistanceMetric.pairwise call per row.
        distances = np.linalg.norm(members - centroid, axis=1)
        return round(distances.sum(), 2)

    @staticmethod
    def default_leaf_value(tids, X):
        """Return the mean of all values in the selected rows of ``X``.

        Parameters
        ----------
        tids : iterable of int
            Row indexes into ``X``.
        X : array-like, shape (n_samples, n_features)
            Data the rows are taken from.

        Returns
        -------
        float
            Mean over every element of the selected rows, rounded to 2
            decimals.
        """
        # axis=0 selects whole rows. Without it, take() indexes the flattened
        # array and would pick individual elements instead of rows.
        members = np.asarray(X).take(list(tids), axis=0)
        return round(np.mean(members), 2)

    def _uses_default(self, param_name):
        """Tell whether ``param_name`` is unset or holds a default installed by fit."""
        current = getattr(self, param_name)
        if current is None:
            return True
        return current is getattr(self, '_auto_' + param_name, None)

    @staticmethod
    def _bind(impl, data):
        """Return a one-argument function calling ``impl(tids, data)``."""
        return lambda tids: impl(tids, data)

    def _install_default_functions(self, X, X_error):
        """Bind the default error / leaf-value functions to the current data.

        Only parameters that are None, or that still hold a function installed
        by a previous call, are (re)bound. Callables supplied by the user are
        left untouched.

        Raises
        ------
        ValueError
            If a default is needed and ``X_error`` has a different number of
            rows than ``X``.
        """
        candidates = (
            ('error_function', self.default_error),
            ('leaf_value_function', self.default_leaf_value),
        )
        pending = [(name, impl) for name, impl in candidates
                   if self._uses_default(name)]
        if not pending:
            return

        # Array view of X so the defaults can index rows even when X was given
        # as a list of lists.
        X_rows = np.asarray(X)
        if X_error is None:
            data = X_rows
        elif X_error.shape[0] == X_rows.shape[0]:
            data = X_error
        else:
            raise ValueError("X_error does not have the same number of rows as X")

        for name, impl in pending:
            bound = self._bind(impl, data)
            setattr(self, name, bound)
            # Remember the generated function so a later fit() can tell it
            # apart from a user-supplied one and rebind it to the new data.
            setattr(self, '_auto_' + name, bound)

    def fit(self, X, X_error=None):
        """Fit the tree on ``X``, installing default error functions if needed.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The training input samples. If X_error is provided, it represents explanation input
        X_error : array-like, shape (n_samples, n_features_1), default=None
            The training input used to calculate error. If it is not provided X is used to calculate error

        Returns
        -------
        self : object
            Returns self.

        Raises
        ------
        ValueError
            If ``X_error`` is used by a default function and does not have the
            same number of rows as ``X``.
        """
        if X_error is not None:
            # Reject NaN / infinity, then convert to an int32 array.
            assert_all_finite(X_error)
            X_error = check_array(X_error, dtype='int32')

        # Rebound on every call so a refit never keeps scoring against the
        # data of a previous fit.
        self._install_default_functions(X, X_error)

        # The actual search is implemented by the parent predictor.
        super().fit(X)
        return self

    def predict(self, X):
        """Predict for ``X`` by delegating to :meth:`DL85Predictor.predict`.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        y
            Whatever :meth:`DL85Predictor.predict` returns for ``X``
            (presumably one value per sample -- confirm against the parent
            class).
        """
        return super().predict(X)