# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from abc import ABC
from typing import Any, List, Literal, Mapping, Optional, Set, Tuple, Union
import numpy as np
from elasticsearch import Elasticsearch
from numpy.typing import ArrayLike
from .._optional import import_optional_dependency
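# Check the optional scikit-learn dependency (warning on unsupported versions)
# before attempting the sklearn imports below.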
import_optional_dependency("sklearn", on_version="warn")
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.ensemble._gb_losses import (
BinomialDeviance,
HuberLossFunction,
LeastSquaresError,
)
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.utils.validation import check_array
from eland.common import ensure_es_client
from eland.ml.common import TYPE_CLASSIFICATION, TYPE_REGRESSION
from ._sklearn_deserializers import Tree
from .common import ModelDefinitionKeyError


class ESGradientBoostingModel(ABC):
    """
    Abstract base class for converting an Elastic ML model into an sklearn Pipeline.
    """
def __init__(
self,
es_client: Union[str, List[str], Tuple[str, ...], "Elasticsearch"],
model_id: str,
) -> None:
"""
Parameters
----------
es_client : Elasticsearch client argument(s)
- elasticsearch-py parameters or
- elasticsearch-py instance
model_id : str
The unique identifier of the trained inference model in Elasticsearch.
Raises
------
RuntimeError
            On failure to retrieve trained model information for the specified model ID.
        ValueError
            The model is expected to be trained in the Elastic Stack. Models initially
            imported from xgboost, lgbm, or sklearn are not supported.
"""
self.es_client: Elasticsearch = ensure_es_client(es_client)
self.model_id = model_id
self._trained_model_result = self.es_client.ml.get_trained_models(
model_id=self.model_id,
decompress_definition=True,
include=["hyperparameters", "definition"],
)
if (
"trained_model_configs" not in self._trained_model_result
or len(self._trained_model_result["trained_model_configs"]) == 0
):
raise RuntimeError(
f"Failed to retrieve the trained model for model ID {self.model_id!r}"
)
if "metadata" not in self._trained_model_result["trained_model_configs"][0]:
raise ValueError(
"Error initializing sklearn classifier. Incorrect prior class probability. "
+ "Note: only export of models trained in the Elastic Stack is supported."
)
preprocessors = []
if "preprocessors" in self._definition:
preprocessors = self._definition["preprocessors"]
(
self.feature_names_in_,
self.input_field_names,
) = ESGradientBoostingModel._get_feature_names_in_(
preprocessors,
self._definition["trained_model"]["ensemble"]["feature_names"],
self._trained_model_result["trained_model_configs"][0]["input"][
"field_names"
],
)
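        # Map each feature name to its column index for use when deserializing
        # the individual trees below.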
feature_names_map = {name: i for i, name in enumerate(self.feature_names_in_)}
trained_models = self._definition["trained_model"]["ensemble"]["trained_models"]
self._trees = []
for trained_model in trained_models:
self._trees.append(Tree(trained_model["tree"], feature_names_map))
        # Tree 0 is the constant (prior) estimator; the remaining trees are the
        # boosting stages.
self.n_estimators = len(trained_models) - 1
def _initialize_estimators(self, decision_tree_type) -> None:
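        # Populate sklearn's internal estimators_ array: one row per boosting
        # stage with a single output column, skipping tree 0 (the constant
        # estimator).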
self.estimators_ = np.ndarray(
(len(self._trees) - 1, 1), dtype=decision_tree_type
)
self.n_estimators_ = self.estimators_.shape[0]
for i in range(self.n_estimators_):
estimator = decision_tree_type()
estimator.tree_ = self._trees[i + 1].tree
estimator.n_features_in_ = self.n_features_in_
estimator.max_depth = self._max_depth
estimator.max_features_ = self.max_features_
self.estimators_[i, 0] = estimator
def _extract_common_parameters(self) -> None:
self.n_features_in_ = len(self.feature_names_in_)
self.max_features_ = self.n_features_in_
@property
def _max_depth(self) -> int:
        return max(tree.max_depth for tree in self._trees)
@property
def _n_outputs(self) -> int:
return self._trees[0].n_outputs
@property
def _definition(self) -> Mapping[Union[str, int], Any]:
return self._trained_model_result["trained_model_configs"][0]["definition"]
    @staticmethod
    def _get_feature_names_in_(
        preprocessors, feature_names, field_names
    ) -> Tuple[List[str], Set[str]]:
        # Trace every model feature back to the original input field it was
        # derived from, accounting for features generated by preprocessors.
        input_field_names = set()

        def add_input_field_name(
            preprocessor: Mapping[str, Any], preprocessor_type: str, feature_name: str
        ) -> None:
            if feature_name in feature_names:
                input_field_names.add(preprocessor[preprocessor_type]["field"])

        for preprocessor in preprocessors:
            if "target_mean_encoding" in preprocessor:
                add_input_field_name(
                    preprocessor,
                    "target_mean_encoding",
                    preprocessor["target_mean_encoding"]["feature_name"],
                )
            elif "frequency_encoding" in preprocessor:
                add_input_field_name(
                    preprocessor,
                    "frequency_encoding",
                    preprocessor["frequency_encoding"]["feature_name"],
                )
            elif "one_hot_encoding" in preprocessor:
                for feature_name in preprocessor["one_hot_encoding"][
                    "hot_map"
                ].values():
                    add_input_field_name(
                        preprocessor, "one_hot_encoding", feature_name
                    )
        for field_name in field_names:
            if field_name in feature_names and field_name not in input_field_names:
                input_field_names.add(field_name)
        return feature_names, input_field_names
@property
def preprocessors(self) -> List[Any]:
"""
Returns the list of preprocessor JSON definitions.
Returns
-------
List[Any]
List of preprocessors definitions or [].
"""
if "preprocessors" in self._definition:
return self._definition["preprocessors"]
return []

    def fit(self, X, y, sample_weight=None, monitor=None) -> None:
        """
        Override of the sklearn fit() method. It does nothing, since Elastic ML
        models are trained in the Elastic Stack or imported.
        """
        # Do nothing: the model is fitted using the Elasticsearch API.
        pass


class ESGradientBoostingClassifier(ESGradientBoostingModel, GradientBoostingClassifier):
"""
Elastic ML model wrapper compatible with sklearn GradientBoostingClassifier.
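
    Examples
    --------
    A minimal sketch; the cluster URL and model ID are illustrative and assume a
    binary-classification job trained in the Elastic Stack::

        >>> import numpy as np  # doctest: +SKIP
        >>> clf = ESGradientBoostingClassifier(
        ...     "http://localhost:9200", "my-classification-model"
        ... )  # doctest: +SKIP
        >>> X = np.random.rand(10, clf.n_features_in_)  # doctest: +SKIP
        >>> clf.predict(X)  # doctest: +SKIP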
"""
def __init__(
self,
es_client: Union[str, List[str], Tuple[str, ...], "Elasticsearch"],
model_id: str,
) -> None:
"""
Parameters
----------
es_client : Elasticsearch client argument(s)
- elasticsearch-py parameters or
- elasticsearch-py instance
model_id : str
The unique identifier of the trained inference model in Elasticsearch.
Raises
------
NotImplementedError
Multi-class classification is not supported at the moment.
ValueError
The classifier should be defined for at least 2 classes.
ModelDefinitionKeyError
If required data cannot be extracted from the model definition due to a schema change.
"""
try:
ESGradientBoostingModel.__init__(self, es_client, model_id)
self._extract_common_parameters()
GradientBoostingClassifier.__init__(
self,
learning_rate=1.0,
n_estimators=self.n_estimators,
max_depth=self._max_depth,
)
if "classification_labels" in self._definition["trained_model"]["ensemble"]:
self.classes_ = np.array(
self._definition["trained_model"]["ensemble"][
"classification_labels"
]
)
else:
self.classes_ = None
self.n_outputs = self._n_outputs
if self.classes_ is not None:
self.n_classes_ = len(self.classes_)
elif self.n_outputs <= 2:
self.n_classes_ = 2
else:
self.n_classes_ = self.n_outputs
if self.n_classes_ == 2:
self._loss = BinomialDeviance(self.n_classes_)
# self.n_outputs = 1
elif self.n_classes_ > 2:
raise NotImplementedError("Only binary classification is implemented.")
else:
raise ValueError(f"At least 2 classes required. got {self.n_classes_}.")
self.init_ = self._initialize_init_()
self._initialize_estimators(DecisionTreeClassifier)
except KeyError as ex:
raise ModelDefinitionKeyError(ex) from ex
@property
def analysis_type(self) -> Literal["classification"]:
return TYPE_CLASSIFICATION
def _initialize_init_(self) -> DummyClassifier:
estimator = DummyClassifier(strategy="prior")
estimator.n_classes_ = self.n_classes_
estimator.n_outputs_ = self.n_outputs
estimator.classes_ = np.arange(self.n_classes_)
estimator._strategy = estimator.strategy
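        # DummyClassifier.fit() would normally set _strategy; assign it manually
        # because fit() is never called on the exported model.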
if self.n_classes_ == 2:
log_odds = self._trees[0].tree.value.flatten()[0]
if np.isnan(log_odds):
raise ValueError(
"Error initializing sklearn classifier. Incorrect prior class probability. "
+ "Note: only export of models trained in the Elastic Stack is supported."
)
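            # The constant tree stores the prior as log-odds; apply the logistic
            # sigmoid to recover the probability of the positive class.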
class_prior = 1 / (1 + np.exp(-log_odds))
estimator.class_prior_ = np.array([1 - class_prior, class_prior])
else:
raise NotImplementedError("Only binary classification is implemented.")
return estimator
def predict_proba(
self, X, feature_names_in: Optional[Union["ArrayLike", List[str]]] = None
) -> "ArrayLike":
"""Predict class probabilities for X.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The input samples.
        feature_names_in : {array of str, list of str} of length n_features
            Feature names of the corresponding columns in X. Important, since the
            column list can be extended by a ColumnTransformer within the pipeline.
            By default None.
Returns
-------
ArrayLike of shape (n_samples, n_classes)
The class probabilities of the input samples. The order of the
classes corresponds to that in the attribute :term:`classes_`.
"""
if feature_names_in is not None:
if X.shape[1] != len(feature_names_in):
                raise ValueError(
                    f"Dimension mismatch: X has {X.shape[1]} columns, but "
                    f"feature_names_in has {len(feature_names_in)} entries."
                )
if isinstance(feature_names_in, np.ndarray):
feature_names_in = feature_names_in.tolist()
# select columns used by the model in the correct order
X = X[:, [feature_names_in.index(fn) for fn in self.feature_names_in_]]
X = check_array(X)
return GradientBoostingClassifier.predict_proba(self, X)
def predict(
self,
X: "ArrayLike",
feature_names_in: Optional[Union["ArrayLike", List[str]]] = None,
) -> "ArrayLike":
"""Predict class for X.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The input samples.
        feature_names_in : {array of str, list of str} of length n_features
            Feature names of the corresponding columns in X. Important, since the
            column list can be extended by a ColumnTransformer within the pipeline.
            By default None.
Returns
-------
ArrayLike of shape (n_samples,)
            The predicted classes.
"""
if feature_names_in is not None:
if X.shape[1] != len(feature_names_in):
                raise ValueError(
                    f"Dimension mismatch: X has {X.shape[1]} columns, but "
                    f"feature_names_in has {len(feature_names_in)} entries."
                )
if isinstance(feature_names_in, np.ndarray):
feature_names_in = feature_names_in.tolist()
# select columns used by the model in the correct order
X = X[:, [feature_names_in.index(fn) for fn in self.feature_names_in_]]
X = check_array(X)
return GradientBoostingClassifier.predict(self, X)


class ESGradientBoostingRegressor(ESGradientBoostingModel, GradientBoostingRegressor):
"""
Elastic ML model wrapper compatible with sklearn GradientBoostingRegressor.
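
    Examples
    --------
    A minimal sketch; the cluster URL and model ID are illustrative and assume a
    regression job trained in the Elastic Stack::

        >>> import numpy as np  # doctest: +SKIP
        >>> reg = ESGradientBoostingRegressor(
        ...     "http://localhost:9200", "my-regression-model"
        ... )  # doctest: +SKIP
        >>> X = np.random.rand(10, reg.n_features_in_)  # doctest: +SKIP
        >>> reg.predict(X)  # doctest: +SKIP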
"""
def __init__(
self,
es_client: Union[str, List[str], Tuple[str, ...], "Elasticsearch"],
model_id: str,
) -> None:
"""
Parameters
----------
es_client : Elasticsearch client argument(s)
- elasticsearch-py parameters or
- elasticsearch-py instance
model_id : str
The unique identifier of the trained inference model in Elasticsearch.
Raises
------
NotImplementedError
Only MSE, MSLE, and Huber loss functions are supported.
ModelDefinitionKeyError
If required data cannot be extracted from the model definition due to a schema change.
"""
try:
ESGradientBoostingModel.__init__(self, es_client, model_id)
self._extract_common_parameters()
GradientBoostingRegressor.__init__(
self,
learning_rate=1.0,
n_estimators=self.n_estimators,
max_depth=self._max_depth,
)
self.n_outputs = 1
            analysis = self._trained_model_result["trained_model_configs"][0][
                "metadata"
            ]["analytics_config"]["analysis"][self.analysis_type]
            loss_function = analysis["loss_function"]
            if loss_function == "mse" or loss_function == "msle":
                # Both "mse" and "msle" map to sklearn's squared error criterion;
                # the loss function affects training, not the exported predictions.
                self.criterion = "squared_error"
                self._loss = LeastSquaresError()
            elif loss_function == "huber":
                loss_parameter = analysis["loss_function_parameter"]
                self.criterion = "huber"
                self._loss = HuberLossFunction(loss_parameter)
else:
raise NotImplementedError(
"Only MSE, MSLE and Huber loss functions are supported."
)
self.init_ = self._initialize_init_()
self._initialize_estimators(DecisionTreeRegressor)
except KeyError as ex:
raise ModelDefinitionKeyError(ex) from ex
@property
def analysis_type(self) -> Literal["regression"]:
return TYPE_REGRESSION
def _initialize_init_(self) -> DummyRegressor:
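        # Tree 0 holds the constant (prior) prediction; sklearn expects it
        # wrapped in a DummyRegressor init_ estimator.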
constant = self._trees[0].tree.value[0]
estimator = DummyRegressor(
strategy="constant",
constant=constant,
)
estimator.constant_ = np.array([constant])
estimator.n_outputs_ = 1
return estimator
def predict(
self,
X: "ArrayLike",
feature_names_in: Optional[Union["ArrayLike", List[str]]] = None,
) -> "ArrayLike":
"""Predict targets for X.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The input samples.
        feature_names_in : {array of str, list of str} of length n_features
            Feature names of the corresponding columns in X. Important, since the
            column list can be extended by a ColumnTransformer within the pipeline.
            By default None.
Returns
-------
ArrayLike of shape (n_samples,)
The predicted values.
"""
if feature_names_in is not None:
if X.shape[1] != len(feature_names_in):
                raise ValueError(
                    f"Dimension mismatch: X has {X.shape[1]} columns, but "
                    f"feature_names_in has {len(feature_names_in)} entries."
                )
            if isinstance(feature_names_in, np.ndarray):
feature_names_in = feature_names_in.tolist()
# select columns used by the model in the correct order
X = X[:, [feature_names_in.index(fn) for fn in self.feature_names_in_]]
X = check_array(X)
return GradientBoostingRegressor.predict(self, X)