[ML] Export ML model as sklearn Pipeline (#509)

Closes #503

Note: I also had to pin the Sphinx version to 5.3.0 since, starting with 6.0, Sphinx hits a TypeError bug that causes a CI failure.
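
For context, a minimal sketch of how the new export API is meant to be used. The host, index name, and model ID below are placeholders, and the model is assumed to have been trained with data frame analytics in the Elastic Stack:

```python
import eland as ed
from eland.ml import MLModel

# Hypothetical regression model trained in the Elastic Stack
model = MLModel(es_client="http://localhost:9200", model_id="flights-regression-model")

# New in this PR: convert the Elastic ML model into an sklearn Pipeline
pipeline = model.export_model()

# fit() is a no-op for the model itself but initializes the ColumnTransformer step
X = ed.eland_to_pandas(ed.DataFrame("http://localhost:9200", "flights").head(10))
pipeline.fit(X)

predictions = pipeline.predict(
    X, feature_names_in=pipeline["preprocessor"].get_feature_names_out()
)
```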
Valeriy Khakhutskyy 2023-02-01 16:17:06 +01:00 committed by GitHub
parent 2ea96322b3
commit 0576114a1d
18 changed files with 1117 additions and 22 deletions

.ci/test-matrix.yml Executable file → Normal file
View File

View File

@ -191,7 +191,7 @@ currently using a minimum version of PyCharm 2019.2.4.
``` bash
> import eland as ed
> ed_df = ed.DataFrame('localhost', 'flights')
> ed_df = ed.DataFrame('http://localhost:9200', 'flights')
```
* To run the automatic formatter and check for lint issues run

View File

@ -0,0 +1,16 @@
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

View File

@ -0,0 +1,217 @@
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from typing import Any, Dict
import numpy as np
from .._optional import import_optional_dependency
import_optional_dependency("sklearn", on_version="warn")
import sklearn
from sklearn.preprocessing import FunctionTransformer
class Tree:
"""Wrapper to create sklearn Tree objects from Elastic ML tree
description in JSON format.
"""
def __init__(
self,
json_tree: Dict[str, Any],
feature_names_map: Dict[str, int],
):
tree_leaf = -1
node_count = len(json_tree["tree_structure"])
children_left = np.ones((node_count,), dtype=int) * tree_leaf
children_right = np.ones((node_count,), dtype=int) * tree_leaf
feature = np.ones((node_count,), dtype=int) * -2
threshold = np.ones((node_count,), dtype=float) * -2
impurity = np.zeros((node_count,), dtype=float)
# value works only for regression and binary classification
value = np.zeros((node_count, 1, 1), dtype="<f8")
n_node_samples = np.zeros((node_count,), dtype=int)
# parse values from the JSON tree
feature_names = json_tree["feature_names"]
for json_node in json_tree["tree_structure"]:
node_id = json_node["node_index"]
if "number_samples" in json_node:
n_node_samples[node_id] = json_node["number_samples"]
else:
n_node_samples[node_id] = 0
if "leaf_value" not in json_node:
children_left[node_id] = json_node["left_child"]
children_right[node_id] = json_node["right_child"]
feature[node_id] = feature_names_map[
feature_names[json_node["split_feature"]]
]
threshold[node_id] = json_node["threshold"]
if "split_gain" in json_node:
impurity[node_id] = json_node["split_gain"]
else:
impurity[node_id] = -1
else:
value[node_id, 0, 0] = json_node["leaf_value"]
# iterate through tree to get max depth and expected values
weighted_n_node_samples = n_node_samples.copy()
self.max_depth = Tree._compute_expectations(
children_left=children_left,
children_right=children_right,
node_sample_weight=weighted_n_node_samples,
values=value,
node_index=0,
)
self.n_outputs = value.shape[-1]
# initialize the sklearn tree
self.tree = sklearn.tree._tree.Tree(
len(feature_names), np.array([1], dtype=int), 1
)
node_state = np.array(
[
(
children_left[i],
children_right[i],
feature[i],
threshold[i],
impurity[i],
n_node_samples[i],
weighted_n_node_samples[i],
)
for i in range(node_count)
],
dtype=[
("left_child", "<i8"),
("right_child", "<i8"),
("feature", "<i8"),
("threshold", "<f8"),
("impurity", "<f8"),
("n_node_samples", "<i8"),
("weighted_n_node_samples", "<f8"),
],
)
state = {
"max_depth": self.max_depth,
"node_count": node_count,
"nodes": node_state,
"values": value,
}
self.tree.__setstate__(state)
@staticmethod
def _compute_expectations(
children_left, children_right, node_sample_weight, values, node_index
) -> int:
if children_right[node_index] == -1:
return 0
left_index = children_left[node_index]
right_index = children_right[node_index]
depth_left = Tree._compute_expectations(
children_left, children_right, node_sample_weight, values, left_index
)
depth_right = Tree._compute_expectations(
children_left, children_right, node_sample_weight, values, right_index
)
left_weight = node_sample_weight[left_index]
right_weight = node_sample_weight[right_index]
v = (
(
left_weight * values[left_index, :]
+ right_weight * values[right_index, :]
)
/ (left_weight + right_weight)
if left_weight + right_weight > 0
else 0
)
values[node_index, :] = v
return max(depth_left, depth_right) + 1
class TargetMeanEncoder(FunctionTransformer):
"""FunctionTransformer implementation of the target mean encoder, which is
deserialized from the Elastic ML preprocessor description in JSON format.
"""
def __init__(self, preprocessor: Dict[str, Any]):
self.preprocessor = preprocessor
target_map = self.preprocessor["target_mean_encoding"]["target_map"]
feature_name_out = self.preprocessor["target_mean_encoding"]["feature_name"]
self.field_name_in = self.preprocessor["target_mean_encoding"]["field"]
fallback_value = self.preprocessor["target_mean_encoding"]["default_value"]
def func(column):
return np.array(
[
target_map[str(category)]
if category in target_map
else fallback_value
for category in column
]
).reshape(-1, 1)
def feature_names_out(ft, carr):
return [feature_name_out if c == self.field_name_in else c for c in carr]
super().__init__(func=func, feature_names_out=feature_names_out)
class FrequencyEncoder(FunctionTransformer):
"""FunctionTransformer implementation of the frequency encoder, which is
deserialized from the Elastic ML preprocessor description in JSON format.
"""
def __init__(self, preprocessor: Dict[str, Any]):
self.preprocessor = preprocessor
frequency_map = self.preprocessor["frequency_encoding"]["frequency_map"]
feature_name_out = self.preprocessor["frequency_encoding"]["feature_name"]
self.field_name_in = self.preprocessor["frequency_encoding"]["field"]
fallback_value = 0.0
def func(column):
return np.array(
[
frequency_map[str(category)]
if category in frequency_map
else fallback_value
for category in column
]
).reshape(-1, 1)
def feature_names_out(ft, carr):
return [feature_name_out if c == self.field_name_in else c for c in carr]
super().__init__(func=func, feature_names_out=feature_names_out)
class OneHotEncoder(sklearn.preprocessing.OneHotEncoder):
"""Wrapper for sklearn one-hot encoder, which is deserialized from the
Elastic ML preprocessor description in JSON format.
"""
def __init__(self, preprocessor: Dict[str, Any]):
self.preprocessor = preprocessor
self.field_name_in = self.preprocessor["one_hot_encoding"]["field"]
self.cats = [list(self.preprocessor["one_hot_encoding"]["hot_map"].keys())]
super().__init__(categories=self.cats, handle_unknown="ignore")
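
As an aside, a small runnable sketch of the preprocessor JSON shape the encoders above consume; the field, categories, and frequencies are invented for illustration:

```python
import numpy as np

from eland.ml.exporters._sklearn_deserializers import FrequencyEncoder

# Illustrative Elastic ML preprocessor definition; keys mirror those read above
preprocessor = {
    "frequency_encoding": {
        "field": "OriginWeather",
        "feature_name": "OriginWeather_frequency",
        "frequency_map": {"Sunny": 0.6, "Rain": 0.3, "Hail": 0.1},
    }
}

encoder = FrequencyEncoder(preprocessor)
# Categories missing from frequency_map fall back to 0.0
print(encoder.transform(np.array(["Sunny", "Hail", "Fog"])))
# [[0.6]
#  [0.1]
#  [0. ]]
```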

View File

@ -0,0 +1,46 @@
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import eland
class ModelDefinitionKeyError(Exception):
"""
This exception is raised when a key is not found in the model definition.
Attributes:
missed_key (str): The key that was not found in the model definition.
available_keys (List[str]): The list of keys that are available in the model definition.
Examples:
model_definition = {"key1": "value1", "key2": "value2"}
try:
model_definition["key3"]
except KeyError as ex:
raise ModelDefinitionKeyError(ex) from ex
"""
def __init__(self, ex: KeyError):
self.missed_key = ex.args[0]
def __str__(self):
return (
f'Key "{self.missed_key}" is not available. '
+ "The model definition may have changed. "
+ "Make sure you are using an Elasticsearch version compatible "
+ f"with Eland {eland.__version__}."
)

View File

@ -0,0 +1,472 @@
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from abc import ABC
from typing import Any, List, Literal, Mapping, Optional, Set, Tuple, Union
import numpy as np
from elasticsearch import Elasticsearch
from numpy.typing import ArrayLike
from .._optional import import_optional_dependency
import_optional_dependency("sklearn", on_version="warn")
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.ensemble._gb_losses import (
BinomialDeviance,
HuberLossFunction,
LeastSquaresError,
)
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.utils.validation import check_array
from eland.common import ensure_es_client
from eland.ml.common import TYPE_CLASSIFICATION, TYPE_REGRESSION
from ._sklearn_deserializers import Tree
from .common import ModelDefinitionKeyError
class ESGradientBoostingModel(ABC):
"""
Abstract class for converting Elastic ML model into sklearn Pipeline.
"""
def __init__(
self,
es_client: Union[str, List[str], Tuple[str, ...], "Elasticsearch"],
model_id: str,
) -> None:
"""
Parameters
----------
es_client : Elasticsearch client argument(s)
- elasticsearch-py parameters or
- elasticsearch-py instance
model_id : str
The unique identifier of the trained inference model in Elasticsearch.
Raises
------
RuntimeError
On failure to retrieve trained model information for the specified model ID.
ValueError
The model is expected to be trained in the Elastic Stack. Models initially imported
from xgboost, lgbm, or sklearn are not supported.
"""
self.es_client: Elasticsearch = ensure_es_client(es_client)
self.model_id = model_id
self._trained_model_result = self.es_client.ml.get_trained_models(
model_id=self.model_id,
decompress_definition=True,
include=["hyperparameters", "definition"],
)
if (
"trained_model_configs" not in self._trained_model_result
or len(self._trained_model_result["trained_model_configs"]) == 0
):
raise RuntimeError(
f"Failed to retrieve the trained model for model ID {self.model_id!r}"
)
if "metadata" not in self._trained_model_result["trained_model_configs"][0]:
raise ValueError(
"Error initializing sklearn classifier. Incorrect prior class probability. "
+ "Note: only export of models trained in the Elastic Stack is supported."
)
preprocessors = []
if "preprocessors" in self._definition:
preprocessors = self._definition["preprocessors"]
(
self.feature_names_in_,
self.input_field_names,
) = ESGradientBoostingModel._get_feature_names_in_(
preprocessors,
self._definition["trained_model"]["ensemble"]["feature_names"],
self._trained_model_result["trained_model_configs"][0]["input"][
"field_names"
],
)
feature_names_map = {name: i for i, name in enumerate(self.feature_names_in_)}
trained_models = self._definition["trained_model"]["ensemble"]["trained_models"]
self._trees = []
for trained_model in trained_models:
self._trees.append(Tree(trained_model["tree"], feature_names_map))
# the 0th tree is the constant estimator
self.n_estimators = len(trained_models) - 1
def _initialize_estimators(self, decision_tree_type) -> None:
self.estimators_ = np.ndarray(
(len(self._trees) - 1, 1), dtype=decision_tree_type
)
self.n_estimators_ = self.estimators_.shape[0]
for i in range(self.n_estimators_):
estimator = decision_tree_type()
estimator.tree_ = self._trees[i + 1].tree
estimator.n_features_in_ = self.n_features_in_
estimator.max_depth = self._max_depth
estimator.max_features_ = self.max_features_
self.estimators_[i, 0] = estimator
def _extract_common_parameters(self) -> None:
self.n_features_in_ = len(self.feature_names_in_)
self.max_features_ = self.n_features_in_
@property
def _max_depth(self) -> int:
return max(map(lambda x: x.max_depth, self._trees))
@property
def _n_outputs(self) -> int:
return self._trees[0].n_outputs
@property
def _definition(self) -> Mapping[Union[str, int], Any]:
return self._trained_model_result["trained_model_configs"][0]["definition"]
@staticmethod
def _get_feature_names_in_(
preprocessors, feature_names, field_names
) -> Tuple[List[str], Set[str]]:
input_field_names = set()
def add_input_field_name(preprocessor_type: str, feature_name: str) -> None:
if feature_name in feature_names:
input_field_names.add(preprocessor[preprocessor_type]["field"])
for preprocessor in preprocessors:
if "target_mean_encoding" in preprocessor:
add_input_field_name(
"target_mean_encoding",
preprocessor["target_mean_encoding"]["feature_name"],
)
elif "frequency_encoding" in preprocessor:
add_input_field_name(
"frequency_encoding",
preprocessor["frequency_encoding"]["feature_name"],
)
elif "one_hot_encoding" in preprocessor:
for feature_name in preprocessor["one_hot_encoding"][
"hot_map"
].values():
add_input_field_name("one_hot_encoding", feature_name)
for field_name in field_names:
if field_name in feature_names and field_name not in input_field_names:
input_field_names.add(field_name)
return feature_names, input_field_names
@property
def preprocessors(self) -> List[Any]:
"""
Returns the list of preprocessor JSON definitions.
Returns
-------
List[Any]
List of preprocessor definitions or [].
"""
if "preprocessors" in self._definition:
return self._definition["preprocessors"]
return []
def fit(self, X, y, sample_weight=None, monitor=None) -> None:
"""
Override of the sklearn fit() method. It does nothing since Elastic ML models are
trained in the Elastic Stack or imported.
"""
# Do nothing; the model is fitted using the Elasticsearch API
pass
class ESGradientBoostingClassifier(ESGradientBoostingModel, GradientBoostingClassifier):
"""
Elastic ML model wrapper compatible with sklearn GradientBoostingClassifier.
"""
def __init__(
self,
es_client: Union[str, List[str], Tuple[str, ...], "Elasticsearch"],
model_id: str,
) -> None:
"""
Parameters
----------
es_client : Elasticsearch client argument(s)
- elasticsearch-py parameters or
- elasticsearch-py instance
model_id : str
The unique identifier of the trained inference model in Elasticsearch.
Raises
------
NotImplementedError
Multi-class classification is not supported at the moment.
ValueError
The classifier should be defined for at least 2 classes.
ModelDefinitionKeyError
If required data cannot be extracted from the model definition due to a schema change.
"""
try:
ESGradientBoostingModel.__init__(self, es_client, model_id)
self._extract_common_parameters()
GradientBoostingClassifier.__init__(
self,
learning_rate=1.0,
n_estimators=self.n_estimators,
max_depth=self._max_depth,
)
if "classification_labels" in self._definition["trained_model"]["ensemble"]:
self.classes_ = np.array(
self._definition["trained_model"]["ensemble"][
"classification_labels"
]
)
else:
self.classes_ = None
self.n_outputs = self._n_outputs
if self.classes_ is not None:
self.n_classes_ = len(self.classes_)
elif self.n_outputs <= 2:
self.n_classes_ = 2
else:
self.n_classes_ = self.n_outputs
if self.n_classes_ == 2:
self._loss = BinomialDeviance(self.n_classes_)
# self.n_outputs = 1
elif self.n_classes_ > 2:
raise NotImplementedError("Only binary classification is implemented.")
else:
raise ValueError(f"At least 2 classes required. got {self.n_classes_}.")
self.init_ = self._initialize_init_()
self._initialize_estimators(DecisionTreeClassifier)
except KeyError as ex:
raise ModelDefinitionKeyError(ex) from ex
@property
def analysis_type(self) -> Literal["classification"]:
return TYPE_CLASSIFICATION
def _initialize_init_(self) -> DummyClassifier:
estimator = DummyClassifier(strategy="prior")
estimator.n_classes_ = self.n_classes_
estimator.n_outputs_ = self.n_outputs
estimator.classes_ = np.arange(self.n_classes_)
estimator._strategy = estimator.strategy
if self.n_classes_ == 2:
log_odds = self._trees[0].tree.value.flatten()[0]
if np.isnan(log_odds):
raise ValueError(
"Error initializing sklearn classifier. Incorrect prior class probability. "
+ "Note: only export of models trained in the Elastic Stack is supported."
)
class_prior = 1 / (1 + np.exp(-log_odds))
estimator.class_prior_ = np.array([1 - class_prior, class_prior])
else:
raise NotImplementedError("Only binary classification is implemented.")
return estimator
def predict_proba(
self, X, feature_names_in: Optional[Union["ArrayLike", List[str]]] = None
) -> "ArrayLike":
"""Predict class probabilities for X.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The input samples.
feature_names_in : {array of string, list of string} of length n_features.
Feature names of the corresponding columns in X. Important, since the column list
can be extended by ColumnTransformer through the pipeline. By default None.
Returns
-------
ArrayLike of shape (n_samples, n_classes)
The class probabilities of the input samples. The order of the
classes corresponds to that in the attribute :term:`classes_`.
"""
if feature_names_in is not None:
if X.shape[1] != len(feature_names_in):
raise ValueError(
f"Dimension mismatch: X with {X.shape[1]} columns has to be the same size as feature_names_in with {len(feature_names_in)}."
)
if isinstance(feature_names_in, np.ndarray):
feature_names_in = feature_names_in.tolist()
# select columns used by the model in the correct order
X = X[:, [feature_names_in.index(fn) for fn in self.feature_names_in_]]
X = check_array(X)
return GradientBoostingClassifier.predict_proba(self, X)
def predict(
self,
X: "ArrayLike",
feature_names_in: Optional[Union["ArrayLike", List[str]]] = None,
) -> "ArrayLike":
"""Predict class for X.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The input samples.
feature_names_in : {array of string, list of string} of length n_features.
Feature names of the corresponding columns in X. Important, since the column list
can be extended by ColumnTransformer through the pipeline. By default None.
Returns
-------
ArrayLike of shape (n_samples,)
The predicted values.
"""
if feature_names_in is not None:
if X.shape[1] != len(feature_names_in):
raise ValueError(
f"Dimension mismatch: X with {X.shape[1]} columns has to be the same size as feature_names_in with {len(feature_names_in)}."
)
if isinstance(feature_names_in, np.ndarray):
feature_names_in = feature_names_in.tolist()
# select columns used by the model in the correct order
X = X[:, [feature_names_in.index(fn) for fn in self.feature_names_in_]]
X = check_array(X)
return GradientBoostingClassifier.predict(self, X)
class ESGradientBoostingRegressor(ESGradientBoostingModel, GradientBoostingRegressor):
"""
Elastic ML model wrapper compatible with sklearn GradientBoostingRegressor.
"""
def __init__(
self,
es_client: Union[str, List[str], Tuple[str, ...], "Elasticsearch"],
model_id: str,
) -> None:
"""
Parameters
----------
es_client : Elasticsearch client argument(s)
- elasticsearch-py parameters or
- elasticsearch-py instance
model_id : str
The unique identifier of the trained inference model in Elasticsearch.
Raises
------
NotImplementedError
Only MSE, MSLE, and Huber loss functions are supported.
ModelDefinitionKeyError
If required data cannot be extracted from the model definition due to a schema change.
"""
try:
ESGradientBoostingModel.__init__(self, es_client, model_id)
self._extract_common_parameters()
GradientBoostingRegressor.__init__(
self,
learning_rate=1.0,
n_estimators=self.n_estimators,
max_depth=self._max_depth,
)
self.n_outputs = 1
loss_function = self._trained_model_result["trained_model_configs"][0][
"metadata"
]["analytics_config"]["analysis"][self.analysis_type]["loss_function"]
if loss_function == "mse" or loss_function == "msle":
self.criterion = "squared_error"
self._loss = LeastSquaresError()
elif loss_function == "huber":
loss_parameter = self._trained_model_result[
"trained_model_configs"
][0]["metadata"]["analytics_config"]["analysis"][self.analysis_type][
"loss_function_parameter"
]
self.criterion = "huber"
self._loss = HuberLossFunction(loss_parameter)
else:
raise NotImplementedError(
"Only MSE, MSLE and Huber loss functions are supported."
)
self.init_ = self._initialize_init_()
self._initialize_estimators(DecisionTreeRegressor)
except KeyError as ex:
raise ModelDefinitionKeyError(ex) from ex
@property
def analysis_type(self) -> Literal["regression"]:
return TYPE_REGRESSION
def _initialize_init_(self) -> DummyRegressor:
constant = self._trees[0].tree.value[0]
estimator = DummyRegressor(
strategy="constant",
constant=constant,
)
estimator.constant_ = np.array([constant])
estimator.n_outputs_ = 1
return estimator
def predict(
self,
X: "ArrayLike",
feature_names_in: Optional[Union["ArrayLike", List[str]]] = None,
) -> "ArrayLike":
"""Predict targets for X.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The input samples.
feature_names_in : {array of string, list of string} of length n_features.
Feature names of the corresponding columns in X. Important, since the column list
can be extended by ColumnTransformer through the pipeline. By default None.
Returns
-------
ArrayLike of shape (n_samples,)
The predicted values.
"""
if feature_names_in is not None:
if X.shape[1] != len(feature_names_in):
raise ValueError(
f"Dimension mismatch: X with {X.shape[1]} columns has to be the same size as feature_names_in with {len(feature_names_in)}."
)
if isinstance(feature_names_in, np.ndarray):
feature_names_in = feature_names_in.tolist()
# select columns used by the model in the correct order
X = X[:, [feature_names_in.index(fn) for fn in self.feature_names_in_]]
X = check_array(X)
return GradientBoostingRegressor.predict(self, X)
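
A usage sketch for the wrappers above (not part of the diff): it assumes a model trained in the Elastic Stack on exactly the two named features; the host and model ID are placeholders:

```python
import numpy as np

from eland.ml.exporters.es_gb_models import ESGradientBoostingRegressor

reg = ESGradientBoostingRegressor(
    es_client="http://localhost:9200", model_id="flights-regression-model"
)

# Columns may arrive in a different order than the model expects;
# feature_names_in tells predict() how to map them onto the model's features.
X = np.array([[30.0, 500.0], [45.0, 800.0]])
predictions = reg.predict(X, feature_names_in=["FlightTimeMin", "DistanceMiles"])
```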

View File

@ -37,6 +37,7 @@ if TYPE_CHECKING:
RandomForestClassifier,
RandomForestRegressor,
)
from sklearn.pipeline import Pipeline # type: ignore # noqa: F401
from sklearn.tree import ( # type: ignore # noqa: F401
DecisionTreeClassifier,
DecisionTreeRegressor,
@ -424,6 +425,83 @@ class MLModel:
return False
return True
def export_model(self) -> "Pipeline":
"""Export Elastic ML model as sklearn Pipeline.
Returns
-------
sklearn.pipeline.Pipeline
A pipeline of the model's preprocessors (as a ColumnTransformer) followed by the Elastic ML model wrapper.
Raises
------
AssertionError
If the preprocessors JSON definition has an unexpected schema.
ValueError
The model is expected to be trained in the Elastic Stack. Models initially imported
from xgboost, lgbm, or sklearn are not supported.
ValueError
If unexpected categorical encoding is found in the list of preprocessors.
NotImplementedError
Only regression and binary classification models are supported currently.
"""
from sklearn.compose import ColumnTransformer # type: ignore # noqa: F401
from sklearn.pipeline import Pipeline
from .exporters._sklearn_deserializers import (
FrequencyEncoder,
OneHotEncoder,
TargetMeanEncoder,
)
from .exporters.es_gb_models import (
ESGradientBoostingClassifier,
ESGradientBoostingRegressor,
)
if self.model_type == TYPE_CLASSIFICATION:
model = ESGradientBoostingClassifier(
es_client=self._client, model_id=self._model_id
)
elif self.model_type == TYPE_REGRESSION:
model = ESGradientBoostingRegressor(
es_client=self._client, model_id=self._model_id
)
else:
raise NotImplementedError(
"Only regression and binary classification models are supported currently."
)
transformers = []
for p in model.preprocessors:
assert (
len(p) == 1
), f"Unexpected preprocessor data structure: {p}. One-key mapping expected."
encoding_type = list(p.keys())[0]
field = p[encoding_type]["field"]
if encoding_type == "frequency_encoding":
transform = FrequencyEncoder(p)
transformers.append((f"{field}_{encoding_type}", transform, field))
elif encoding_type == "target_mean_encoding":
transform = TargetMeanEncoder(p)
transformers.append((f"{field}_{encoding_type}", transform, field))
elif encoding_type == "one_hot_encoding":
transform = OneHotEncoder(p)
transformers.append((f"{field}_{encoding_type}", transform, [field]))
else:
raise ValueError(
f"Unexpected categorical encoding type {encoding_type} found. "
+ "Expected encodings: frequency_encoding, target_mean_encoding, one_hot_encoding."
)
preprocessor = ColumnTransformer(
transformers=transformers,
remainder="passthrough",
verbose_feature_names_out=False,
)
pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("es_model", model)])
return pipeline
@property
def _trained_model_config(self) -> Dict[str, Any]:
"""Lazily loads an ML models 'trained_model_config' information"""

View File

@ -125,7 +125,7 @@ def task_type_from_model_config(model_config: PretrainedConfig) -> Optional[str]
return None
potential_task_types: Set[str] = set()
for architecture in model_config.architectures:
for (substr, task_type) in ARCHITECTURE_TO_TASK_TYPE.items():
for substr, task_type in ARCHITECTURE_TO_TASK_TYPE.items():
if substr in architecture:
for t in task_type:
potential_task_types.add(t)
@ -384,7 +384,6 @@ class _DPREncoderWrapper(nn.Module): # type: ignore
@staticmethod
def from_pretrained(model_id: str) -> Optional[Any]:
config = AutoConfig.from_pretrained(model_id)
def is_compatible() -> bool:

View File

@ -210,7 +210,6 @@ class Operations:
def idx(
self, query_compiler: "QueryCompiler", axis: int, sort_order: str
) -> pd.Series:
if axis == 1:
# Fetch idx on Columns
raise NotImplementedError(
@ -279,7 +278,6 @@ class Operations:
numeric_only: bool = False,
dropna: bool = True,
) -> Union[pd.DataFrame, pd.Series]:
results = self._metric_aggs(
query_compiler,
pd_aggs=pd_aggs,
@ -530,7 +528,6 @@ class Operations:
# weights = [10066., 263., 386., 264., 273., 390., 324., 438., 261., 252., 142.]
# So sum last 2 buckets
for field in numeric_source_fields:
# in case of series let plotting.ed_hist_series throw an exception
if not response.get("aggregations"):
continue
@ -771,7 +768,6 @@ class Operations:
is_dataframe: bool = True,
numeric_only: Optional[bool] = True,
) -> Union[pd.DataFrame, pd.Series]:
percentiles = [
quantile_to_percentile(x)
for x in (
@ -801,7 +797,6 @@ class Operations:
return df if is_dataframe else df.transpose().iloc[0]
def unique(self, query_compiler: "QueryCompiler") -> pd.Series:
query_params, _ = self._resolve_tasks(query_compiler)
body = Query(query_params.query)
@ -1052,7 +1047,6 @@ class Operations:
buckets: Sequence[Dict[str, Any]] = composite_buckets["buckets"]
if after_key:
# yield the bucket which contains the result
yield buckets
@ -1227,7 +1221,6 @@ class Operations:
def to_pandas(
self, query_compiler: "QueryCompiler", show_progress: bool = False
) -> pd.DataFrame:
df_list: List[pd.DataFrame] = []
i = 0
for df in self.search_yield_pandas_dataframes(query_compiler=query_compiler):

View File

@ -170,7 +170,6 @@ class Query:
sort_order: str,
size: int = 1,
) -> None:
top_hits: Any = {}
if sort_order:
top_hits["sort"] = [{i: {"order": sort_order}} for i in source_columns]

View File

@ -246,7 +246,6 @@ class QueryCompiler:
i = 0
for i, hit in enumerate(results, 1):
if "_source" in hit:
row = hit["_source"]
else:

View File

@ -61,7 +61,7 @@ def format(session):
session.install("black", "isort", "flynt")
session.run("python", "utils/license-headers.py", "fix", *SOURCE_FILES)
session.run("flynt", *SOURCE_FILES)
session.run("black", "--target-version=py37", *SOURCE_FILES)
session.run("black", "--target-version=py38", *SOURCE_FILES)
session.run("isort", "--profile=black", *SOURCE_FILES)
lint(session)
@ -73,7 +73,7 @@ def lint(session):
session.install("black", "flake8", "mypy", "isort", "numpy")
session.install("--pre", "elasticsearch>=8.3,<9")
session.run("python", "utils/license-headers.py", "check", *SOURCE_FILES)
session.run("black", "--check", "--target-version=py37", *SOURCE_FILES)
session.run("black", "--check", "--target-version=py38", *SOURCE_FILES)
session.run("isort", "--check", "--profile=black", *SOURCE_FILES)
session.run("flake8", "--ignore=E501,W503,E402,E712,E203", *SOURCE_FILES)
@ -138,6 +138,7 @@ def test(session, pandas_version: str):
"scikit-learn",
"xgboost",
"lightgbm",
"shap",
)
session.run("pytest", "tests/ml/")

View File

@ -29,6 +29,7 @@ pytest>=5.2.1
pytest-mock
pytest-cov
nbval
shap==0.41.0
#
# Docs

View File

@ -35,7 +35,6 @@ class TestDataFrameCount(TestData):
df.count()
def test_count_flights(self):
pd_flights = self.pd_flights().filter(self.filter_data)
ed_flights = self.ed_flights().filter(self.filter_data)

View File

@ -419,7 +419,6 @@ class TestDataFrameMetrics(TestData):
assert calculated_values.shape == (2,)
def test_aggs_count(self):
pd_flights = self.pd_flights().filter(self.filter_data)
ed_flights = self.ed_flights().filter(self.filter_data)

View File

@ -102,7 +102,6 @@ class TestDataFrameToCSV(TestData):
ES_TEST_CLIENT.indices.delete(index=test_index)
def test_pd_to_csv_without_filepath(self):
ed_flights = self.ed_flights()
pd_flights = self.pd_flights()

View File

@ -147,7 +147,6 @@ class TestDataFrameUtils(TestData):
# assert_pandas_eland_frame_equal(pd_df, self.ed_flights())
def test_es_type_override_error(self):
df = self.pd_flights().filter(
["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
)

View File

@ -15,11 +15,14 @@
# specific language governing permissions and limitations
# under the License.
from operator import itemgetter
import numpy as np
import pytest
import eland as ed
from eland.ml import MLModel
from tests import ES_TEST_CLIENT, ES_VERSION
from tests import ES_TEST_CLIENT, ES_VERSION, FLIGHTS_SMALL_INDEX_NAME
try:
from sklearn import datasets
@ -44,16 +47,26 @@ try:
except ImportError:
HAS_LIGHTGBM = False
try:
import shap
HAS_SHAP = True
except ImportError:
HAS_SHAP = False
requires_sklearn = pytest.mark.skipif(
not HAS_SKLEARN, reason="This test requires 'scikit-learn' package to run"
not HAS_SKLEARN, reason="This test requires 'scikit-learn' package to run."
)
requires_xgboost = pytest.mark.skipif(
not HAS_XGBOOST, reason="This test requires 'xgboost' package to run"
not HAS_XGBOOST, reason="This test requires 'xgboost' package to run."
)
requires_shap = pytest.mark.skipif(
not HAS_SHAP, reason="This test requires 'shap' package to run."
)
requires_no_ml_extras = pytest.mark.skipif(
HAS_SKLEARN or HAS_XGBOOST,
reason="This test requires 'scikit-learn' and 'xgboost' to not be installed",
reason="This test requires 'scikit-learn' and 'xgboost' to not be installed.",
)
requires_lightgbm = pytest.mark.skipif(
@ -80,6 +93,102 @@ def check_prediction_equality(es_model: MLModel, py_model, test_data):
np.testing.assert_almost_equal(test_results, es_results, decimal=2)
def yield_model_id(analysis, analyzed_fields):
import random
import string
import time
suffix = "".join(random.choices(string.ascii_lowercase, k=4))
job_id = "test-flights-regression-" + suffix
dest = job_id + "-dest"
response = ES_TEST_CLIENT.ml.put_data_frame_analytics(
id=job_id,
analysis=analysis,
dest={"index": dest},
source={"index": [FLIGHTS_SMALL_INDEX_NAME]},
analyzed_fields=analyzed_fields,
)
assert response.meta.status == 200
response = ES_TEST_CLIENT.ml.start_data_frame_analytics(id=job_id)
assert response.meta.status == 200
time.sleep(2)
response = ES_TEST_CLIENT.ml.get_trained_models(model_id=job_id + "*")
assert response.meta.status == 200
assert response.body["count"] == 1
model_id = response.body["trained_model_configs"][0]["model_id"]
yield model_id
ES_TEST_CLIENT.ml.delete_data_frame_analytics(id=job_id)
ES_TEST_CLIENT.indices.delete(index=dest)
ES_TEST_CLIENT.ml.delete_trained_model(model_id=model_id)
@pytest.fixture(params=[[0, 4], [0, 1], range(5)])
def regression_model_id(request):
analysis = {
"regression": {
"dependent_variable": "FlightDelayMin",
"max_trees": 3,
"num_top_feature_importance_values": 0,
"max_optimization_rounds_per_hyperparameter": 1,
"prediction_field_name": "FlightDelayMin_prediction",
"training_percent": 30,
"randomize_seed": 1000,
"loss_function": "mse",
"early_stopping_enabled": True,
}
}
all_includes = [
"FlightDelayMin",
"FlightDelayType",
"FlightTimeMin",
"DistanceMiles",
"OriginAirportID",
]
includes = [all_includes[i] for i in request.param]
analyzed_fields = {
"includes": includes,
"excludes": [],
}
yield from yield_model_id(analysis=analysis, analyzed_fields=analyzed_fields)
@pytest.fixture(params=[[0, 6], [5, 6], range(7)])
def classification_model_id(request):
analysis = {
"classification": {
"dependent_variable": "Cancelled",
"max_trees": 5,
"num_top_feature_importance_values": 0,
"max_optimization_rounds_per_hyperparameter": 1,
"prediction_field_name": "Cancelled_prediction",
"training_percent": 50,
"randomize_seed": 1000,
"num_top_classes": -1,
"class_assignment_objective": "maximize_accuracy",
"early_stopping_enabled": True,
}
}
all_includes = [
"OriginWeather",
"OriginAirportID",
"DestCityName",
"DestWeather",
"DestRegion",
"AvgTicketPrice",
"Cancelled",
]
includes = [all_includes[i] for i in request.param]
analyzed_fields = {
"includes": includes,
"excludes": [],
}
yield from yield_model_id(analysis=analysis, analyzed_fields=analyzed_fields)
class TestMLModel:
@requires_no_ml_extras
def test_import_ml_model_when_dependencies_are_not_available(self):
@ -494,3 +603,172 @@ class TestMLModel:
# Clean up
es_model.delete_model()
@requires_sklearn
@requires_shap
def test_export_regressor(self, regression_model_id):
ed_flights = ed.DataFrame(ES_TEST_CLIENT, FLIGHTS_SMALL_INDEX_NAME).head(10)
types = dict(ed_flights.dtypes)
X = ed_flights.to_pandas().astype(types)
model = MLModel(es_client=ES_TEST_CLIENT, model_id=regression_model_id)
pipeline = model.export_model()
pipeline.fit(X)
predictions_sklearn = pipeline.predict(
X, feature_names_in=pipeline["preprocessor"].get_feature_names_out()
)
response = ES_TEST_CLIENT.ml.infer_trained_model(
model_id=regression_model_id,
docs=X[pipeline["es_model"].input_field_names].to_dict("records"),
)
predictions_es = np.array(
list(
map(
itemgetter("FlightDelayMin_prediction"),
response.body["inference_results"],
)
)
)
np.testing.assert_array_almost_equal(predictions_sklearn, predictions_es)
import pandas as pd
X_transformed = pipeline["preprocessor"].transform(X=X)
X_transformed = pd.DataFrame(
X_transformed, columns=pipeline["preprocessor"].get_feature_names_out()
)
explainer = shap.TreeExplainer(pipeline["es_model"])
shap_values = explainer.shap_values(
X_transformed[pipeline["es_model"].feature_names_in_]
)
np.testing.assert_array_almost_equal(
predictions_sklearn, shap_values.sum(axis=1) + explainer.expected_value
)
@requires_sklearn
def test_export_classification(self, classification_model_id):
ed_flights = ed.DataFrame(ES_TEST_CLIENT, FLIGHTS_SMALL_INDEX_NAME).head(10)
X = ed.eland_to_pandas(ed_flights)
model = MLModel(es_client=ES_TEST_CLIENT, model_id=classification_model_id)
pipeline = model.export_model()
pipeline.fit(X)
predictions_sklearn = pipeline.predict(
X, feature_names_in=pipeline["preprocessor"].get_feature_names_out()
)
prediction_proba_sklearn = pipeline.predict_proba(
X, feature_names_in=pipeline["preprocessor"].get_feature_names_out()
).max(axis=1)
response = ES_TEST_CLIENT.ml.infer_trained_model(
model_id=classification_model_id,
docs=X[pipeline["es_model"].input_field_names].to_dict("records"),
)
predictions_es = np.array(
list(
map(
lambda x: str(int(x["Cancelled_prediction"])),
response.body["inference_results"],
)
)
)
prediction_proba_es = np.array(
list(
map(
itemgetter("prediction_probability"),
response.body["inference_results"],
)
)
)
np.testing.assert_array_almost_equal(
prediction_proba_sklearn, prediction_proba_es
)
np.testing.assert_array_equal(predictions_sklearn, predictions_es)
import pandas as pd
X_transformed = pipeline["preprocessor"].transform(X=X)
X_transformed = pd.DataFrame(
X_transformed, columns=pipeline["preprocessor"].get_feature_names_out()
)
explainer = shap.TreeExplainer(pipeline["es_model"])
shap_values = explainer.shap_values(
X_transformed[pipeline["es_model"].feature_names_in_]
)
log_odds = shap_values.sum(axis=1) + explainer.expected_value
prediction_proba_shap = 1 / (1 + np.exp(-log_odds))
# use probability of the predicted class
prediction_proba_shap[prediction_proba_shap < 0.5] = (
1 - prediction_proba_shap[prediction_proba_shap < 0.5]
)
np.testing.assert_array_almost_equal(
prediction_proba_sklearn, prediction_proba_shap
)
@requires_xgboost
@requires_sklearn
@pytest.mark.parametrize("objective", ["binary:logistic", "reg:squarederror"])
def test_xgb_import_export(self, objective):
booster = "gbtree"
if objective.startswith("binary:"):
training_data = datasets.make_classification(n_features=5)
xgb_model = XGBClassifier(
booster=booster, objective=objective, use_label_encoder=False
)
else:
training_data = datasets.make_regression(n_features=5)
xgb_model = XGBRegressor(
booster=booster, objective=objective, use_label_encoder=False
)
# Train model
xgb_model.fit(training_data[0], training_data[1])
# Serialise the models to Elasticsearch
feature_names = ["feature0", "feature1", "feature2", "feature3", "feature4"]
model_id = "test_xgb_model"
es_model = MLModel.import_model(
ES_TEST_CLIENT, model_id, xgb_model, feature_names, es_if_exists="replace"
)
# Export is supposed to fail
with pytest.raises(ValueError) as ex:
es_model.export_model()
assert ex.match("Error initializing sklearn classifier.")
# Clean up
es_model.delete_model()
@requires_lightgbm
@pytest.mark.parametrize("objective", ["regression", "binary"])
def test_lgbm_import_export(self, objective):
booster = "gbdt"
if objective == "binary":
training_data = datasets.make_classification(n_features=5)
lgbm_model = LGBMClassifier(boosting_type=booster, objective=objective)
else:
training_data = datasets.make_regression(n_features=5)
lgbm_model = LGBMRegressor(boosting_type=booster, objective=objective)
# Train model
lgbm_model.fit(training_data[0], training_data[1])
# Serialise the models to Elasticsearch
feature_names = ["feature0", "feature1", "feature2", "feature3", "feature4"]
model_id = "test_lgbm_model"
es_model = MLModel.import_model(
ES_TEST_CLIENT, model_id, lgbm_model, feature_names, es_if_exists="replace"
)
# Export is supposed to fail
with pytest.raises(ValueError) as ex:
es_model.export_model()
assert ex.match("Error initializing sklearn classifier.")
# Clean up
es_model.delete_model()