From 5169cc926a0459657e03c06d7e23d01fbe6e90ed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aur=C3=A9lien=20FOUCRET?= <aurelien.foucret@gmail.com>
Date: Wed, 17 Jan 2024 10:01:47 +0100
Subject: [PATCH] Improve LTR (#651)

* Ensure the feature logger is using NaN for non matching query feature extractors (consistent with ES).

* Default score is None instead of 0.

* LTR model import API improvements.

* Fix feature logger tests.

* Fix export in eland.ml.ltr

* Apply suggestions from code review

Co-authored-by: Adam Demjen <demjened@gmail.com>

* Fix supported models for LTR

---------

Co-authored-by: Adam Demjen <demjened@gmail.com>
---
 eland/ml/ltr/__init__.py                   |  15 +--
 eland/ml/ltr/feature_logger.py             |  29 +----
 eland/ml/ltr/ltr_model_config.py           |   2 +-
 eland/ml/ml_model.py                       | 125 ++++++++++++++++++++-
 tests/ml/ltr/test_feature_logger_pytest.py |   6 +-
 tests/ml/test_ml_model_pytest.py           |  53 +++++----
 6 files changed, 164 insertions(+), 66 deletions(-)

diff --git a/eland/ml/ltr/__init__.py b/eland/ml/ltr/__init__.py
index f90a079..33a4f67 100644
--- a/eland/ml/ltr/__init__.py
+++ b/eland/ml/ltr/__init__.py
@@ -15,11 +15,12 @@
 #  specific language governing permissions and limitations
 #  under the License.
 
-from eland.ml.ltr.feature_logger import FeatureLogger
-from eland.ml.ltr.ltr_model_config import (
-    FeatureExtractor,
-    LTRModelConfig,
-    QueryFeatureExtractor,
-)
+from .feature_logger import FeatureLogger
+from .ltr_model_config import FeatureExtractor, LTRModelConfig, QueryFeatureExtractor
 
-__all__ = [LTRModelConfig, QueryFeatureExtractor, FeatureExtractor, FeatureLogger]
+__all__ = [
+    "LTRModelConfig",
+    "QueryFeatureExtractor",
+    "FeatureExtractor",
+    "FeatureLogger",
+]
diff --git a/eland/ml/ltr/feature_logger.py b/eland/ml/ltr/feature_logger.py
index 4640f5e..3cf6de5 100644
--- a/eland/ml/ltr/feature_logger.py
+++ b/eland/ml/ltr/feature_logger.py
@@ -95,7 +95,7 @@ class FeatureLogger:
         """
 
         doc_features = {
-            doc_id: [float(0)] * len(self._model_config.feature_extractors)
+            doc_id: [float("nan")] * len(self._model_config.feature_extractors)
             for doc_id in doc_ids
         }
 
@@ -141,28 +141,6 @@ class FeatureLogger:
 
     def _extract_query_features(
         self, query_params: Mapping[str, Any], doc_ids: List[str]
-    ):
-        default_query_scores = dict(
-            (extractor.feature_name, extractor.default_score)
-            for extractor in self._model_config.query_feature_extractors
-        )
-
-        matched_queries = self._execute_search_template_request(
-            script_source=self._script_source,
-            template_params={
-                **query_params,
-                "__doc_ids": doc_ids,
-                "__size": len(doc_ids),
-            },
-        )
-
-        return {
-            hit_id: {**default_query_scores, **matched_queries_scores}
-            for hit_id, matched_queries_scores in matched_queries.items()
-        }
-
-    def _execute_search_template_request(
-        self, script_source: str, template_params: Mapping[str, any]
     ):
         # When support for include_named_queries_score will be added,
         # this will be replaced by the call to the client search_template method.
@@ -171,7 +149,10 @@ class FeatureLogger:
         __path = f"/{_quote(self._index_name)}/_search/template"
         __query = {"include_named_queries_score": True}
         __headers = {"accept": "application/json", "content-type": "application/json"}
-        __body = {"source": script_source, "params": template_params}
+        __body = {
+            "source": self._script_source,
+            "params": {**query_params, "__doc_ids": doc_ids, "__size": len(doc_ids)},
+        }
 
         return {
             hit["_id"]: hit["matched_queries"] if "matched_queries" in hit else {}
diff --git a/eland/ml/ltr/ltr_model_config.py b/eland/ml/ltr/ltr_model_config.py
index e3c5877..ca02ad7 100644
--- a/eland/ml/ltr/ltr_model_config.py
+++ b/eland/ml/ltr/ltr_model_config.py
@@ -59,7 +59,7 @@ class QueryFeatureExtractor(FeatureExtractor):
         self,
         feature_name: str,
         query: Mapping[str, Any],
-        default_score: Optional[float] = float(0),
+        default_score: Optional[float] = None,
     ):
         """
         Parameters
diff --git a/eland/ml/ml_model.py b/eland/ml/ml_model.py
index fffaf0e..d13c2a6 100644
--- a/eland/ml/ml_model.py
+++ b/eland/ml/ml_model.py
@@ -24,6 +24,7 @@ from eland.common import ensure_es_client, es_version
 from eland.utils import deprecated_api
 
 from .common import TYPE_CLASSIFICATION, TYPE_LEARNING_TO_RANK, TYPE_REGRESSION
+from .ltr import LTRModelConfig
 from .transformers import get_model_transformer
 
 if TYPE_CHECKING:
@@ -266,7 +267,6 @@ class MLModel:
         classification_weights: Optional[List[float]] = None,
         es_if_exists: Optional[str] = None,
         es_compress_model_definition: bool = True,
-        inference_config: Optional[Mapping[str, Mapping[str, Any]]] = None,
     ) -> "MLModel":
         """
         Transform and serialize a trained 3rd party model into Elasticsearch.
@@ -342,10 +342,6 @@ class MLModel:
             JSON instead of raw JSON to reduce the amount of data sent
             over the wire in HTTP requests. Defaults to 'True'.
 
-        inference_config: Mapping[str, Mapping[str, Any]]
-            Model inference configuration. Must contain a top-level property whose name is the same as the inference
-            task type.
-
         Examples
         --------
         >>> from sklearn import datasets
@@ -380,6 +376,125 @@ class MLModel:
         >>> # Delete model from Elasticsearch
         >>> es_model.delete_model()
         """
+
+        return cls._import_model(
+            es_client=es_client,
+            model_id=model_id,
+            model=model,
+            feature_names=feature_names,
+            classification_labels=classification_labels,
+            classification_weights=classification_weights,
+            es_if_exists=es_if_exists,
+            es_compress_model_definition=es_compress_model_definition,
+        )
+
+    @classmethod
+    def import_ltr_model(
+        cls,
+        es_client: Union[str, List[str], Tuple[str, ...], "Elasticsearch"],
+        model_id: str,
+        model: Union[
+            "DecisionTreeRegressor",
+            "RandomForestRegressor",
+            "XGBRanker",
+            "XGBRegressor",
+            "LGBMRegressor",
+        ],
+        ltr_model_config: LTRModelConfig,
+        es_if_exists: Optional[str] = None,
+        es_compress_model_definition: bool = True,
+    ) -> "MLModel":
+        """
+        Transform and serialize a trained 3rd party model into Elasticsearch.
+        This model can then be used as a learning_to_rank rescorer in the Elastic Stack.
+
+        Parameters
+        ----------
+        es_client: Elasticsearch client argument(s)
+            - elasticsearch-py parameters or
+            - elasticsearch-py instance
+
+        model_id: str
+            The unique identifier of the trained inference model in Elasticsearch.
+
+        model: An instance of a supported python model. We support the following model types for LTR prediction:
+            - sklearn.tree.DecisionTreeRegressor
+            - sklearn.ensemble.RandomForestRegressor
+            - xgboost.XGBRanker
+                - only the following objectives are supported:
+                    - "rank:map"
+                    - "rank:ndcg"
+                    - "rank:pairwise"
+            - xgboost.XGBRegressor
+                - only the following objectives are supported:
+                    - "reg:squarederror"
+                    - "reg:linear"
+                    - "reg:squaredlogerror"
+                    - "reg:logistic"
+                    - "reg:pseudohubererror"
+            - lightgbm.LGBMRegressor
+                - Categorical fields are expected to already be processed
+                - Only the following objectives are supported
+                    - "regression"
+                    - "regression_l1"
+                    - "huber"
+                    - "fair"
+                    - "quantile"
+                    - "mape"
+
+        ltr_model_config: LTRModelConfig
+            The LTR model configuration is used to configure feature extractors for the LTR model.
+            Feature names are automatically inferred from the feature extractors.
+
+        es_if_exists: {'fail', 'replace'} default 'fail'
+            How to behave if model already exists
+
+            - fail: Raise a Value Error
+            - replace: Overwrite existing model
+
+        es_compress_model_definition: bool
+            If True will use 'compressed_definition' which uses gzipped
+            JSON instead of raw JSON to reduce the amount of data sent
+            over the wire in HTTP requests. Defaults to 'True'.
+        """
+
+        return cls._import_model(
+            es_client=es_client,
+            model_id=model_id,
+            model=model,
+            feature_names=ltr_model_config.feature_names,
+            inference_config=ltr_model_config.to_dict(),
+            es_if_exists=es_if_exists,
+            es_compress_model_definition=es_compress_model_definition,
+        )
+
+    @classmethod
+    def _import_model(
+        cls,
+        es_client: Union[str, List[str], Tuple[str, ...], "Elasticsearch"],
+        model_id: str,
+        model: Union[
+            "DecisionTreeClassifier",
+            "DecisionTreeRegressor",
+            "RandomForestRegressor",
+            "RandomForestClassifier",
+            "XGBClassifier",
+            "XGBRanker",
+            "XGBRegressor",
+            "LGBMRegressor",
+            "LGBMClassifier",
+        ],
+        feature_names: List[str],
+        classification_labels: Optional[List[str]] = None,
+        classification_weights: Optional[List[float]] = None,
+        es_if_exists: Optional[str] = None,
+        es_compress_model_definition: bool = True,
+        inference_config: Optional[Mapping[str, Mapping[str, Any]]] = None,
+    ) -> "MLModel":
+        """
+        Actual implementation of model import used by public API methods.
+        """
+
         es_client = ensure_es_client(es_client)
         transformer = get_model_transformer(
             model,
diff --git a/tests/ml/ltr/test_feature_logger_pytest.py b/tests/ml/ltr/test_feature_logger_pytest.py
index 6e2dfdf..e6b1d3a 100644
--- a/tests/ml/ltr/test_feature_logger_pytest.py
+++ b/tests/ml/ltr/test_feature_logger_pytest.py
@@ -15,6 +15,8 @@
 #  specific language governing permissions and limitations
 #  under the License.
 
+import math
+
 from eland.ml.ltr import FeatureLogger, LTRModelConfig, QueryFeatureExtractor
 from tests import ES_TEST_CLIENT, NATIONAL_PARKS_INDEX_NAME
 
@@ -53,12 +55,12 @@ class TestFeatureLogger:
 
         # "park_hawaii-volcanoes" document does not matches for title but is a world heritage site
         assert (
-            doc_features["park_hawaii-volcanoes"][0] == 0
+            math.isnan(doc_features["park_hawaii-volcanoes"][0])
             and doc_features["park_hawaii-volcanoes"][1] > 1
         )
 
         # "park_hawaii-volcanoes" document does not matches for title and is not a world heritage site
-        assert doc_features["park_death-valley"] == [0, 0]
+        assert all(math.isnan(feature) for feature in doc_features["park_death-valley"])
 
     def _ltr_model_config(self):
         # Returns an LTR config with 2 query feature extractors:
diff --git a/tests/ml/test_ml_model_pytest.py b/tests/ml/test_ml_model_pytest.py
index e19358f..c7d0215 100644
--- a/tests/ml/test_ml_model_pytest.py
+++ b/tests/ml/test_ml_model_pytest.py
@@ -23,6 +23,7 @@ import pytest
 
 import eland as ed
 from eland.ml import MLModel
+from eland.ml.ltr import LTRModelConfig, QueryFeatureExtractor
 from tests import (
     ES_TEST_CLIENT,
     ES_VERSION,
@@ -330,41 +331,35 @@ class TestMLModel:
 
         # Serialise the models to Elasticsearch
         model_id = "test_learning_to_rank"
-        feature_extractors = [
-            {
-                "query_extractor": {
-                    "feature_name": "title_bm25",
-                    "query": {"match": {"title": "{{query_string}}"}},
-                }
-            },
-            {
-                "query_extractor": {
-                    "feature_name": "visitors",
-                    "query": {
+        ltr_model_config = LTRModelConfig(
+            feature_extractors=[
+                QueryFeatureExtractor(
+                    feature_name="title_bm25",
+                    query={"match": {"title": "{{query_string}}"}},
+                ),
+                QueryFeatureExtractor(
+                    feature_name="description_bm25",
+                    query={"match": {"description_bm25": "{{query_string}}"}},
+                ),
+                QueryFeatureExtractor(
+                    feature_name="visitors",
+                    query={
                         "script_score": {
                             "query": {"exists": {"field": "visitors"}},
                             "script": {"source": 'return doc["visitors"].value;'},
                         }
                     },
-                }
-            },
-        ]
-        feature_names = [
-            extractor["query_extractor"]["feature_name"]
-            for extractor in feature_extractors
-        ]
-        inference_config = {
-            "learning_to_rank": {"feature_extractors": feature_extractors}
-        }
+                ),
+            ]
+        )
 
-        es_model = MLModel.import_model(
+        es_model = MLModel.import_ltr_model(
             ES_TEST_CLIENT,
             model_id,
             regressor,
-            feature_names,
+            ltr_model_config,
             es_if_exists="replace",
             es_compress_model_definition=compress_model_definition,
-            inference_config=inference_config,
         )
 
         # Verify the saved inference config contains the passed LTR config
@@ -375,10 +370,14 @@ class TestMLModel:
             "inference_config"
         ]
         assert "learning_to_rank" in saved_inference_config
-        saved_ltr_config = saved_inference_config["learning_to_rank"]
+        assert "feature_extractors" in saved_inference_config["learning_to_rank"]
+        saved_feature_extractors = saved_inference_config["learning_to_rank"][
+            "feature_extractors"
+        ]
+
         assert all(
-            item in saved_ltr_config.items()
-            for item in inference_config["learning_to_rank"].items()
+            feature_extractor.to_dict() in saved_feature_extractors
+            for feature_extractor in ltr_model_config.feature_extractors
         )
 
         # Execute search with rescoring