Accept LTR inference config when creating model (#645)

* Support for supplying inference_config * Fix linting errors * Add unit test * Add LTR type, throw exception on predict, refine test * Add search step to LTR test * Fix linter errors * Update rescoring assertion in test + type defs * Fix linting error * Remove failing assertion
2025-07-24 00:00:39 +08:00 · 2024-01-08 09:19:03 -05:00 · 2024-01-08 09:19:03 -05:00 · 840871f9d9
commit 840871f9d9
parent 05c5859b8a
3 changed files with 122 additions and 5 deletions
--- a/eland/ml/common.py
+++ b/eland/ml/common.py
@ -16,4 +16,5 @@
 #  under the License.

 TYPE_CLASSIFICATION = "classification"
+TYPE_LEARNING_TO_RANK = "learning_to_rank"
 TYPE_REGRESSION = "regression"
--- a/eland/ml/ml_model.py
+++ b/eland/ml/ml_model.py
@ -23,7 +23,7 @@ import numpy as np
 from eland.common import ensure_es_client, es_version
 from eland.utils import deprecated_api

-from .common import TYPE_CLASSIFICATION, TYPE_REGRESSION
+from .common import TYPE_CLASSIFICATION, TYPE_LEARNING_TO_RANK, TYPE_REGRESSION
 from .transformers import get_model_transformer

 if TYPE_CHECKING:
@ -130,6 +130,11 @@ class MLModel:
        >>> # Delete model from Elasticsearch
        >>> es_model.delete_model()
        """
+        if self.model_type not in (TYPE_CLASSIFICATION, TYPE_REGRESSION):
+            raise NotImplementedError(
+                f"Prediction for type {self.model_type} is not supported."
+            )
+
        docs: List[Mapping[str, Any]] = []
        if isinstance(X, np.ndarray):

@ -215,6 +220,8 @@ class MLModel:
        inference_config = self._trained_model_config["inference_config"]
        if "classification" in inference_config:
            return TYPE_CLASSIFICATION
+        elif "learning_to_rank" in inference_config:
+            return TYPE_LEARNING_TO_RANK
        elif "regression" in inference_config:
            return TYPE_REGRESSION
        raise ValueError("Unable to determine 'model_type' for MLModel")
@ -254,6 +261,7 @@ class MLModel:
        classification_weights: Optional[List[float]] = None,
        es_if_exists: Optional[str] = None,
        es_compress_model_definition: bool = True,
+        inference_config: Optional[Mapping[str, Mapping[str, Any]]] = None,
    ) -> "MLModel":
        """
        Transform and serialize a trained 3rd party model into Elasticsearch.
@ -324,6 +332,10 @@ class MLModel:
            JSON instead of raw JSON to reduce the amount of data sent
            over the wire in HTTP requests. Defaults to 'True'.

+        inference_config: Mapping[str, Mapping[str, Any]]
+            Model inference configuration. Must contain a top-level property whose name is the same as the inference
+            task type.
+
        Examples
        --------
        >>> from sklearn import datasets
@ -367,6 +379,7 @@ class MLModel:
        )
        serializer = transformer.transform()
        model_type = transformer.model_type
+        default_inference_config: Mapping[str, Mapping[str, Any]] = {model_type: {}}

        if es_if_exists is None:
            es_if_exists = "fail"
@ -389,14 +402,14 @@ class MLModel:
            ml_model._client.ml.put_trained_model(
                model_id=model_id,
                input={"field_names": feature_names},
-                inference_config={model_type: {}},
+                inference_config=inference_config or default_inference_config,
                compressed_definition=serializer.serialize_and_compress_model(),
            )
        else:
            ml_model._client.ml.put_trained_model(
                model_id=model_id,
                input={"field_names": feature_names},
-                inference_config={model_type: {}},
+                inference_config=inference_config or default_inference_config,
                definition=serializer.serialize_model(),
            )

--- a/tests/ml/test_ml_model_pytest.py
+++ b/tests/ml/test_ml_model_pytest.py
@ -16,13 +16,19 @@
 #  under the License.

 from operator import itemgetter
+from typing import Tuple

 import numpy as np
 import pytest

 import eland as ed
 from eland.ml import MLModel
-from tests import ES_TEST_CLIENT, ES_VERSION, FLIGHTS_SMALL_INDEX_NAME
+from tests import (
+    ES_TEST_CLIENT,
+    ES_VERSION,
+    FLIGHTS_SMALL_INDEX_NAME,
+    MOVIES_INDEX_NAME,
+)

 try:
    from sklearn import datasets
@ -70,10 +76,17 @@ requires_no_ml_extras = pytest.mark.skipif(
 )

 requires_lightgbm = pytest.mark.skipif(
-    not HAS_LIGHTGBM, reason="This test requires 'lightgbm' package to run"
+    not HAS_LIGHTGBM, reason="This test requires 'lightgbm' package to run."
 )


+def requires_elasticsearch_version(minimum_version: Tuple[int, int, int]):
+    return pytest.mark.skipif(
+        ES_VERSION < minimum_version,
+        reason=f"This test requires Elasticsearch version {'.'.join(str(v) for v in minimum_version)} or later.",
+    )
+
+
 def skip_if_multiclass_classifition():
    if ES_VERSION < (7, 7):
        raise pytest.skip(
@ -306,6 +319,96 @@ class TestMLModel:
        # Clean up
        es_model.delete_model()

+    @requires_elasticsearch_version((8, 12))
+    @requires_sklearn
+    @pytest.mark.parametrize("compress_model_definition", [True, False])
+    def test_learning_to_rank(self, compress_model_definition):
+        # Train model
+        training_data = datasets.make_regression(n_features=2)
+        regressor = DecisionTreeRegressor()
+        regressor.fit(training_data[0], training_data[1])
+
+        # Serialise the models to Elasticsearch
+        model_id = "test_learning_to_rank"
+        feature_extractors = [
+            {
+                "query_extractor": {
+                    "feature_name": "title_bm25",
+                    "query": {"match": {"title": "{{query_string}}"}},
+                }
+            },
+            {
+                "query_extractor": {
+                    "feature_name": "imdb_rating",
+                    "query": {
+                        "script_score": {
+                            "query": {"exists": {"field": "imdbRating"}},
+                            "script": {"source": 'return doc["imdbRating"].value;'},
+                        }
+                    },
+                }
+            },
+        ]
+        feature_names = [
+            extractor["query_extractor"]["feature_name"]
+            for extractor in feature_extractors
+        ]
+        inference_config = {
+            "learning_to_rank": {"feature_extractors": feature_extractors}
+        }
+
+        es_model = MLModel.import_model(
+            ES_TEST_CLIENT,
+            model_id,
+            regressor,
+            feature_names,
+            es_if_exists="replace",
+            es_compress_model_definition=compress_model_definition,
+            inference_config=inference_config,
+        )
+
+        # Verify the saved inference config contains the passed LTR config
+        response = ES_TEST_CLIENT.ml.get_trained_models(model_id=model_id)
+        assert response.meta.status == 200
+        assert response.body["count"] == 1
+        saved_inference_config = response.body["trained_model_configs"][0][
+            "inference_config"
+        ]
+        assert "learning_to_rank" in saved_inference_config
+        saved_ltr_config = saved_inference_config["learning_to_rank"]
+        assert all(
+            item in saved_ltr_config.items()
+            for item in inference_config["learning_to_rank"].items()
+        )
+
+        # Execute search with rescoring
+        search_result = ES_TEST_CLIENT.search(
+            index=MOVIES_INDEX_NAME,
+            query={"terms": {"_id": ["tt1318514", "tt0071562"]}},
+            rescore={
+                "learning_to_rank": {
+                    "model_id": model_id,
+                    "params": {"query_string": "planet of the apes"},
+                }
+            },
+        )
+
+        # Assert that:
+        # - all documents from the query are present
+        # - all documents have been rescored (score != 1.0)
+        doc_scores = [hit["_score"] for hit in search_result["hits"]["hits"]]
+        assert len(search_result["hits"]["hits"]) == 2
+        assert all(score != float(1) for score in doc_scores)
+
+        # Verify prediction is not supported for LTR
+        try:
+            es_model.predict([0])
+        except NotImplementedError:
+            pass
+
+        # Clean up
+        es_model.delete_model()
+
    @requires_sklearn
    @pytest.mark.parametrize("compress_model_definition", [True, False])
    def test_random_forest_classifier(self, compress_model_definition):