Fix missing value support for XGBRanker. (#654)

* Fix missing value support for XGBRanker.

* lint

* Sort expected scores

* lint
Aurélien FOUCRET 2024-01-23 18:42:24 +01:00 committed by GitHub
parent 1190364abb
commit 2a6a4b1f06
3 changed files with 41 additions and 14 deletions

@@ -96,6 +96,7 @@ class TreeNode:
             add_if_exists(d, "split_feature", self._split_feature)
             add_if_exists(d, "threshold", self._threshold)
             add_if_exists(d, "number_samples", self._number_samples)
+            add_if_exists(d, "default_left", self._default_left)
         else:
             if len(self._leaf_value) == 1:
                 # Support Elasticsearch 7.6 which only
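For context, the serialized tree node now carries the default branch taken for missing values. A minimal sketch of the resulting dictionary, using hypothetical values and a stand-in for the add_if_exists helper shown above:

def add_if_exists(d, key, value):
    # Stand-in for the helper above: only set keys that actually have a value.
    if value is not None:
        d[key] = value

node = {}
add_if_exists(node, "split_feature", 0)      # hypothetical split on feature 0
add_if_exists(node, "threshold", 0.5)
add_if_exists(node, "number_samples", 120)
add_if_exists(node, "default_left", True)    # rows missing the feature go to the left child
print(node)
# {'split_feature': 0, 'threshold': 0.5, 'number_samples': 120, 'default_left': True}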

@@ -107,6 +107,7 @@ class XGBoostForestTransformer(ModelTransformer):
             decision_type=self._node_decision_type,
             left_child=self.extract_node_id(row["Yes"], curr_tree),
             right_child=self.extract_node_id(row["No"], curr_tree),
+            default_left=row["Yes"] == row["Missing"],
             threshold=float(row["Split"]),
             split_feature=self.get_feature_id(row["Feature"]),
         )
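The default_left value above is derived from XGBoost's own tree dump: Booster.trees_to_dataframe() reports "Yes", "No" and "Missing" child IDs for every split, and the missing-value branch is the left ("Yes") child exactly when those two IDs match. A small standalone sketch of that mapping, on toy data rather than anything from this repository:

import numpy as np
import xgboost as xgb

# Toy data with explicit missing values so the booster learns default branches.
X = np.array([[1.0], [2.0], [np.nan], [4.0], [5.0], [np.nan]])
y = np.array([0.0, 1.0, 1.0, 0.0, 1.0, 0.0])
booster = xgb.train({"max_depth": 2}, xgb.DMatrix(X, label=y), num_boost_round=1)

for _, row in booster.trees_to_dataframe().iterrows():
    if row["Feature"] == "Leaf":
        continue  # leaf nodes have no children and no default branch
    default_left = row["Yes"] == row["Missing"]
    print(row["ID"], "default_left =", default_left)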

@@ -23,7 +23,7 @@ import pytest

 import eland as ed
 from eland.ml import MLModel
-from eland.ml.ltr import LTRModelConfig, QueryFeatureExtractor
+from eland.ml.ltr import FeatureLogger, LTRModelConfig, QueryFeatureExtractor
 from tests import (
     ES_TEST_CLIENT,
     ES_VERSION,
@@ -321,13 +321,27 @@ class TestMLModel:
         es_model.delete_model()

     @requires_elasticsearch_version((8, 12))
-    @requires_sklearn
+    @requires_xgboost
     @pytest.mark.parametrize("compress_model_definition", [True, False])
-    def test_learning_to_rank(self, compress_model_definition):
-        # Train model
-        training_data = datasets.make_regression(n_features=2)
-        regressor = DecisionTreeRegressor()
-        regressor.fit(training_data[0], training_data[1])
+    @pytest.mark.parametrize(
+        "objective",
+        ["rank:ndcg", "rank:map", "rank:pairwise"],
+    )
+    def test_learning_to_rank(self, objective, compress_model_definition):
+        X, y = datasets.make_classification(
+            n_features=3, n_informative=2, n_redundant=1
+        )
+        rng = np.random.default_rng()
+        qid = rng.integers(0, 3, size=X.shape[0])
+
+        # Sort the inputs based on query index
+        sorted_idx = np.argsort(qid)
+        X = X[sorted_idx, :]
+        y = y[sorted_idx]
+        qid = qid[sorted_idx]
+
+        ranker = XGBRanker(objective=objective)
+        ranker.fit(X, y, qid=qid)

         # Serialise the models to Elasticsearch
         model_id = "test_learning_to_rank"
@@ -356,7 +370,7 @@ class TestMLModel:
         es_model = MLModel.import_ltr_model(
             ES_TEST_CLIENT,
             model_id,
-            regressor,
+            ranker,
             ltr_model_config,
             es_if_exists="replace",
             es_compress_model_definition=compress_model_definition,
@@ -388,16 +402,27 @@ class TestMLModel:
                 "learning_to_rank": {
                     "model_id": model_id,
                     "params": {"query_string": "yosemite"},
-                }
+                },
+                "window_size": 2,
             },
         )

-        # Assert that:
-        # - all documents from the query are present
-        # - all documents have been rescored (score != 1.0)
+        # Assert that the rescored search results match the ranker's predictions.
         doc_scores = [hit["_score"] for hit in search_result["hits"]["hits"]]
-        assert len(search_result["hits"]["hits"]) == 2
-        assert all(score != float(1) for score in doc_scores)
+
+        feature_logger = FeatureLogger(
+            ES_TEST_CLIENT, NATIONAL_PARKS_INDEX_NAME, ltr_model_config
+        )
+        expected_scores = sorted(
+            [
+                ranker.predict(np.asarray([doc_features]))[0]
+                for _, doc_features in feature_logger.extract_features(
+                    {"query_string": "yosemite"}, ["park_yosemite", "park_everglades"]
+                ).items()
+            ],
+            reverse=True,
+        )
+        np.testing.assert_almost_equal(expected_scores, doc_scores, decimal=2)

         # Verify prediction is not supported for LTR
         try: