Default truncation to "second" for the text_similarity task type (#713)

In reranking, the first input (the query) is generally shorter. In this case, it makes more sense to truncate the second input (the document text).
2025-07-11 00:02:14 +08:00 · 2024-08-05 11:47:15 +01:00 · 2024-08-05 11:47:15 +01:00 · fd8886da6a
commit fd8886da6a
parent bee6d0e1f7
2 changed files with 6 additions and 0 deletions
--- a/eland/ml/pytorch/transformers.py
+++ b/eland/ml/pytorch/transformers.py
@ -770,6 +770,9 @@ class TransformerModel:
            tokenization_config.span = 128
            tokenization_config.truncate = "none"

+        if self._task_type == "text_similarity":
+            tokenization_config.truncate = "second"
+
        if self._traceable_model.classification_labels():
            inference_config = TASK_TYPE_TO_INFERENCE_CONFIG[self._task_type](
                tokenization=tokenization_config,
--- a/tests/ml/pytorch/test_pytorch_model_config_pytest.py
+++ b/tests/ml/pytorch/test_pytorch_model_config_pytest.py
@ -217,6 +217,9 @@ class TestModelConfguration:
                assert isinstance(config.inference_config.classification_labels, list)
                assert len(config.inference_config.classification_labels) > 0

+            if task_type == "text_similarity":
+                assert tokenization.truncate == "second"
+
            del tm

    def test_model_config_with_prefix_string(self):