Default truncation to second for the text similarity task type (#713)

In reranking, the first input (the query) is generally shorter. In this case,
it makes more sense to truncate the second input (the document text).
This commit is contained in:
David Kyle 2024-08-05 11:47:15 +01:00 committed by GitHub
parent bee6d0e1f7
commit fd8886da6a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 6 additions and 0 deletions

View File

@ -770,6 +770,9 @@ class TransformerModel:
tokenization_config.span = 128
tokenization_config.truncate = "none"
if self._task_type == "text_similarity":
tokenization_config.truncate = "second"
if self._traceable_model.classification_labels():
inference_config = TASK_TYPE_TO_INFERENCE_CONFIG[self._task_type](
tokenization=tokenization_config,

View File

@ -217,6 +217,9 @@ class TestModelConfguration:
assert isinstance(config.inference_config.classification_labels, list)
assert len(config.inference_config.classification_labels) > 0
if task_type == "text_similarity":
assert tokenization.truncate == "second"
del tm
def test_model_config_with_prefix_string(self):