Fix tokeniser for DeBERTa models (#769)

2025-07-11 00:02:14 +08:00 · 2025-04-23 09:10:02 +01:00 · 2025-04-23 09:10:02 +01:00 · a9c36927f6
commit a9c36927f6
parent 87380ef716
6 changed files with 377 additions and 291 deletions
--- a/eland/ml/pytorch/_pytorch_model.py
+++ b/eland/ml/pytorch/_pytorch_model.py
@ -126,6 +126,7 @@ class PyTorchModel:
    def infer(
        self,
        docs: List[Mapping[str, str]],
        inference_config: Optional[Mapping[str, Any]] = None,
        timeout: str = DEFAULT_TIMEOUT,
    ) -> Any:
        if docs is None:
@ -133,6 +134,8 @@ class PyTorchModel:
        __body: Dict[str, Any] = {}
        __body["docs"] = docs
        if inference_config is not None:
            __body["inference_config"] = inference_config
        __path = f"/_ml/trained_models/{_quote(self.model_id)}/_infer"
        __query: Dict[str, Any] = {}
--- a/eland/ml/pytorch/nlp_ml_model.py
+++ b/eland/ml/pytorch/nlp_ml_model.py
@ -86,7 +86,7 @@ class NlpXLMRobertaTokenizationConfig(NlpTokenizationConfig):
        )
-class DebertaV2Config(NlpTokenizationConfig):
+class NlpDebertaV2TokenizationConfig(NlpTokenizationConfig):
    def __init__(
        self,
        *,
--- a/eland/ml/pytorch/transformers.py
+++ b/eland/ml/pytorch/transformers.py
@ -25,17 +25,13 @@ import os.path
 import random
 import re
 from abc import ABC, abstractmethod
-from typing import Any, Dict, List, Optional, Set, Tuple, Union
+from typing import Dict, List, Optional, Set, Tuple, Union
 import torch  # type: ignore
 import transformers  # type: ignore
-from sentence_transformers import SentenceTransformer  # type: ignore
+from torch import Tensor
 from torch import Tensor, nn
 from torch.profiler import profile  # type: ignore
 from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForQuestionAnswering,
    BertTokenizer,
    PretrainedConfig,
    PreTrainedModel,
@ -44,11 +40,11 @@ from transformers import (
 )
 from eland.ml.pytorch.nlp_ml_model import (
    DebertaV2Config,
    FillMaskInferenceOptions,
    NerInferenceOptions,
    NlpBertJapaneseTokenizationConfig,
    NlpBertTokenizationConfig,
    NlpDebertaV2TokenizationConfig,
    NlpMPNetTokenizationConfig,
    NlpRobertaTokenizationConfig,
    NlpTokenizationConfig,
@ -65,8 +61,13 @@ from eland.ml.pytorch.nlp_ml_model import (
    ZeroShotClassificationInferenceOptions,
 )
 from eland.ml.pytorch.traceable_model import TraceableModel
 from eland.ml.pytorch.wrappers import (
    _DistilBertWrapper,
    _DPREncoderWrapper,
    _QuestionAnsweringWrapperModule,
    _SentenceTransformerWrapperModule,
 )
 # Default key used to select the tensor returned from the SentenceTransformer
 # output (see the ``output_key`` parameters below).
 DEFAULT_OUTPUT_KEY = "sentence_embedding"
 SUPPORTED_TASK_TYPES = {
    "fill_mask",
    "ner",
@ -172,284 +173,6 @@ def task_type_from_model_config(model_config: PretrainedConfig) -> Optional[str]
    return potential_task_types.pop()
 class _QuestionAnsweringWrapperModule(nn.Module):  # type: ignore
    """
    A wrapper around a question answering model.
    Our inference engine only takes the first tuple if the inference response
    is a tuple.
    This wrapper transforms the output to be a stacked tensor if it is a
    tuple. Otherwise it passes it through.
    This base class only stores the model and selects the concrete wrapper;
    the ``forward`` methods are defined by the subclasses.
    """
    def __init__(self, model: PreTrainedModel):
        """Keep a reference to the wrapped model and expose its config."""
        super().__init__()
        self._hf_model = model
        self.config = model.config
    @staticmethod
    def from_pretrained(model_id: str, *, token: Optional[str] = None) -> Optional[Any]:
        """
        Load a question answering model and wrap it.
        :param model_id: identifier handed to
            ``AutoModelForQuestionAnswering.from_pretrained``
        :param token: optional token forwarded to the same call
        :return: the two-parameter wrapper for MPNet, XLM-RoBERTa, RoBERTa and
            BART configs, otherwise the four-parameter wrapper
        """
        model = AutoModelForQuestionAnswering.from_pretrained(
            model_id, token=token, torchscript=True
        )
        # The wrapper is chosen from the type of the loaded model's config.
        if isinstance(
            model.config,
            (
                transformers.MPNetConfig,
                transformers.XLMRobertaConfig,
                transformers.RobertaConfig,
                transformers.BartConfig,
            ),
        ):
            return _TwoParameterQuestionAnsweringWrapper(model)
        else:
            return _QuestionAnsweringWrapper(model)
 class _QuestionAnsweringWrapper(_QuestionAnsweringWrapperModule):
    """
    Question answering wrapper whose ``forward`` takes four tensors: input
    ids, attention mask, token type ids and position ids.
    """
    def __init__(self, model: PreTrainedModel):
        super().__init__(model=model)
    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Tensor,
        token_type_ids: Tensor,
        position_ids: Tensor,
    ) -> Tensor:
        """
        Wrap the input and output to conform to the native process interface.
        The tensors are passed to the wrapped model as keyword arguments; for
        DistilBERT configs ``token_type_ids`` and ``position_ids`` are dropped
        first. A tuple response is stacked along a new leading dimension, any
        other response is returned unchanged.
        """
        inputs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
            "position_ids": position_ids,
        }
        # remove inputs for specific model types
        if isinstance(self._hf_model.config, transformers.DistilBertConfig):
            del inputs["token_type_ids"]
            del inputs["position_ids"]
        response = self._hf_model(**inputs)
        if isinstance(response, tuple):
            # Combine the tuple members into a single tensor (new dimension 0).
            return torch.stack(list(response), dim=0)
        return response
 class _TwoParameterQuestionAnsweringWrapper(_QuestionAnsweringWrapperModule):
    """
    Question answering wrapper whose ``forward`` takes only the input ids and
    the attention mask.
    """
    def __init__(self, model: PreTrainedModel):
        super().__init__(model=model)
    def forward(self, input_ids: Tensor, attention_mask: Tensor) -> Tensor:
        """
        Wrap the input and output to conform to the native process interface.
        Both tensors are passed to the wrapped model as keyword arguments. A
        tuple response is stacked along a new leading dimension, any other
        response is returned unchanged.
        """
        inputs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
        }
        response = self._hf_model(**inputs)
        if isinstance(response, tuple):
            # Combine the tuple members into a single tensor (new dimension 0).
            return torch.stack(list(response), dim=0)
        return response
 class _DistilBertWrapper(nn.Module):  # type: ignore
    """
    In Elasticsearch the BERT tokenizer is used for DistilBERT models but
    the BERT tokenizer produces 4 inputs where DistilBERT models expect 2.
    Wrap the model's forward function in a method that accepts the 4
    arguments passed to a BERT model then discard the token_type_ids
    and the position_ids to match the wrapped DistilBERT model forward
    function
    """
    def __init__(self, model: transformers.PreTrainedModel):
        """Keep a reference to the wrapped model and expose its config."""
        super().__init__()
        self._model = model
        self.config = model.config
    @staticmethod
    def try_wrapping(model: PreTrainedModel) -> Optional[Any]:
        """
        Wrap ``model`` only when its config is a ``DistilBertConfig``.
        Any other model is returned as it was passed in.
        """
        if isinstance(model.config, transformers.DistilBertConfig):
            return _DistilBertWrapper(model)
        else:
            return model
    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Tensor,
        _token_type_ids: Tensor = None,
        _position_ids: Tensor = None,
    ) -> Tensor:
        """
        Wrap the input and output to conform to the native process interface.
        ``_token_type_ids`` and ``_position_ids`` are accepted but never read;
        only the input ids and attention mask reach the wrapped model.
        """
        return self._model(input_ids=input_ids, attention_mask=attention_mask)
 class _SentenceTransformerWrapperModule(nn.Module):  # type: ignore
    """
    A wrapper around sentence-transformer models to provide pooling,
    normalization and other graph layers that are not defined in the base
    HuggingFace transformer model.
    """
    def __init__(self, model: PreTrainedModel, output_key: str = DEFAULT_OUTPUT_KEY):
        """
        Build the SentenceTransformer pipeline around ``model``.
        :param model: Hugging Face model to place inside the pipeline
        :param output_key: key the subclasses use to select the tensor
            returned from the SentenceTransformer output
        """
        super().__init__()
        self._hf_model = model
        # The SentenceTransformer is created from the name/path recorded in
        # the model's config.
        self._st_model = SentenceTransformer(model.config.name_or_path)
        self._output_key = output_key
        self.config = model.config
        self._remove_pooling_layer()
        self._replace_transformer_layer()
    @staticmethod
    def from_pretrained(
        model_id: str,
        tokenizer: PreTrainedTokenizer,
        *,
        token: Optional[str] = None,
        output_key: str = DEFAULT_OUTPUT_KEY,
    ) -> Optional[Any]:
        """
        Load a model with ``AutoModel`` and wrap it.
        :param model_id: identifier handed to ``AutoModel.from_pretrained``
        :param tokenizer: tokenizer whose type decides which wrapper is used
        :param token: optional token forwarded to ``AutoModel.from_pretrained``
        :param output_key: passed through to the wrapper constructor
        :return: the two-parameter wrapper for BART, MPNet, RoBERTa,
            XLM-RoBERTa and DeBERTa-v2 tokenizers, otherwise the
            four-parameter wrapper
        """
        model = AutoModel.from_pretrained(model_id, token=token, torchscript=True)
        if isinstance(
            tokenizer,
            (
                transformers.BartTokenizer,
                transformers.MPNetTokenizer,
                transformers.RobertaTokenizer,
                transformers.XLMRobertaTokenizer,
                transformers.DebertaV2Tokenizer,
            ),
        ):
            return _TwoParameterSentenceTransformerWrapper(model, output_key)
        else:
            return _SentenceTransformerWrapper(model, output_key)
    def _remove_pooling_layer(self) -> None:
        """
        Removes any last pooling layer which is not used to create embeddings.
        Leaving this layer in will cause it to return a NoneType which in turn
        will fail to load in libtorch. Alternatively, we can just use the output
        of the pooling layer as a dummy but this also affects (if only in a
        minor way) the performance of inference, so we're better off removing
        the layer if we can.
        """
        # Only models that actually have a ``pooler`` attribute are touched.
        if hasattr(self._hf_model, "pooler"):
            self._hf_model.pooler = None
    def _replace_transformer_layer(self) -> None:
        """
        Replaces the HuggingFace Transformer layer in the SentenceTransformer
        modules so we can set it with one that has pooling layer removed and
        was loaded ready for TorchScript export.
        """
        # The module registered under "0" receives the wrapped model as its
        # ``auto_model``.
        self._st_model._modules["0"].auto_model = self._hf_model
 class _SentenceTransformerWrapper(_SentenceTransformerWrapperModule):
    """
    Sentence-transformer wrapper whose ``forward`` takes four tensors: input
    ids, attention mask, token type ids and position ids.
    """
    def __init__(self, model: PreTrainedModel, output_key: str = DEFAULT_OUTPUT_KEY):
        super().__init__(model=model, output_key=output_key)
    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Tensor,
        token_type_ids: Tensor,
        position_ids: Tensor,
    ) -> Tensor:
        """
        Wrap the input and output to conform to the native process interface.
        The tensors are handed to the SentenceTransformer as one dict and the
        entry stored under the configured output key is returned.
        """
        inputs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
            "position_ids": position_ids,
        }
        # remove inputs for specific model types
        # NOTE(review): unlike the question answering wrapper, position_ids
        # stays in the dict for DistilBERT - presumably the SentenceTransformer
        # modules ignore it; confirm.
        if isinstance(self._hf_model.config, transformers.DistilBertConfig):
            del inputs["token_type_ids"]
        return self._st_model(inputs)[self._output_key]
 class _TwoParameterSentenceTransformerWrapper(_SentenceTransformerWrapperModule):
    """
    Sentence-transformer wrapper whose ``forward`` takes only the input ids
    and the attention mask.
    """
    def __init__(self, model: PreTrainedModel, output_key: str = DEFAULT_OUTPUT_KEY):
        super().__init__(model=model, output_key=output_key)
    def forward(self, input_ids: Tensor, attention_mask: Tensor) -> Tensor:
        """
        Wrap the input and output to conform to the native process interface.
        Both tensors are handed to the SentenceTransformer as one dict and the
        entry stored under the configured output key is returned.
        """
        inputs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
        }
        return self._st_model(inputs)[self._output_key]
 class _DPREncoderWrapper(nn.Module):  # type: ignore
    """
    AutoModel loading does not work for DPRContextEncoders, this only exists as
    a workaround. This may never be fixed so this is likely permanent.
    See: https://github.com/huggingface/transformers/issues/13670
    """
    # Encoder classes this wrapper is able to load.
    _SUPPORTED_MODELS = {
        transformers.DPRContextEncoder,
        transformers.DPRQuestionEncoder,
    }
    # Class names of the supported encoders, matched against
    # ``config.architectures`` in ``from_pretrained``.
    _SUPPORTED_MODELS_NAMES = set([x.__name__ for x in _SUPPORTED_MODELS])
    def __init__(
        self,
        model: Union[transformers.DPRContextEncoder, transformers.DPRQuestionEncoder],
    ):
        """Keep a reference to the wrapped encoder and expose its config."""
        super().__init__()
        self._model = model
        self.config = model.config
    @staticmethod
    def from_pretrained(model_id: str, *, token: Optional[str] = None) -> Optional[Any]:
        """
        Load and wrap a DPR encoder, or return ``None`` for any other model.
        :param model_id: identifier used to load the config and the encoder
        :param token: optional token forwarded when loading the config
        :return: a ``_DPREncoderWrapper``, or ``None`` when the config is not
            a DPR config naming exactly one supported architecture
        """
        config = AutoConfig.from_pretrained(model_id, token=token)
        def is_compatible() -> bool:
            # The model type must be "dpr" and the config must list exactly
            # one architecture, which must be one of the supported class names.
            is_dpr_model = config.model_type == "dpr"
            has_architectures = (
                config.architectures is not None and len(config.architectures) == 1
            )
            is_supported_architecture = has_architectures and (
                config.architectures[0] in _DPREncoderWrapper._SUPPORTED_MODELS_NAMES
            )
            return is_dpr_model and is_supported_architecture
        if is_compatible():
            # The encoder class is looked up on the transformers module by the
            # architecture name from the config.
            # NOTE(review): ``token`` is passed when loading the config above
            # but not to this call.
            model = getattr(transformers, config.architectures[0]).from_pretrained(
                model_id, torchscript=True
            )
            return _DPREncoderWrapper(model)
        else:
            return None
    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Tensor,
        token_type_ids: Tensor,
        _position_ids: Tensor,
    ) -> Tensor:
        """
        Wrap the input and output to conform to the native process interface.
        ``_position_ids`` is accepted but never read; the other three tensors
        are passed to the wrapped encoder as keyword arguments.
        """
        return self._model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
 class _TransformerTraceableModel(TraceableModel):
    """A base class representing a HuggingFace transformer model that can be traced."""
@ -489,12 +212,17 @@ class _TransformerTraceableModel(TraceableModel):
                transformers.MPNetTokenizer,
                transformers.RobertaTokenizer,
                transformers.XLMRobertaTokenizer,
                transformers.DebertaV2Tokenizer,
            ),
        ):
            del inputs["token_type_ids"]
            return (inputs["input_ids"], inputs["attention_mask"])
        if isinstance(self._tokenizer, transformers.DebertaV2Tokenizer):
            return (
                inputs["input_ids"],
                inputs["attention_mask"],
                inputs["token_type_ids"],
            )
        position_ids = torch.arange(inputs["input_ids"].size(1), dtype=torch.long)
        inputs["position_ids"] = position_ids
        return (
@ -694,7 +422,12 @@ class TransformerModel:
                " ".join(m) for m, _ in sorted(ranks.items(), key=lambda kv: kv[1])
            ]
            vocab_obj["merges"] = merges
-        sp_model = getattr(self._tokenizer, "sp_model", None)
+
        if isinstance(self._tokenizer, transformers.DebertaV2Tokenizer):
            sp_model = self._tokenizer._tokenizer.spm
        else:
            sp_model = getattr(self._tokenizer, "sp_model", None)
        if sp_model:
            id_correction = getattr(self._tokenizer, "fairseq_offset", 0)
            scores = []
@ -733,7 +466,7 @@ class TransformerModel:
                max_sequence_length=_max_sequence_length
            )
        elif isinstance(self._tokenizer, transformers.DebertaV2Tokenizer):
-            return DebertaV2Config(
+            return NlpDebertaV2TokenizationConfig(
                max_sequence_length=_max_sequence_length,
                do_lower_case=getattr(self._tokenizer, "do_lower_case", None),
            )
--- a/eland/ml/pytorch/wrappers.py
+++ b/eland/ml/pytorch/wrappers.py
@ -0,0 +1,317 @@
 #  Licensed to Elasticsearch B.V. under one or more contributor
 #  license agreements. See the NOTICE file distributed with
 #  this work for additional information regarding copyright
 #  ownership. Elasticsearch B.V. licenses this file to you under
 #  the Apache License, Version 2.0 (the "License"); you may
 #  not use this file except in compliance with the License.
 #  You may obtain a copy of the License at
 #
 # 	http://www.apache.org/licenses/LICENSE-2.0
 #
 #  Unless required by applicable law or agreed to in writing,
 #  software distributed under the License is distributed on an
 #  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 #  KIND, either express or implied.  See the License for the
 #  specific language governing permissions and limitations
 #  under the License.
 """
 This module contains the wrapper classes for the Hugging Face models.
 Wrapping is necessary to ensure that the forward method of the model
 is called with the same arguments the ml-cpp pytorch_inference process
 uses.
 """
 from typing import Any, Optional, Union
 import torch  # type: ignore
 import transformers  # type: ignore
 from sentence_transformers import SentenceTransformer  # type: ignore
 from torch import Tensor, nn
 from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForQuestionAnswering,
    PreTrainedModel,
    PreTrainedTokenizer,
 )
 # Default key used to select the tensor returned from the SentenceTransformer
 # output (see the ``output_key`` parameters below).
 DEFAULT_OUTPUT_KEY = "sentence_embedding"
 class _QuestionAnsweringWrapperModule(nn.Module):  # type: ignore
    """
    Common base of the question answering wrappers.
    Our inference engine only takes the first tuple if the inference response
    is a tuple, so the concrete wrappers stack a tuple response into a single
    tensor and pass any other response through. This class holds the wrapped
    model and picks the concrete wrapper for it.
    """
    def __init__(self, model: PreTrainedModel):
        """Store the wrapped model and expose its config."""
        super().__init__()
        self._hf_model = model
        self.config = model.config
    @staticmethod
    def from_pretrained(model_id: str, *, token: Optional[str] = None) -> Optional[Any]:
        """
        Load ``model_id`` as a question answering model and wrap it.
        MPNet, XLM-RoBERTa, RoBERTa and BART configs get the two-parameter
        wrapper; every other config gets the four-parameter wrapper.
        """
        hf_model = AutoModelForQuestionAnswering.from_pretrained(
            model_id, token=token, torchscript=True
        )
        two_parameter_configs = (
            transformers.MPNetConfig,
            transformers.XLMRobertaConfig,
            transformers.RobertaConfig,
            transformers.BartConfig,
        )
        if not isinstance(hf_model.config, two_parameter_configs):
            return _QuestionAnsweringWrapper(hf_model)
        return _TwoParameterQuestionAnsweringWrapper(hf_model)
 class _QuestionAnsweringWrapper(_QuestionAnsweringWrapperModule):
    """Question answering wrapper with the four-tensor ``forward`` signature."""
    def __init__(self, model: PreTrainedModel):
        super().__init__(model=model)
    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Tensor,
        token_type_ids: Tensor,
        position_ids: Tensor,
    ) -> Tensor:
        """
        Adapt the native process call to the wrapped model.
        The token type ids and position ids are withheld from DistilBERT
        models. A tuple response is stacked into one tensor.
        """
        model_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask}
        # Only non-DistilBERT models receive the two extra inputs.
        if not isinstance(self._hf_model.config, transformers.DistilBertConfig):
            model_kwargs["token_type_ids"] = token_type_ids
            model_kwargs["position_ids"] = position_ids
        output = self._hf_model(**model_kwargs)
        if not isinstance(output, tuple):
            return output
        return torch.stack(list(output), dim=0)
 class _TwoParameterQuestionAnsweringWrapper(_QuestionAnsweringWrapperModule):
    """Question answering wrapper with the two-tensor ``forward`` signature."""
    def __init__(self, model: PreTrainedModel):
        super().__init__(model=model)
    def forward(self, input_ids: Tensor, attention_mask: Tensor) -> Tensor:
        """
        Adapt the native process call to the wrapped model.
        A tuple response is stacked into one tensor; anything else is
        returned as is.
        """
        output = self._hf_model(input_ids=input_ids, attention_mask=attention_mask)
        if not isinstance(output, tuple):
            return output
        return torch.stack(list(output), dim=0)
 class _DistilBertWrapper(nn.Module):  # type: ignore
    """
    Adapter that lets a DistilBERT model be called with BERT-style inputs.
    In Elasticsearch the BERT tokenizer is used for DistilBERT models, and it
    produces 4 inputs where DistilBERT models expect 2. The ``forward`` of this
    wrapper accepts all 4 and discards the token_type_ids and position_ids
    before calling the wrapped model.
    """
    def __init__(self, model: transformers.PreTrainedModel):
        """Store the wrapped model and expose its config."""
        super().__init__()
        self._model = model
        self.config = model.config
    @staticmethod
    def try_wrapping(model: PreTrainedModel) -> Optional[Any]:
        """Return ``model`` wrapped if it is a DistilBERT model, else unchanged."""
        needs_wrapper = isinstance(model.config, transformers.DistilBertConfig)
        return _DistilBertWrapper(model) if needs_wrapper else model
    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Tensor,
        _token_type_ids: Tensor = None,
        _position_ids: Tensor = None,
    ) -> Tensor:
        """Call the wrapped model with the input ids and attention mask only."""
        model_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask}
        return self._model(**model_kwargs)
 class _SentenceTransformerWrapperModule(nn.Module):  # type: ignore
    """
    Common base of the sentence-transformer wrappers.
    Sentence-transformer models add pooling, normalization and other graph
    layers that the base HuggingFace transformer model does not define, so
    the HuggingFace model is embedded in a SentenceTransformer pipeline.
    """
    def __init__(self, model: PreTrainedModel, output_key: str = DEFAULT_OUTPUT_KEY):
        """
        Create the SentenceTransformer for ``model`` and splice ``model``
        into it. ``output_key`` selects the tensor the subclasses return.
        """
        super().__init__()
        hf_config = model.config
        self._hf_model = model
        self._st_model = SentenceTransformer(hf_config.name_or_path)
        self._output_key = output_key
        self.config = hf_config
        self._remove_pooling_layer()
        self._replace_transformer_layer()
    @staticmethod
    def from_pretrained(
        model_id: str,
        tokenizer: PreTrainedTokenizer,
        *,
        token: Optional[str] = None,
        output_key: str = DEFAULT_OUTPUT_KEY,
    ) -> Optional[Any]:
        """
        Load ``model_id`` with ``AutoModel`` and wrap it.
        BART, MPNet, RoBERTa, XLM-RoBERTa and DeBERTa-v2 tokenizers select
        the two-parameter wrapper; any other tokenizer selects the
        four-parameter wrapper.
        """
        hf_model = AutoModel.from_pretrained(model_id, token=token, torchscript=True)
        two_parameter_tokenizers = (
            transformers.BartTokenizer,
            transformers.MPNetTokenizer,
            transformers.RobertaTokenizer,
            transformers.XLMRobertaTokenizer,
            transformers.DebertaV2Tokenizer,
        )
        if not isinstance(tokenizer, two_parameter_tokenizers):
            return _SentenceTransformerWrapper(hf_model, output_key)
        return _TwoParameterSentenceTransformerWrapper(hf_model, output_key)
    def _remove_pooling_layer(self) -> None:
        """
        Drop the final pooling layer, which is not used to create embeddings.
        If the layer is left in it returns a NoneType, which fails to load in
        libtorch. Using its output as a dummy would also work but affects
        inference performance (if only slightly), so the layer is removed
        whenever the model has one.
        """
        hf_model = self._hf_model
        if not hasattr(hf_model, "pooler"):
            return
        hf_model.pooler = None
    def _replace_transformer_layer(self) -> None:
        """
        Swap the HuggingFace Transformer layer inside the SentenceTransformer
        modules for the one held here, which has its pooling layer removed
        and was loaded ready for TorchScript export.
        """
        transformer_module = self._st_model._modules["0"]
        transformer_module.auto_model = self._hf_model
 class _SentenceTransformerWrapper(_SentenceTransformerWrapperModule):
    """Sentence-transformer wrapper with the four-tensor ``forward`` signature."""
    def __init__(self, model: PreTrainedModel, output_key: str = DEFAULT_OUTPUT_KEY):
        super().__init__(model=model, output_key=output_key)
    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Tensor,
        token_type_ids: Tensor,
        position_ids: Tensor,
    ) -> Tensor:
        """
        Adapt the native process call to the SentenceTransformer pipeline.
        The token type ids are withheld from DistilBERT models. The tensor
        stored under the configured output key is returned.
        """
        features = {"input_ids": input_ids, "attention_mask": attention_mask}
        # Only non-DistilBERT models receive the token type ids.
        if not isinstance(self._hf_model.config, transformers.DistilBertConfig):
            features["token_type_ids"] = token_type_ids
        features["position_ids"] = position_ids
        outputs = self._st_model(features)
        return outputs[self._output_key]
 class _TwoParameterSentenceTransformerWrapper(_SentenceTransformerWrapperModule):
    """Sentence-transformer wrapper with the two-tensor ``forward`` signature."""
    def __init__(self, model: PreTrainedModel, output_key: str = DEFAULT_OUTPUT_KEY):
        super().__init__(model=model, output_key=output_key)
    def forward(self, input_ids: Tensor, attention_mask: Tensor) -> Tensor:
        """
        Adapt the native process call to the SentenceTransformer pipeline and
        return the tensor stored under the configured output key.
        """
        features = {"input_ids": input_ids, "attention_mask": attention_mask}
        outputs = self._st_model(features)
        return outputs[self._output_key]
 class _DPREncoderWrapper(nn.Module):  # type: ignore
    """
    AutoModel loading does not work for DPRContextEncoders, this only exists as
    a workaround. This may never be fixed so this is likely permanent.
    See: https://github.com/huggingface/transformers/issues/13670
    """
    # Encoder classes this wrapper is able to load.
    _SUPPORTED_MODELS = {
        transformers.DPRContextEncoder,
        transformers.DPRQuestionEncoder,
    }
    # Class names of the supported encoders, matched against
    # ``config.architectures`` in ``from_pretrained``.
    _SUPPORTED_MODELS_NAMES = {x.__name__ for x in _SUPPORTED_MODELS}
    def __init__(
        self,
        model: Union[transformers.DPRContextEncoder, transformers.DPRQuestionEncoder],
    ):
        """Keep a reference to the wrapped encoder and expose its config."""
        super().__init__()
        self._model = model
        self.config = model.config
    @staticmethod
    def from_pretrained(model_id: str, *, token: Optional[str] = None) -> Optional[Any]:
        """
        Load and wrap a DPR encoder, or return ``None`` for any other model.
        :param model_id: identifier used to load the config and the encoder
        :param token: optional token forwarded to both the config load and
            the encoder load
        :return: a ``_DPREncoderWrapper``, or ``None`` when the config is not
            a DPR config naming exactly one supported architecture
        """
        config = AutoConfig.from_pretrained(model_id, token=token)
        def is_compatible() -> bool:
            # The model type must be "dpr" and the config must list exactly
            # one architecture, which must be one of the supported class names.
            is_dpr_model = config.model_type == "dpr"
            has_architectures = (
                config.architectures is not None and len(config.architectures) == 1
            )
            is_supported_architecture = has_architectures and (
                config.architectures[0] in _DPREncoderWrapper._SUPPORTED_MODELS_NAMES
            )
            return is_dpr_model and is_supported_architecture
        if not is_compatible():
            return None
        # The encoder class is looked up on the transformers module by the
        # architecture name from the config. The token is forwarded here as
        # well so that the weights are fetched with the same credentials as
        # the config.
        encoder_cls = getattr(transformers, config.architectures[0])
        model = encoder_cls.from_pretrained(model_id, token=token, torchscript=True)
        return _DPREncoderWrapper(model)
    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Tensor,
        token_type_ids: Tensor,
        _position_ids: Tensor,
    ) -> Tensor:
        """
        Wrap the input and output to conform to the native process interface.
        ``_position_ids`` is accepted but never read; the other three tensors
        are passed to the wrapped encoder as keyword arguments.
        """
        return self._model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
--- a/tests/ml/pytorch/test_pytorch_model_config_pytest.py
+++ b/tests/ml/pytorch/test_pytorch_model_config_pytest.py
@ -39,6 +39,7 @@ try:
    from eland.ml.pytorch import (
        FillMaskInferenceOptions,
        NlpBertTokenizationConfig,
        NlpDebertaV2TokenizationConfig,
        NlpMPNetTokenizationConfig,
        NlpRobertaTokenizationConfig,
        NlpXLMRobertaTokenizationConfig,
@ -149,6 +150,14 @@ if HAS_PYTORCH and HAS_SKLEARN and HAS_TRANSFORMERS:
            1024,
            None,
        ),
        (
            "microsoft/deberta-v3-xsmall",
            "fill_mask",
            FillMaskInferenceOptions,
            NlpDebertaV2TokenizationConfig,
            512,
            None,
        ),
    ]
 else:
    MODEL_CONFIGURATIONS = []
--- a/tests/ml/pytorch/test_pytorch_model_upload_pytest.py
+++ b/tests/ml/pytorch/test_pytorch_model_upload_pytest.py
@ -67,6 +67,8 @@ TEXT_EMBEDDING_MODELS = [
    )
 ]
 TEXT_SIMILARITY_MODELS = ["mixedbread-ai/mxbai-rerank-xsmall-v1"]
@pytest.fixture(scope="function", autouse=True)
 def setup_and_tear_down():
@ -135,3 +137,25 @@ class TestPytorchModel:
                    )
                    > 0
                )
    @pytest.mark.skipif(
        ES_VERSION < (8, 16, 0), reason="requires 8.16.0 for DeBERTa models"
    )
    @pytest.mark.parametrize("model_id", TEXT_SIMILARITY_MODELS)
    def test_text_similarity(self, model_id):
        """
        Deploy a text similarity model and score two documents against the
        text "France": the first is expected to score below zero and the
        second above zero.
        """
        first_doc = {
            "text_field": "The Amazon rainforest covers most of the Amazon basin in South America"
        }
        second_doc = {"text_field": "Paris is the capital of France"}
        similarity_config = {"text_similarity": {"text": "France"}}
        with tempfile.TemporaryDirectory() as tmp_dir:
            ptm = download_model_and_start_deployment(
                tmp_dir, False, model_id, "text_similarity"
            )
            result = ptm.infer(
                docs=[first_doc, second_doc],
                inference_config=similarity_config,
            )
            inference_results = result.body["inference_results"]
            assert inference_results[0]["predicted_value"] < 0
            assert inference_results[1]["predicted_value"] > 0