[NLP] Tests for NLP model configurations (#623)

Add tests for generated Elasticsearch model configurations
2025-07-11 00:02:14 +08:00 · 2023-10-19 12:39:57 +01:00 · 2023-10-19 12:39:57 +01:00 · ab6e44f430
commit ab6e44f430
parent 0c0a8ab19f
6 changed files with 263 additions and 29 deletions
--- a/eland/common.py
+++ b/eland/common.py
@ -311,7 +311,7 @@ def ensure_es_client(
    if isinstance(es_client, tuple):
        es_client = list(es_client)
    if not isinstance(es_client, Elasticsearch):
-        es_client = Elasticsearch(es_client)  # type: ignore[arg-type]
+        es_client = Elasticsearch(es_client)
    return es_client
--- a/eland/ml/pytorch/init.py
+++ b/eland/ml/pytorch/init.py
@ -17,10 +17,17 @@
 from eland.ml.pytorch._pytorch_model import PyTorchModel  # noqa: F401
 # Re-exported configuration classes: every class named in this module's
 # __all__ has to be bound here, otherwise a star-import of the package fails.
 from eland.ml.pytorch.nlp_ml_model import (
    FillMaskInferenceOptions,
    NerInferenceOptions,
    NlpBertTokenizationConfig,
    NlpMPNetTokenizationConfig,
    NlpRobertaTokenizationConfig,
    NlpTrainedModelConfig,
    NlpXLMRobertaTokenizationConfig,
    QuestionAnsweringInferenceOptions,
    TextClassificationInferenceOptions,
    TextEmbeddingInferenceOptions,
    TextSimilarityInferenceOptions,
    ZeroShotClassificationInferenceOptions,
 )
 from eland.ml.pytorch.traceable_model import TraceableModel  # noqa: F401
 from eland.ml.pytorch.transformers import task_type_from_model_config
@ -28,10 +35,17 @@ from eland.ml.pytorch.transformers import task_type_from_model_config
 __all__ = [
    "PyTorchModel",
    "TraceableModel",
    "FillMaskInferenceOptions",
    "NerInferenceOptions",
    "NlpTrainedModelConfig",
    "NlpBertTokenizationConfig",
    "NlpRobertaTokenizationConfig",
    "NlpXLMRobertaTokenizationConfig",
    "NlpMPNetTokenizationConfig",
    "QuestionAnsweringInferenceOptions",
    "TextClassificationInferenceOptions",
    "TextEmbeddingInferenceOptions",
    "TextSimilarityInferenceOptions",
    "ZeroShotClassificationInferenceOptions",
    "task_type_from_model_config",
 ]
--- a/eland/ml/pytorch/nlp_ml_model.py
+++ b/eland/ml/pytorch/nlp_ml_model.py
@ -317,11 +317,9 @@ class NlpTrainedModelConfig:
        input: TrainedModelInput = TrainedModelInput(field_names=["text_field"]),
        metadata: t.Optional[dict] = None,
        model_type: t.Union["t.Literal['pytorch']", str] = "pytorch",
        default_field_map: t.Optional[t.Mapping[str, str]] = None,
        tags: t.Optional[t.Union[t.List[str], t.Tuple[str, ...]]] = None,
    ):
        self.tags = tags
        self.default_field_map = default_field_map
        self.description = description
        self.inference_config = inference_config
        self.input = input
--- a/eland/ml/pytorch/transformers.py
+++ b/eland/ml/pytorch/transformers.py
@ -664,27 +664,23 @@ class TransformerModel:
        return vocab_obj
    def _create_tokenization_config(self) -> NlpTokenizationConfig:
        _max_sequence_length = self._find_max_sequence_length()
        if isinstance(self._tokenizer, transformers.MPNetTokenizer):
            return NlpMPNetTokenizationConfig(
                do_lower_case=getattr(self._tokenizer, "do_lower_case", None),
-                max_sequence_length=getattr(
+                max_sequence_length=_max_sequence_length,
                    self._tokenizer, "max_model_input_sizes", dict()
                ).get(self._model_id),
            )
        elif isinstance(
            self._tokenizer, (transformers.RobertaTokenizer, transformers.BartTokenizer)
        ):
            return NlpRobertaTokenizationConfig(
                add_prefix_space=getattr(self._tokenizer, "add_prefix_space", None),
-                max_sequence_length=getattr(
+                max_sequence_length=_max_sequence_length,
                    self._tokenizer, "max_model_input_sizes", dict()
                ).get(self._model_id),
            )
        elif isinstance(self._tokenizer, transformers.XLMRobertaTokenizer):
            return NlpXLMRobertaTokenizationConfig(
-                max_sequence_length=getattr(
+                max_sequence_length=_max_sequence_length
                    self._tokenizer, "max_model_input_sizes", dict()
                ).get(self._model_id),
            )
        else:
            japanese_morphological_tokenizers = ["mecab"]
@ -695,18 +691,38 @@ class TransformerModel:
            ):
                return NlpBertJapaneseTokenizationConfig(
                    do_lower_case=getattr(self._tokenizer, "do_lower_case", None),
-                    max_sequence_length=getattr(
+                    max_sequence_length=_max_sequence_length,
                        self._tokenizer, "max_model_input_sizes", dict()
                    ).get(self._model_id),
                )
            else:
                return NlpBertTokenizationConfig(
                    do_lower_case=getattr(self._tokenizer, "do_lower_case", None),
-                    max_sequence_length=getattr(
+                    max_sequence_length=_max_sequence_length,
                        self._tokenizer, "max_model_input_sizes", dict()
                    ).get(self._model_id),
                )
    def _find_max_sequence_length(self) -> int:
        """Return the maximum input sequence length for the wrapped model.

        Candidate values are tried in this order and the first plausible
        one is returned as an ``int``:

        1. ``self._tokenizer.max_model_input_sizes[self._model_id]``
        2. ``self._tokenizer.model_max_length``
        3. ``self._traceable_model._model.config.max_position_embeddings``

        A candidate is plausible when it is not ``None`` and strictly below
        ``REASONABLE_MAX_LENGTH``.

        Raises:
            ValueError: if the model has no ``config`` attribute or none of
                the candidates is plausible.
        """
        # Sometimes the max_... values are present but contain
        # a random or very large value.
        REASONABLE_MAX_LENGTH = 8192

        def _plausible(candidate) -> bool:
            return candidate is not None and candidate < REASONABLE_MAX_LENGTH

        # Defensive: the attribute may be absent or present-but-None; both
        # are treated as "no per-model entry" instead of failing on .get().
        known_sizes = getattr(self._tokenizer, "max_model_input_sizes", None) or {}
        max_len = known_sizes.get(self._model_id)
        if _plausible(max_len):
            return int(max_len)

        max_len = getattr(self._tokenizer, "model_max_length", None)
        if _plausible(max_len):
            return int(max_len)

        # Only consult the traced model once the tokenizer gave no answer.
        model_config = getattr(self._traceable_model._model, "config", None)
        if model_config is None:
            raise ValueError("Cannot determine model max input length")

        max_len = getattr(model_config, "max_position_embeddings", None)
        if _plausible(max_len):
            return int(max_len)

        raise ValueError("Cannot determine model max input length")
    def _create_config(
        self, es_version: Optional[Tuple[int, int, int]]
    ) -> NlpTrainedModelConfig:
@ -756,7 +772,7 @@ class TransformerModel:
            ),
        )
-    def _create_traceable_model(self) -> TraceableModel:
+    def _create_traceable_model(self) -> _TransformerTraceableModel:
        if self._task_type == "auto":
            model = transformers.AutoModel.from_pretrained(
                self._model_id, token=self._access_token, torchscript=True
--- a/tests/ml/pytorch/test_pytorch_model_config.py
+++ b/tests/ml/pytorch/test_pytorch_model_config.py
@ -0,0 +1,216 @@
 #  Licensed to Elasticsearch B.V. under one or more contributor
 #  license agreements. See the NOTICE file distributed with
 #  this work for additional information regarding copyright
 #  ownership. Elasticsearch B.V. licenses this file to you under
 #  the Apache License, Version 2.0 (the "License"); you may
 #  not use this file except in compliance with the License.
 #  You may obtain a copy of the License at
 #
 # 	http://www.apache.org/licenses/LICENSE-2.0
 #
 #  Unless required by applicable law or agreed to in writing,
 #  software distributed under the License is distributed on an
 #  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 #  KIND, either express or implied.  See the License for the
 #  specific language governing permissions and limitations
 #  under the License.
 import tempfile
 import pytest
 try:
    import sklearn  # noqa: F401
    HAS_SKLEARN = True
 except ImportError:
    HAS_SKLEARN = False
 try:
    from eland.ml.pytorch.transformers import TransformerModel
    HAS_TRANSFORMERS = True
 except ImportError:
    HAS_TRANSFORMERS = False
 try:
    import torch  # noqa: F401
    from eland.ml.pytorch import (
        FillMaskInferenceOptions,
        NerInferenceOptions,
        NlpBertTokenizationConfig,
        NlpMPNetTokenizationConfig,
        NlpRobertaTokenizationConfig,
        QuestionAnsweringInferenceOptions,
        TextClassificationInferenceOptions,
        TextEmbeddingInferenceOptions,
        TextSimilarityInferenceOptions,
        ZeroShotClassificationInferenceOptions,
    )
    HAS_PYTORCH = True
 except ImportError:
    HAS_PYTORCH = False
 from tests import ES_VERSION
 # Module-wide skip conditions: one skipif mark per (condition, reason) pair,
 # created in the order listed.
 pytestmark = [
    pytest.mark.skipif(should_skip, reason=why)
    for should_skip, why in (
        (
            ES_VERSION < (8, 7, 0),
            "Eland uses Pytorch 1.13.1, versions of Elasticsearch prior to 8.7.0 are incompatible with PyTorch 1.13.1",
        ),
        (not HAS_SKLEARN, "This test requires 'scikit-learn' package to run"),
        (not HAS_TRANSFORMERS, "This test requires 'transformers' package to run"),
        (not HAS_PYTORCH, "This test requires 'torch' package to run"),
    )
 ]
 # If the required imports are missing the test will be skipped.
 # Only define the test configurations if the referenced classes
 # have been imported
 # Each entry is a tuple of
 #   (model_id, task_type, config_type, tokenizer_type,
 #    max_sequence_len, embedding_size)
 # matching the parametrize signature used by the test class in this module.
 # embedding_size is None for every entry whose task is not "text_embedding".
 # The list stays empty unless all optional dependencies were imported.
 MODEL_CONFIGURATIONS = []
 if HAS_PYTORCH and HAS_SKLEARN and HAS_TRANSFORMERS:
    MODEL_CONFIGURATIONS = [
        (
            "intfloat/e5-small-v2",
            "text_embedding",
            TextEmbeddingInferenceOptions,
            NlpBertTokenizationConfig,
            512,
            384,
        ),
        (
            "sentence-transformers/all-mpnet-base-v2",
            "text_embedding",
            TextEmbeddingInferenceOptions,
            NlpMPNetTokenizationConfig,
            512,
            768,
        ),
        (
            "sentence-transformers/all-MiniLM-L12-v2",
            "text_embedding",
            TextEmbeddingInferenceOptions,
            NlpBertTokenizationConfig,
            512,
            384,
        ),
        (
            "facebook/dpr-ctx_encoder-multiset-base",
            "text_embedding",
            TextEmbeddingInferenceOptions,
            NlpBertTokenizationConfig,
            512,
            768,
        ),
        (
            "distilbert-base-uncased",
            "fill_mask",
            FillMaskInferenceOptions,
            NlpBertTokenizationConfig,
            512,
            None,
        ),
        (
            "bert-base-uncased",
            "fill_mask",
            FillMaskInferenceOptions,
            NlpBertTokenizationConfig,
            512,
            None,
        ),
        (
            "elastic/distilbert-base-uncased-finetuned-conll03-english",
            "ner",
            NerInferenceOptions,
            NlpBertTokenizationConfig,
            512,
            None,
        ),
        (
            "SamLowe/roberta-base-go_emotions",
            "text_classification",
            TextClassificationInferenceOptions,
            NlpRobertaTokenizationConfig,
            512,
            None,
        ),
        (
            "distilbert-base-cased-distilled-squad",
            "question_answering",
            QuestionAnsweringInferenceOptions,
            NlpBertTokenizationConfig,
            386,
            None,
        ),
        (
            "cross-encoder/ms-marco-TinyBERT-L-2-v2",
            "text_similarity",
            TextSimilarityInferenceOptions,
            NlpBertTokenizationConfig,
            512,
            None,
        ),
        (
            "valhalla/distilbart-mnli-12-6",
            "zero_shot_classification",
            ZeroShotClassificationInferenceOptions,
            NlpRobertaTokenizationConfig,
            1024,
            None,
        ),
    ]
 class TestModelConfguration:
    """Checks the model configuration produced for each listed model."""

    @pytest.mark.parametrize(
        "model_id,task_type,config_type,tokenizer_type,max_sequence_len,embedding_size",
        MODEL_CONFIGURATIONS,
    )
    def test_text_prediction(
        self,
        model_id,
        task_type,
        config_type,
        tokenizer_type,
        max_sequence_len,
        embedding_size,
    ):
        """Save the model to a temporary directory and verify its config.

        The second element returned by ``TransformerModel.save`` is checked
        for model type, input field names, inference-config class,
        tokenization class and max sequence length, followed by the
        task-specific expectations.
        """
        with tempfile.TemporaryDirectory() as work_dir:
            print("loading model " + model_id)
            transformer = TransformerModel(
                model_id=model_id,
                task_type=task_type,
                es_version=ES_VERSION,
                quantize=False,
            )
            _first, model_config, _third = transformer.save(work_dir)

            # Checks that apply to every task type.
            assert model_config.model_type == "pytorch"
            assert model_config.input.field_names == ["text_field"]
            assert isinstance(model_config.inference_config, config_type)
            tokenization = model_config.inference_config.tokenization
            assert isinstance(tokenization, tokenizer_type)
            assert tokenization.max_sequence_length == max_sequence_len

            # Task-specific checks; task_type holds exactly one value, so
            # at most one branch applies.
            if task_type in ("text_classification", "zero_shot_classification"):
                labels = model_config.inference_config.classification_labels
                assert isinstance(labels, list)
                assert len(labels) > 0
            elif task_type == "text_embedding":
                assert model_config.inference_config.embedding_size == embedding_size
            elif task_type == "question_answering":
                assert tokenization.truncate == "none"
                assert tokenization.span > 0
--- a/tests/ml/pytorch/test_transformer_pytorch_model_pytest.py
+++ b/tests/ml/pytorch/test_transformer_pytorch_model_pytest.py
@ -24,13 +24,6 @@ import numpy as np
 import pytest
 from elasticsearch import NotFoundError
 try:
    import sklearn  # noqa: F401
    HAS_SKLEARN = True
 except ImportError:
    HAS_SKLEARN = False
 try:
    import torch  # noqa: F401
    from torch import Tensor, nn  # noqa: F401
@ -67,9 +60,6 @@ pytestmark = [
        ES_VERSION < (8, 0, 0),
        reason="This test requires at least Elasticsearch version 8.0.0",
    ),
    pytest.mark.skipif(
        not HAS_SKLEARN, reason="This test requires 'scikit-learn' package to run"
    ),
    pytest.mark.skipif(
        not HAS_PYTORCH, reason="This test requires 'pytorch' package to run"
    ),