[ML] Add a new "auto" task type that attempts to automatically determine the NLP task type from the model config (#475)

For many model types, we don't need to require the user to specify the task type: it can be inferred from the model's configuration and architecture.

This commit makes the `task-type` parameter optional for the model upload script and adds logic for auto-detecting the task type from the 🤗 model configuration.
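For illustration (a sketch, not part of this commit: the model id is arbitrary, and the call downloads the config from the Hugging Face Hub), the new detection helper can also be invoked directly:

from transformers import AutoConfig

from eland.ml.pytorch import task_type_from_model_config

# bert-base-uncased publishes architectures=["BertForMaskedLM"]; the
# "MaskedLM" substring yields the candidates {fill_mask, text_embedding},
# and since the model id is not under "sentence-transformers/", the helper
# settles on "fill_mask".
config = AutoConfig.from_pretrained("bert-base-uncased")
print(task_type_from_model_config(config))  # fill_mask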
Benjamin Trent, 2022-06-23 08:32:23 -04:00 (committed by GitHub)
parent 8448b3ba4e
commit 8892f4fd64
4 changed files with 127 additions and 6 deletions

(file 1 of 4)

@@ -84,9 +84,11 @@ def get_arg_parser():
     )
     parser.add_argument(
         "--task-type",
-        required=True,
+        required=False,
         choices=SUPPORTED_TASK_TYPES,
-        help="The task type for the model usage.",
+        help="The task type for the model usage. Will attempt to auto-detect task type for the model if not provided. "
+        "Default: auto",
+        default="auto"
     )
     parser.add_argument(
         "--quantize",
@@ -165,7 +167,11 @@ if __name__ == "__main__":
     try:
         from eland.ml.pytorch import PyTorchModel
-        from eland.ml.pytorch.transformers import SUPPORTED_TASK_TYPES, TransformerModel
+        from eland.ml.pytorch.transformers import (
+            SUPPORTED_TASK_TYPES,
+            TaskTypeError,
+            TransformerModel,
+        )
     except ModuleNotFoundError as e:
         logger.error(textwrap.dedent(f"""\
             \033[31mFailed to run because module '{e.name}' is not available.\033[0m
@@ -187,8 +193,12 @@ if __name__ == "__main__":
     with tempfile.TemporaryDirectory() as tmp_dir:
         logger.info(f"Loading HuggingFace transformer tokenizer and model '{args.hub_model_id}'")
-        tm = TransformerModel(args.hub_model_id, args.task_type, args.quantize)
-        model_path, config, vocab_path = tm.save(tmp_dir)
+        try:
+            tm = TransformerModel(args.hub_model_id, args.task_type, args.quantize)
+            model_path, config, vocab_path = tm.save(tmp_dir)
+        except TaskTypeError as err:
+            logger.error(f"Failed to get model for task type, please provide valid task type via '--task-type' parameter. Caused by {err}")
+            exit(1)

     ptm = PyTorchModel(es, args.es_model_id if args.es_model_id else tm.elasticsearch_model_id())
     model_exists = es.options(ignore_status=404).ml.get_trained_models(model_id=ptm.model_id).meta.status == 200
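When detection fails, the user hits the error path above. A standalone sketch (hypothetical model id and architecture, mirroring the test data at the end of this diff) of the underlying None result:

from transformers import PretrainedConfig

from eland.ml.pytorch import task_type_from_model_config

# An architecture matching none of the known substrings yields None, which
# TransformerModel turns into a TaskTypeError and the script into exit(1).
config = PretrainedConfig(
    name_or_path="any_model",
    architectures=["SomeUnknownType"],
)
print(task_type_from_model_config(config))  # None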

(file 2 of 4)

@@ -23,6 +23,7 @@ from eland.ml.pytorch.nlp_ml_model import (
     NlpTrainedModelConfig,
 )
 from eland.ml.pytorch.traceable_model import TraceableModel  # noqa: F401
+from eland.ml.pytorch.transformers import task_type_from_model_config

 __all__ = [
     "PyTorchModel",
@@ -31,4 +32,5 @@ __all__ = [
     "NlpBertTokenizationConfig",
     "NlpRobertaTokenizationConfig",
     "NlpMPNetTokenizationConfig",
+    "task_type_from_model_config",
 ]

(file 3 of 4)

@@ -23,7 +23,7 @@ libraries such as sentence-transformers.
 import json
 import os.path
 from abc import ABC, abstractmethod
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Set, Tuple, Union

 import torch  # type: ignore
 import transformers  # type: ignore
@@ -33,6 +33,7 @@ from transformers import (
     AutoConfig,
     AutoModel,
     AutoModelForQuestionAnswering,
+    PretrainedConfig,
     PreTrainedModel,
     PreTrainedTokenizer,
     PreTrainedTokenizerFast,
@@ -64,6 +65,15 @@ SUPPORTED_TASK_TYPES = {
     "zero_shot_classification",
     "question_answering",
 }
+ARCHITECTURE_TO_TASK_TYPE = {
+    "MaskedLM": ["fill_mask", "text_embedding"],
+    "TokenClassification": ["ner"],
+    "SequenceClassification": ["text_classification", "zero_shot_classification"],
+    "QuestionAnswering": ["question_answering"],
+    "DPRQuestionEncoder": ["text_embedding"],
+    "DPRContextEncoder": ["text_embedding"],
+}
+ZERO_SHOT_LABELS = {"contradiction", "neutral", "entailment"}
 TASK_TYPE_TO_INFERENCE_CONFIG = {
     "fill_mask": FillMaskInferenceOptions,
     "ner": NerInferenceOptions,
@@ -97,6 +107,37 @@ TracedModelTypes = Union[
 ]


+class TaskTypeError(Exception):
+    pass
+
+
+def task_type_from_model_config(model_config: PretrainedConfig) -> Optional[str]:
+    if model_config.architectures is None:
+        if model_config.name_or_path.startswith("sentence-transformers/"):
+            return "text_embedding"
+        return None
+    potential_task_types: Set[str] = set()
+    for architecture in model_config.architectures:
+        for (substr, task_type) in ARCHITECTURE_TO_TASK_TYPE.items():
+            if substr in architecture:
+                for t in task_type:
+                    potential_task_types.add(t)
+    if len(potential_task_types) == 0:
+        return None
+    if len(potential_task_types) > 1:
+        if "zero_shot_classification" in potential_task_types:
+            if model_config.label2id:
+                labels = set([x.lower() for x in model_config.label2id.keys()])
+                if len(labels.difference(ZERO_SHOT_LABELS)) == 0:
+                    return "zero_shot_classification"
+            return "text_classification"
+        if "text_embedding" in potential_task_types:
+            if model_config.name_or_path.startswith("sentence-transformers/"):
+                return "text_embedding"
+            return "fill_mask"
+    return potential_task_types.pop()
+
+
 class _QuestionAnsweringWrapperModule(nn.Module):  # type: ignore
     """
     A wrapper around a question answering model.
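A minimal sketch (hypothetical model ids, not part of the commit) of the trickiest branch, where label names disambiguate SequenceClassification models:

from transformers import PretrainedConfig

from eland.ml.pytorch import task_type_from_model_config

# SequenceClassification architectures are ambiguous between
# text_classification and zero_shot_classification; the label names decide.
# Both id2label and label2id are passed so PretrainedConfig keeps the labels
# instead of generating LABEL_0/LABEL_1 defaults.
nli = PretrainedConfig(
    name_or_path="some/nli-model",  # hypothetical model id
    architectures=["BertForSequenceClassification"],
    id2label={0: "contradiction", 1: "neutral", 2: "entailment"},
    label2id={"contradiction": 0, "neutral": 1, "entailment": 2},
)
sentiment = PretrainedConfig(
    name_or_path="some/sentiment-model",  # hypothetical model id
    architectures=["BertForSequenceClassification"],
    id2label={0: "negative", 1: "positive"},
    label2id={"negative": 0, "positive": 1},
)
print(task_type_from_model_config(nli))        # zero_shot_classification
print(task_type_from_model_config(sentiment))  # text_classification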
@@ -581,6 +622,18 @@ class TransformerModel:
         )

     def _create_traceable_model(self) -> TraceableModel:
+        if self._task_type == "auto":
+            model = transformers.AutoModel.from_pretrained(
+                self._model_id, torchscript=True
+            )
+            maybe_task_type = task_type_from_model_config(model.config)
+            if maybe_task_type is None:
+                raise TaskTypeError(
+                    f"Unable to automatically determine task type for model {self._model_id}, please supply task type: {SUPPORTED_TASK_TYPES_NAMES}"
+                )
+            else:
+                self._task_type = maybe_task_type
+
         if self._task_type == "fill_mask":
             model = transformers.AutoModelForMaskedLM.from_pretrained(
                 self._model_id, torchscript=True
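End to end, loading with the new default looks roughly like this (a sketch: the model id is arbitrary, and constructing TransformerModel downloads and traces the model):

import tempfile

from eland.ml.pytorch.transformers import TaskTypeError, TransformerModel

try:
    # "auto" triggers task_type_from_model_config() while the model is
    # loaded and traced; an unrecognized architecture raises TaskTypeError.
    tm = TransformerModel("bert-base-uncased", "auto")
    with tempfile.TemporaryDirectory() as tmp_dir:
        model_path, config, vocab_path = tm.save(tmp_dir)
except TaskTypeError as err:
    print(f"could not auto-detect task type: {err}")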

(file 4 of 4)

@@ -34,12 +34,14 @@ except ImportError:
 try:
     import torch  # noqa: F401
     from torch import Tensor, nn  # noqa: F401
+    from transformers import PretrainedConfig  # noqa: F401

     from eland.ml.pytorch import (  # noqa: F401
         NlpBertTokenizationConfig,
         NlpTrainedModelConfig,
         PyTorchModel,
         TraceableModel,
+        task_type_from_model_config,
     )
     from eland.ml.pytorch.nlp_ml_model import (
         NerInferenceOptions,
@@ -222,6 +224,41 @@ MODELS_TO_TEST = [
     ),
 ]

+AUTO_TASK_RESULTS = [
+    ("any_bert", "BERTMaskedLM", None, "fill_mask"),
+    ("any_roberta", "RoBERTaMaskedLM", None, "fill_mask"),
+    ("sentence-transformers/any_bert", "BERTMaskedLM", None, "text_embedding"),
+    ("sentence-transformers/any_roberta", "RoBERTaMaskedLM", None, "text_embedding"),
+    ("sentence-transformers/mpnet", "MPNetMaskedLM", None, "text_embedding"),
+    ("anynermodel", "BERTForTokenClassification", None, "ner"),
+    ("anynermodel", "MPNetForTokenClassification", None, "ner"),
+    ("anynermodel", "RoBERTaForTokenClassification", None, "ner"),
+    ("anynermodel", "BERTForQuestionAnswering", None, "question_answering"),
+    ("anynermodel", "MPNetForQuestionAnswering", None, "question_answering"),
+    ("anynermodel", "RoBERTaForQuestionAnswering", None, "question_answering"),
+    ("aqaModel", "DPRQuestionEncoder", None, "text_embedding"),
+    ("aqaModel", "DPRContextEncoder", None, "text_embedding"),
+    (
+        "any_bert",
+        "BERTForSequenceClassification",
+        ["foo", "bar", "baz"],
+        "text_classification",
+    ),
+    (
+        "any_bert",
+        "BERTForSequenceClassification",
+        ["contradiction", "neutral", "entailment"],
+        "zero_shot_classification",
+    ),
+    (
+        "any_bert",
+        "BERTForSequenceClassification",
+        ["CONTRADICTION", "NEUTRAL", "ENTAILMENT"],
+        "zero_shot_classification",
+    ),
+    ("any_bert", "SomeUnknownType", None, None),
+]
+

 @pytest.fixture(scope="function", autouse=True)
 def setup_and_tear_down():
@@ -274,3 +311,22 @@ class TestPytorchModelUpload:
         result = ptm.infer(docs=[{"text_field": input}])
         assert result.get("predicted_value") is not None
         assert result["predicted_value"] == prediction
+
+    @pytest.mark.parametrize(
+        "model_id,architecture,labels,expected_task", AUTO_TASK_RESULTS
+    )
+    def test_auto_task_type(self, model_id, architecture, labels, expected_task):
+        config = (
+            PretrainedConfig(
+                name_or_path=model_id,
+                architectures=[architecture],
+                label2id=dict(zip(labels, range(len(labels)))),
+                id2label=dict(zip(range(len(labels)), labels)),
+            )
+            if labels
+            else PretrainedConfig(
+                name_or_path=model_id,
+                architectures=[architecture],
+            )
+        )
+        assert task_type_from_model_config(model_config=config) == expected_task