From 7209f617732c9d754415d56e0e78753802d64792 Mon Sep 17 00:00:00 2001 From: Josh Devins Date: Thu, 11 Nov 2021 13:18:55 +0100 Subject: [PATCH] Adds max_length padding to transformer tracing (#411) The padding parameter needs to be set on the tokenization call and not in the constructor. Furthermore, the True value only pads to the largest input in a batch; however, we don't trace with batches, so this value had no effect. The proper place to pass this parameter is in the tokenization call itself, and the proper value to use is "max_length", which will pad the input to the maximum input size specified by the model. Although we measure no functional or performance impact of this setting, it has been suggested that this is a best practice. See: https://huggingface.co/transformers/serialization.html#dummy-inputs-and-standard-lengths --- eland/ml/pytorch/transformers.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/eland/ml/pytorch/transformers.py b/eland/ml/pytorch/transformers.py index 9143579..dc5f37c 100644 --- a/eland/ml/pytorch/transformers.py +++ b/eland/ml/pytorch/transformers.py @@ -290,6 +290,7 @@ class _TraceableFillMaskModel(_TraceableModel): return self._tokenizer( "Who was Jim Henson?", "[MASK] Henson was a puppeteer", + padding="max_length", return_tensors="pt", ) @@ -301,6 +302,7 @@ class _TraceableNerModel(_TraceableClassificationModel): "Hugging Face Inc. is a company based in New York City. " "Its headquarters are in DUMBO, therefore very close to the Manhattan Bridge."
), + padding="max_length", return_tensors="pt", ) @@ -309,6 +311,7 @@ class _TraceableTextClassificationModel(_TraceableClassificationModel): def _prepare_inputs(self) -> transformers.BatchEncoding: return self._tokenizer( "This is an example sentence.", + padding="max_length", return_tensors="pt", ) @@ -317,6 +320,7 @@ class _TraceableTextEmbeddingModel(_TraceableModel): def _prepare_inputs(self) -> transformers.BatchEncoding: return self._tokenizer( "This is an example sentence.", + padding="max_length", return_tensors="pt", ) @@ -326,8 +330,8 @@ class _TraceableZeroShotClassificationModel(_TraceableClassificationModel): return self._tokenizer( "This is an example sentence.", "This example is an example.", + padding="max_length", return_tensors="pt", - truncation_strategy="only_first", ) @@ -337,10 +341,11 @@ class TransformerModel: self._task_type = task_type.replace("-", "_") # load Hugging Face model and tokenizer - # use padding in the tokenizer to ensure max length sequences are used for tracing + # use padding in the tokenizer to ensure max length sequences are used for tracing (at call time) # - see: https://huggingface.co/transformers/serialization.html#dummy-inputs-and-standard-lengths self._tokenizer = transformers.AutoTokenizer.from_pretrained( - self._model_id, padding=True, use_fast=False + self._model_id, + use_fast=False, ) # check for a supported tokenizer