Mirror of https://github.com/microsoft/FLAML.git (synced 2026-02-09 02:09:16 +08:00)
Fix transformers API compatibility: support v4.26+ and v5.0+ with version-aware parameter selection (#1514)
* Initial plan

* Fix transformers API compatibility issues

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>

* Add backward compatibility for transformers v4.26+ by version check

Support both the tokenizer (v4.26-4.43) and processing_class (v4.44+) parameters based on the installed transformers version. Fall back to tokenizer if the version check fails.

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>

* Improve exception handling specificity

Use specific exception types (ImportError, AttributeError, ValueError) instead of a broad Exception catch for better error handling.

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>

* Run pre-commit formatting on all files

Applied black formatting to fix code style across the repository.

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>
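A minimal standalone sketch of the version-aware selection the message describes (the helper name is illustrative and not part of the commit; the 4.44.0 threshold and the exception list are the commit's):

def trainer_tokenizer_kwargs(tokenizer):
    """Return the Trainer keyword carrying the tokenizer for this install."""
    try:
        import transformers
        from packaging import version

        if version.parse(transformers.__version__) >= version.parse("4.44.0"):
            return {"processing_class": tokenizer}  # transformers >= 4.44.0
        return {"tokenizer": tokenizer}  # transformers 4.26-4.43
    except (ImportError, AttributeError, ValueError):
        # Fall back to the legacy keyword if the version check itself fails.
        return {"tokenizer": tokenizer}

The call site can then stay identical across API generations, e.g. TrainerForAuto(args=..., **trainer_tokenizer_kwargs(tokenizer)).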
@@ -1196,16 +1196,31 @@ class TransformersEstimator(BaseEstimator):
                     control.should_save = True
                     control.should_evaluate = True
 
-        self._trainer = TrainerForAuto(
-            args=self._training_args,
-            model_init=self._model_init,
-            train_dataset=train_dataset,
-            eval_dataset=eval_dataset,
-            tokenizer=self.tokenizer,
-            data_collator=self.data_collator,
-            compute_metrics=self._compute_metrics_by_dataset_name,
-            callbacks=[EarlyStoppingCallbackForAuto],
-        )
+        # Use processing_class for transformers >= 4.44.0, tokenizer for older versions
+        trainer_kwargs = {
+            "args": self._training_args,
+            "model_init": self._model_init,
+            "train_dataset": train_dataset,
+            "eval_dataset": eval_dataset,
+            "data_collator": self.data_collator,
+            "compute_metrics": self._compute_metrics_by_dataset_name,
+            "callbacks": [EarlyStoppingCallbackForAuto],
+        }
+
+        # Check if processing_class parameter is supported (transformers >= 4.44.0)
+        try:
+            import transformers
+            from packaging import version
+
+            if version.parse(transformers.__version__) >= version.parse("4.44.0"):
+                trainer_kwargs["processing_class"] = self.tokenizer
+            else:
+                trainer_kwargs["tokenizer"] = self.tokenizer
+        except (ImportError, AttributeError, ValueError):
+            # Fallback to tokenizer if version check fails
+            trainer_kwargs["tokenizer"] = self.tokenizer
+
+        self._trainer = TrainerForAuto(**trainer_kwargs)
 
         if self._task in NLG_TASKS:
             setattr(self._trainer, "_is_seq2seq", True)
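For comparison only, and not what the commit does: the same decision can be made by probing the installed Trainer signature rather than parsing a version string. A sketch, assuming transformers is importable:

import inspect

from transformers import Trainer


def supports_processing_class() -> bool:
    """True if this transformers build accepts Trainer(processing_class=...)."""
    try:
        return "processing_class" in inspect.signature(Trainer.__init__).parameters
    except (TypeError, ValueError):
        # inspect.signature can fail on unusual callables; assume the legacy API.
        return False

A probe like this tracks the actual API surface instead of a release number, at the cost of importing Trainer eagerly.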
@@ -211,29 +211,28 @@ def tokenize_onedataframe(
     hf_args=None,
     prefix_str=None,
 ):
-    with tokenizer.as_target_tokenizer():
-        _, tokenized_column_names = tokenize_row(
-            dict(X.iloc[0]),
-            tokenizer,
-            prefix=(prefix_str,) if task is SUMMARIZATION else None,
-            task=task,
-            hf_args=hf_args,
-            return_column_name=True,
-        )
-        d = X.apply(
-            lambda x: tokenize_row(
-                x,
-                tokenizer,
-                prefix=(prefix_str,) if task is SUMMARIZATION else None,
-                task=task,
-                hf_args=hf_args,
-            ),
-            axis=1,
-            result_type="expand",
-        )
-        X_tokenized = pd.DataFrame(columns=tokenized_column_names)
-        X_tokenized[tokenized_column_names] = d
-        return X_tokenized
+    _, tokenized_column_names = tokenize_row(
+        dict(X.iloc[0]),
+        tokenizer,
+        prefix=(prefix_str,) if task is SUMMARIZATION else None,
+        task=task,
+        hf_args=hf_args,
+        return_column_name=True,
+    )
+    d = X.apply(
+        lambda x: tokenize_row(
+            x,
+            tokenizer,
+            prefix=(prefix_str,) if task is SUMMARIZATION else None,
+            task=task,
+            hf_args=hf_args,
+        ),
+        axis=1,
+        result_type="expand",
+    )
+    X_tokenized = pd.DataFrame(columns=tokenized_column_names)
+    X_tokenized[tokenized_column_names] = d
+    return X_tokenized
 
 
 def tokenize_row(
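The hunk above reflects a second API shift addressed by this commit: newer transformers releases deprecated the as_target_tokenizer() context manager in favor of a text_target argument on the tokenizer call, which is presumably why the wrapper is dropped here and the body dedented. A minimal sketch of the shift (the model name and example strings are placeholders):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("t5-small")

# Deprecated style, removed in newer releases:
#     with tok.as_target_tokenizer():
#         labels = tok("a target summary", truncation=True)

# Current style: tokenize target text via text_target.
labels = tok(text_target="a target summary", truncation=True)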