Mirror of https://github.com/microsoft/FLAML.git
Fix transformers API compatibility: support v4.26+ and v5.0+ with version-aware parameter selection (#1514)
* Initial plan

* Fix transformers API compatibility issues

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>

* Add backward compatibility for transformers v4.26+ via a version check

Support both the tokenizer (v4.26-4.43) and processing_class (v4.44+) parameters based on the installed transformers version. Fall back to tokenizer if the version check fails.

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>

* Improve exception handling specificity

Use specific exception types (ImportError, AttributeError, ValueError) instead of a broad Exception catch for better error handling.

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>

* Run pre-commit formatting on all files

Applied black formatting to fix code style across the repository.

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>
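The selection in the diff below keys off transformers.__version__. A minimal alternative sketch, not part of this commit: pick the parameter name by inspecting the Trainer.__init__ signature, which sidesteps version-string parsing entirely (plain Trainer stands in for FLAML's TrainerForAuto, and tokenizer_kwarg_name is a hypothetical helper):

import inspect

from transformers import Trainer


def tokenizer_kwarg_name() -> str:
    # "processing_class" supersedes "tokenizer" in newer transformers
    # (>= 4.44.0 per this commit's version gate); checking the signature
    # works even for dev builds and needs no `packaging` dependency.
    params = inspect.signature(Trainer.__init__).parameters
    return "processing_class" if "processing_class" in params else "tokenizer"


# Hypothetical usage: trainer_kwargs[tokenizer_kwarg_name()] = tokenizer

The version parse used in the commit is the simpler choice and matches a documented version boundary; signature introspection trades that simplicity for robustness against future renames.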
@@ -1196,16 +1196,31 @@ class TransformersEstimator(BaseEstimator):
                 control.should_save = True
                 control.should_evaluate = True
 
-        self._trainer = TrainerForAuto(
-            args=self._training_args,
-            model_init=self._model_init,
-            train_dataset=train_dataset,
-            eval_dataset=eval_dataset,
-            tokenizer=self.tokenizer,
-            data_collator=self.data_collator,
-            compute_metrics=self._compute_metrics_by_dataset_name,
-            callbacks=[EarlyStoppingCallbackForAuto],
-        )
+        # Use processing_class for transformers >= 4.44.0, tokenizer for older versions
+        trainer_kwargs = {
+            "args": self._training_args,
+            "model_init": self._model_init,
+            "train_dataset": train_dataset,
+            "eval_dataset": eval_dataset,
+            "data_collator": self.data_collator,
+            "compute_metrics": self._compute_metrics_by_dataset_name,
+            "callbacks": [EarlyStoppingCallbackForAuto],
+        }
+
+        # Check if processing_class parameter is supported (transformers >= 4.44.0)
+        try:
+            import transformers
+            from packaging import version
+
+            if version.parse(transformers.__version__) >= version.parse("4.44.0"):
+                trainer_kwargs["processing_class"] = self.tokenizer
+            else:
+                trainer_kwargs["tokenizer"] = self.tokenizer
+        except (ImportError, AttributeError, ValueError):
+            # Fallback to tokenizer if version check fails
+            trainer_kwargs["tokenizer"] = self.tokenizer
+
+        self._trainer = TrainerForAuto(**trainer_kwargs)
 
         if self._task in NLG_TASKS:
             setattr(self._trainer, "_is_seq2seq", True)
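Two properties of the guarded comparison above are worth noting: PEP 440 pre-releases sort below their final release, and packaging.version.InvalidVersion subclasses ValueError, which is why ValueError appears in the except clause. A quick standalone check ("4.44.0.dev0" is a hypothetical build string):

from packaging import version

# Pre-releases sort below the release, so a 4.44 dev build would still
# take the tokenizer branch:
assert version.parse("4.44.0.dev0") < version.parse("4.44.0")
assert version.parse("5.0.0") >= version.parse("4.44.0")

# InvalidVersion subclasses ValueError, so the except clause also
# covers malformed version strings:
try:
    version.parse("not-a-version")
except ValueError:
    pass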
@@ -211,7 +211,6 @@ def tokenize_onedataframe(
     hf_args=None,
     prefix_str=None,
 ):
-    with tokenizer.as_target_tokenizer():
-        _, tokenized_column_names = tokenize_row(
-            dict(X.iloc[0]),
-            tokenizer,
+    _, tokenized_column_names = tokenize_row(
+        dict(X.iloc[0]),
+        tokenizer,
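The second hunk drops tokenizer.as_target_tokenizer(): the context manager was deprecated during the 4.x series (superseded by the text_target argument to the tokenizer call) and is removed in transformers v5, so the commit calls tokenize_row without it. A minimal compatibility sketch for code that must still run on versions where the context manager matters (target_tokenizer_context is a hypothetical helper, not FLAML code):

import contextlib


def target_tokenizer_context(tokenizer):
    # Use as_target_tokenizer() where it still exists (older 4.x
    # releases); otherwise a no-op context (v5+, where it was removed).
    if hasattr(tokenizer, "as_target_tokenizer"):
        return tokenizer.as_target_tokenizer()
    return contextlib.nullcontext()


# Hypothetical usage:
# with target_tokenizer_context(tokenizer):
#     labels = tokenizer(target_texts)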