diff --git a/flaml/automl/model.py b/flaml/automl/model.py
index 65ff77199..b74137cf6 100644
--- a/flaml/automl/model.py
+++ b/flaml/automl/model.py
@@ -1196,16 +1196,31 @@ class TransformersEstimator(BaseEstimator):
                     control.should_save = True
                     control.should_evaluate = True
 
-        self._trainer = TrainerForAuto(
-            args=self._training_args,
-            model_init=self._model_init,
-            train_dataset=train_dataset,
-            eval_dataset=eval_dataset,
-            tokenizer=self.tokenizer,
-            data_collator=self.data_collator,
-            compute_metrics=self._compute_metrics_by_dataset_name,
-            callbacks=[EarlyStoppingCallbackForAuto],
-        )
+        # Use processing_class for transformers >= 4.44.0, tokenizer for older versions
+        trainer_kwargs = {
+            "args": self._training_args,
+            "model_init": self._model_init,
+            "train_dataset": train_dataset,
+            "eval_dataset": eval_dataset,
+            "data_collator": self.data_collator,
+            "compute_metrics": self._compute_metrics_by_dataset_name,
+            "callbacks": [EarlyStoppingCallbackForAuto],
+        }
+
+        # Check if processing_class parameter is supported (transformers >= 4.44.0)
+        try:
+            import transformers
+            from packaging import version
+
+            if version.parse(transformers.__version__) >= version.parse("4.44.0"):
+                trainer_kwargs["processing_class"] = self.tokenizer
+            else:
+                trainer_kwargs["tokenizer"] = self.tokenizer
+        except (ImportError, AttributeError, ValueError):
+            # Fallback to tokenizer if version check fails
+            trainer_kwargs["tokenizer"] = self.tokenizer
+
+        self._trainer = TrainerForAuto(**trainer_kwargs)
 
         if self._task in NLG_TASKS:
             setattr(self._trainer, "_is_seq2seq", True)
diff --git a/flaml/automl/nlp/huggingface/utils.py b/flaml/automl/nlp/huggingface/utils.py
index 17e601a8f..0d04cfa29 100644
--- a/flaml/automl/nlp/huggingface/utils.py
+++ b/flaml/automl/nlp/huggingface/utils.py
@@ -211,29 +211,28 @@ def tokenize_onedataframe(
     hf_args=None,
     prefix_str=None,
 ):
-    with tokenizer.as_target_tokenizer():
-        _, tokenized_column_names = tokenize_row(
-            dict(X.iloc[0]),
+    _, tokenized_column_names = tokenize_row(
+        dict(X.iloc[0]),
+        tokenizer,
+        prefix=(prefix_str,) if task is SUMMARIZATION else None,
+        task=task,
+        hf_args=hf_args,
+        return_column_name=True,
+    )
+    d = X.apply(
+        lambda x: tokenize_row(
+            x,
             tokenizer,
             prefix=(prefix_str,) if task is SUMMARIZATION else None,
             task=task,
             hf_args=hf_args,
-            return_column_name=True,
-        )
-        d = X.apply(
-            lambda x: tokenize_row(
-                x,
-                tokenizer,
-                prefix=(prefix_str,) if task is SUMMARIZATION else None,
-                task=task,
-                hf_args=hf_args,
-            ),
-            axis=1,
-            result_type="expand",
-        )
-        X_tokenized = pd.DataFrame(columns=tokenized_column_names)
-        X_tokenized[tokenized_column_names] = d
-        return X_tokenized
+        ),
+        axis=1,
+        result_type="expand",
+    )
+    X_tokenized = pd.DataFrame(columns=tokenized_column_names)
+    X_tokenized[tokenized_column_names] = d
+    return X_tokenized
 
 
 def tokenize_row(
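The first hunk builds the `Trainer` arguments as a dict and picks the keyword for the tokenizer at runtime, since newer `transformers` releases deprecate `tokenizer=` in favor of `processing_class=` in `Trainer.__init__`. Below is a minimal standalone sketch of that same version-gated selection, assuming only that `transformers` and `packaging` are installed; the `build_trainer_kwargs` helper name is illustrative and is not part of FLAML or the patch.

```python
import transformers
from packaging import version


def build_trainer_kwargs(tokenizer, **common_kwargs):
    """Pass the tokenizer under whichever keyword the installed
    transformers version expects (hypothetical helper, not FLAML API)."""
    kwargs = dict(common_kwargs)
    try:
        # Mirror the patch: treat 4.44.0 as the cutoff for processing_class support.
        supports_processing_class = version.parse(
            transformers.__version__
        ) >= version.parse("4.44.0")
    except (AttributeError, ValueError):
        # If the version string cannot be determined or parsed, keep the legacy keyword.
        supports_processing_class = False
    if supports_processing_class:
        kwargs["processing_class"] = tokenizer
    else:
        kwargs["tokenizer"] = tokenizer
    return kwargs
```

A call site would then look like `TrainerForAuto(**build_trainer_kwargs(tokenizer, args=training_args, train_dataset=train_dataset, ...))`. The second hunk is a related compatibility change: it drops the `tokenizer.as_target_tokenizer()` context manager, which recent `transformers` releases deprecate in favor of passing targets directly (e.g. via the `text_target=` argument), so `tokenize_onedataframe` calls `tokenize_row` without wrapping it.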