Mirror of https://github.com/microsoft/FLAML.git (synced 2026-02-09 02:09:16 +08:00)
Fix transformers API compatibility: support v4.26+ and v5.0+ with version-aware parameter selection (#1514)
* Initial plan

* Fix transformers API compatibility issues

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>

* Add backward compatibility for transformers v4.26+ by version check

Support both the tokenizer (v4.26-4.43) and processing_class (v4.44+) parameters based on the installed transformers version. Fall back to tokenizer if the version check fails.

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>

* Improve exception handling specificity

Use specific exception types (ImportError, AttributeError, ValueError) instead of a broad Exception catch for better error handling.

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>

* Run pre-commit formatting on all files

Applied black formatting to fix code style across the repository.

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>
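A minimal standalone sketch of the version-aware selection the message describes (the helper name is illustrative and not part of the commit; the 4.44.0 threshold and the exception list are the commit's):

def trainer_tokenizer_kwargs(tokenizer):
    """Return the Trainer keyword carrying the tokenizer for this install."""
    try:
        import transformers
        from packaging import version

        if version.parse(transformers.__version__) >= version.parse("4.44.0"):
            return {"processing_class": tokenizer}  # transformers >= 4.44.0
        return {"tokenizer": tokenizer}  # transformers 4.26-4.43
    except (ImportError, AttributeError, ValueError):
        # Fall back to the legacy keyword if the version check itself fails.
        return {"tokenizer": tokenizer}

The call site can then stay identical across API generations, e.g. TrainerForAuto(args=..., **trainer_tokenizer_kwargs(tokenizer)).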
@@ -1196,16 +1196,31 @@ class TransformersEstimator(BaseEstimator):
                     control.should_save = True
                     control.should_evaluate = True
 
-        self._trainer = TrainerForAuto(
-            args=self._training_args,
-            model_init=self._model_init,
-            train_dataset=train_dataset,
-            eval_dataset=eval_dataset,
-            tokenizer=self.tokenizer,
-            data_collator=self.data_collator,
-            compute_metrics=self._compute_metrics_by_dataset_name,
-            callbacks=[EarlyStoppingCallbackForAuto],
-        )
+        # Use processing_class for transformers >= 4.44.0, tokenizer for older versions
+        trainer_kwargs = {
+            "args": self._training_args,
+            "model_init": self._model_init,
+            "train_dataset": train_dataset,
+            "eval_dataset": eval_dataset,
+            "data_collator": self.data_collator,
+            "compute_metrics": self._compute_metrics_by_dataset_name,
+            "callbacks": [EarlyStoppingCallbackForAuto],
+        }
+
+        # Check if processing_class parameter is supported (transformers >= 4.44.0)
+        try:
+            import transformers
+            from packaging import version
+
+            if version.parse(transformers.__version__) >= version.parse("4.44.0"):
+                trainer_kwargs["processing_class"] = self.tokenizer
+            else:
+                trainer_kwargs["tokenizer"] = self.tokenizer
+        except (ImportError, AttributeError, ValueError):
+            # Fallback to tokenizer if version check fails
+            trainer_kwargs["tokenizer"] = self.tokenizer
+
+        self._trainer = TrainerForAuto(**trainer_kwargs)
 
         if self._task in NLG_TASKS:
             setattr(self._trainer, "_is_seq2seq", True)
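For comparison only, and not what the commit does: the same decision can be made by probing the installed Trainer signature rather than parsing a version string. A sketch, assuming transformers is importable:

import inspect

from transformers import Trainer


def supports_processing_class() -> bool:
    """True if this transformers build accepts Trainer(processing_class=...)."""
    try:
        return "processing_class" in inspect.signature(Trainer.__init__).parameters
    except (TypeError, ValueError):
        # inspect.signature can fail on unusual callables; assume the legacy API.
        return False

A probe like this tracks the actual API surface instead of a release number, at the cost of importing Trainer eagerly.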
@@ -211,29 +211,28 @@ def tokenize_onedataframe(
     hf_args=None,
     prefix_str=None,
 ):
-    with tokenizer.as_target_tokenizer():
-        _, tokenized_column_names = tokenize_row(
-            dict(X.iloc[0]),
-            tokenizer,
-            prefix=(prefix_str,) if task is SUMMARIZATION else None,
-            task=task,
-            hf_args=hf_args,
-            return_column_name=True,
-        )
-        d = X.apply(
-            lambda x: tokenize_row(
-                x,
-                tokenizer,
-                prefix=(prefix_str,) if task is SUMMARIZATION else None,
-                task=task,
-                hf_args=hf_args,
-            ),
-            axis=1,
-            result_type="expand",
-        )
-        X_tokenized = pd.DataFrame(columns=tokenized_column_names)
-        X_tokenized[tokenized_column_names] = d
-        return X_tokenized
+    _, tokenized_column_names = tokenize_row(
+        dict(X.iloc[0]),
+        tokenizer,
+        prefix=(prefix_str,) if task is SUMMARIZATION else None,
+        task=task,
+        hf_args=hf_args,
+        return_column_name=True,
+    )
+    d = X.apply(
+        lambda x: tokenize_row(
+            x,
+            tokenizer,
+            prefix=(prefix_str,) if task is SUMMARIZATION else None,
+            task=task,
+            hf_args=hf_args,
+        ),
+        axis=1,
+        result_type="expand",
+    )
+    X_tokenized = pd.DataFrame(columns=tokenized_column_names)
+    X_tokenized[tokenized_column_names] = d
+    return X_tokenized
 
 
 def tokenize_row(
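The hunk above reflects a second API shift addressed by this commit: newer transformers releases deprecated the as_target_tokenizer() context manager in favor of a text_target argument on the tokenizer call, which is presumably why the wrapper is dropped here and the body dedented. A minimal sketch of the shift (the model name and example strings are placeholders):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("t5-small")

# Deprecated style, removed in newer releases:
#     with tok.as_target_tokenizer():
#         labels = tok("a target summary", truncation=True)

# Current style: tokenize target text via text_target.
labels = tok(text_target="a target summary", truncation=True)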