diff --git a/flaml/automl.py b/flaml/automl.py
index db5f9d1c1..47b9169e9 100644
--- a/flaml/automl.py
+++ b/flaml/automl.py
@@ -2366,7 +2366,17 @@ class AutoML(BaseEstimator):
         if mlflow is not None and mlflow.active_run():
             with mlflow.start_run(nested=True):
                 mlflow.log_metric("iter_counter", self._iter_per_learner[estimator])
-                mlflow.log_param("metric_for_logging", search_state.metric_for_logging)
+                if "intermediate_results" in search_state.metric_for_logging:
+                    for each_entry in search_state.metric_for_logging[
+                        "intermediate_results"
+                    ]:
+                        with mlflow.start_run(nested=True):
+                            mlflow.log_metrics(each_entry)
+                            mlflow.log_metric(
+                                "iter_counter", self._iter_per_learner[estimator]
+                            )
+                    del search_state.metric_for_logging["intermediate_results"]
+                mlflow.log_metrics(search_state.metric_for_logging)
                 mlflow.log_metric("trial_time", search_state.trial_time)
                 mlflow.log_metric("wall_clock_time", self._state.time_from_start)
                 mlflow.log_metric("validation_loss", search_state.val_loss)
diff --git a/flaml/ml.py b/flaml/ml.py
index 0655d5cc8..5564c5d79 100644
--- a/flaml/ml.py
+++ b/flaml/ml.py
@@ -94,11 +94,19 @@ huggingface_submetric_to_metric = {"rouge1": "rouge", "rouge2": "rouge"}
 def get_estimator_class(task, estimator_name):
     # when adding a new learner, need to add an elif branch
     if "xgboost" == estimator_name:
-        estimator_class = XGBoost_TS_Regressor if TS_FORECAST == task else XGBoostSklearnEstimator
+        estimator_class = (
+            XGBoost_TS_Regressor if TS_FORECAST == task else XGBoostSklearnEstimator
+        )
     elif "xgb_limitdepth" == estimator_name:
-        estimator_class = XGBoostLimitDepth_TS_Regressor if TS_FORECAST == task else XGBoostLimitDepthEstimator
+        estimator_class = (
+            XGBoostLimitDepth_TS_Regressor
+            if TS_FORECAST == task
+            else XGBoostLimitDepthEstimator
+        )
     elif "rf" == estimator_name:
-        estimator_class = RF_TS_Regressor if TS_FORECAST == task else RandomForestEstimator
+        estimator_class = (
+            RF_TS_Regressor if TS_FORECAST == task else RandomForestEstimator
+        )
     elif "lgbm" == estimator_name:
         estimator_class = LGBM_TS_Regressor if TS_FORECAST == task else LGBMEstimator
     elif "lrl1" == estimator_name:
@@ -108,7 +116,9 @@ def get_estimator_class(task, estimator_name):
     elif "catboost" == estimator_name:
         estimator_class = CatBoostEstimator
     elif "extra_tree" == estimator_name:
-        estimator_class = ExtraTrees_TS_Regressor if TS_FORECAST == task else ExtraTreesEstimator
+        estimator_class = (
+            ExtraTrees_TS_Regressor if TS_FORECAST == task else ExtraTreesEstimator
+        )
     elif "kneighbor" == estimator_name:
         estimator_class = KNeighborsEstimator
     elif "prophet" in estimator_name:
@@ -207,8 +217,10 @@ def metric_loss_score(
                 + ", ".join(huggingface_metric_to_mode.keys())
                 + ". Please pass a customized metric function to AutoML.fit(metric=func)"
             )
-        multiplier = -1 if metric_mode == "max" else 1
-        return score * multiplier
+        if metric_mode == "max":
+            return 1 - score
+        else:
+            return score
 
 
 def is_in_sklearn_metric_name_set(metric_name):
@@ -409,6 +421,8 @@ def get_val_loss(
         log_training_metric,
         fit_kwargs,
     )
+    if hasattr(estimator, "intermediate_results"):
+        metric_for_logging["intermediate_results"] = estimator.intermediate_results
     train_time = time.time() - start
     return val_loss, metric_for_logging, train_time, pred_time
 
diff --git a/flaml/model.py b/flaml/model.py
index 71896f11a..045ba11f8 100644
--- a/flaml/model.py
+++ b/flaml/model.py
@@ -675,6 +675,9 @@ class TransformersEstimator(BaseEstimator):
             X_train=self._X_train,
             y_train=self._y_train,
         )
+        if not hasattr(self, "intermediate_results"):
+            self.intermediate_results = []
+        self.intermediate_results.append(metric_dict)
         return metric_dict
 
     def _init_model_for_predict(self, X_test):
@@ -702,6 +705,7 @@ class TransformersEstimator(BaseEstimator):
             )
             if self._task == MULTICHOICECLASSIFICATION
             else None,
+            compute_metrics=self._compute_metrics_by_dataset_name,
         )
         return test_dataset, training_args
 
diff --git a/test/nlp/run_gpu.py b/test/nlp/run_gpu.py
index 8551dd70b..1574c7fc9 100644
--- a/test/nlp/run_gpu.py
+++ b/test/nlp/run_gpu.py
@@ -11,9 +11,9 @@ def _test_hf_data():
     from datasets import load_dataset
 
     try:
-        train_dataset = load_dataset("glue", "mrpc", split="train").to_pandas()
-        dev_dataset = load_dataset("glue", "mrpc", split="validation").to_pandas()
-        test_dataset = load_dataset("glue", "mrpc", split="test").to_pandas()
+        train_dataset = load_dataset("glue", "mrpc", split="train[:1%]").to_pandas()
+        dev_dataset = load_dataset("glue", "mrpc", split="validation[:1%]").to_pandas()
+        test_dataset = load_dataset("glue", "mrpc", split="test[:1%]").to_pandas()
     except requests.exceptions.ConnectionError:
         return
 
@@ -32,7 +32,7 @@ def _test_hf_data():
 
     automl_settings = {
         "gpu_per_trial": 1,
-        "max_iter": 5,
+        "max_iter": 2,
         "time_budget": 5000,
         "task": "seq-classification",
         "metric": "accuracy",
diff --git a/test/nlp/test_autohf_custom_metric.py b/test/nlp/test_autohf_custom_metric.py
index 6df95b943..63461ea87 100644
--- a/test/nlp/test_autohf_custom_metric.py
+++ b/test/nlp/test_autohf_custom_metric.py
@@ -17,6 +17,7 @@ def custom_metric(
 ):
     from datasets import Dataset
     from flaml.model import TransformersEstimator
+    from flaml.nlp.utils import load_default_huggingface_metric_for_task
 
     if estimator._trainer is None:
         estimator._init_model_for_predict(X_test)
@@ -31,12 +32,13 @@ def custom_metric(
 
     X_test, _ = estimator._preprocess(X_test)
     eval_dataset = Dataset.from_pandas(X_test)
 
-    trainer_compute_metrics_cache = trainer.compute_metrics
-    trainer.compute_metrics = None
+    estimator_metric_cache = estimator._metric
+    estimator._metric = load_default_huggingface_metric_for_task(estimator._task)
     metrics = trainer.evaluate(eval_dataset)
-    trainer.compute_metrics = trainer_compute_metrics_cache
-    return metrics["eval_loss"], metrics
+    estimator._metric = estimator_metric_cache
+
+    return metrics["eval_val_loss"], metrics
 
 
 @pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os")
diff --git a/test/nlp/test_autohf_regression.py b/test/nlp/test_autohf_regression.py
index a4b4877db..7edc01751 100644
--- a/test/nlp/test_autohf_regression.py
+++ b/test/nlp/test_autohf_regression.py
@@ -38,7 +38,7 @@ def test_regression():
         "max_iter": 2,
         "time_budget": 5,
         "task": "seq-regression",
-        "metric": "rmse",
+        "metric": "pearsonr",
         "starting_points": {"transformer": {"num_train_epochs": 1}},
         "use_ray": True,
     }