diff --git a/flaml/ml.py b/flaml/ml.py
index 8f1e88470..15b1d0ba0 100644
--- a/flaml/ml.py
+++ b/flaml/ml.py
@@ -157,6 +157,32 @@ def get_y_pred(estimator, X, eval_metric, obj):
     return y_pred
 
 
+def _eval_estimator(config, estimator, X_train, y_train, X_test, y_test, weight_test,
+                    groups_test, eval_metric, obj, labels=None,
+                    log_training_metric=False, fit_kwargs={}):
+    if isinstance(eval_metric, str):
+        pred_start = time.time()
+        test_pred_y = get_y_pred(estimator, X_test, eval_metric, obj)
+        pred_time = (time.time() - pred_start) / X_test.shape[0]
+        test_loss = sklearn_metric_loss_score(eval_metric, test_pred_y, y_test,
+                                              labels, weight_test, groups_test)
+        metric_for_logging = {}
+        if log_training_metric:
+            train_pred_y = get_y_pred(estimator, X_train, eval_metric, obj)
+            metric_for_logging['train_loss'] = sklearn_metric_loss_score(
+                eval_metric, train_pred_y, y_train, labels,
+                fit_kwargs.get('sample_weight'), fit_kwargs.get('groups'))
+    else:  # customized metric function
+        test_loss, metric_for_logging = eval_metric(
+            X_test, y_test, estimator, labels, X_train, y_train, weight_test,
+            fit_kwargs.get('sample_weight'), config, groups_test,
+            fit_kwargs.get('groups'))
+        if isinstance(metric_for_logging, dict):
+            pred_time = metric_for_logging.get('pred_time', 0)
+        test_pred_y = None  # eval_metric may return test_pred_y but not necessarily. Setting None for now.
+    return test_loss, metric_for_logging, pred_time, test_pred_y
+
+
 def get_test_loss(config, estimator, X_train, y_train, X_test, y_test, weight_test,
                   groups_test, eval_metric, obj, labels=None,
                   budget=None, log_training_metric=False, fit_kwargs={}):
@@ -167,27 +193,10 @@ def get_test_loss(config, estimator, X_train, y_train, X_test, y_test, weight_te
     #     fit_kwargs['X_val'] = X_test
     #     fit_kwargs['y_val'] = y_test
     estimator.fit(X_train, y_train, budget, **fit_kwargs)
-    if isinstance(eval_metric, str):
-        pred_start = time.time()
-        test_pred_y = get_y_pred(estimator, X_test, eval_metric, obj)
-        pred_time = (time.time() - pred_start) / X_test.shape[0]
-        test_loss = sklearn_metric_loss_score(eval_metric, test_pred_y, y_test,
-                                              labels, weight_test, groups_test)
-        if log_training_metric:
-            test_pred_y = get_y_pred(estimator, X_train, eval_metric, obj)
-            metric_for_logging = sklearn_metric_loss_score(
-                eval_metric, test_pred_y, y_train, labels,
-                fit_kwargs.get('sample_weight'), fit_kwargs.get('groups'))
-        else:
-            metric_for_logging = None
-    else:  # customized metric function
-        test_loss, metrics = eval_metric(
-            X_test, y_test, estimator, labels, X_train, y_train, weight_test,
-            fit_kwargs.get('sample_weight'), config, groups_test,
-            fit_kwargs.get('groups'))
-        if isinstance(metrics, dict):
-            pred_time = metrics.get('pred_time', 0)
-        metric_for_logging = metrics
+    test_loss, metric_for_logging, pred_time, _ = _eval_estimator(config, estimator,
+                                                                  X_train, y_train, X_test, y_test,
+                                                                  weight_test, groups_test, eval_metric, obj,
+                                                                  labels, log_training_metric, fit_kwargs)
     train_time = time.time() - start
     return test_loss, metric_for_logging, train_time, pred_time
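
Note on the custom-metric contract implied by the patch: the `else` branch of `_eval_estimator` calls the user-supplied `eval_metric` with eleven positional arguments and expects `(test_loss, metric_for_logging)` back, where a dict return may carry an optional `'pred_time'` entry. Below is a minimal sketch of a conforming metric; the name `custom_logloss` and the use of scikit-learn's `log_loss` are illustrative assumptions, not part of this patch.

    import time
    from sklearn.metrics import log_loss

    def custom_logloss(X_test, y_test, estimator, labels, X_train, y_train,
                       weight_test, weight_train, config, groups_test, groups_train):
        # Positional order mirrors the call site in _eval_estimator above.
        pred_start = time.time()
        y_prob = estimator.predict_proba(X_test)
        pred_time = (time.time() - pred_start) / X_test.shape[0]
        test_loss = log_loss(y_test, y_prob, labels=labels, sample_weight=weight_test)
        # Returning a dict lets _eval_estimator read the optional 'pred_time'
        # entry via metric_for_logging.get('pred_time', 0).
        return test_loss, {'pred_time': pred_time}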