diff --git a/.gitignore b/.gitignore
index c3452c202..bf37299eb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -156,3 +156,7 @@
 automl.pkl
 .idea/*
 .DS_Store
+
+test/nlp/testtmp.py
+test/nlp/testtmpfl.py
+
diff --git a/flaml/automl.py b/flaml/automl.py
index 1571ece49..c68c7e0fa 100644
--- a/flaml/automl.py
+++ b/flaml/automl.py
@@ -246,11 +246,6 @@ class AutoMLState:
                 * sample_size
                 / state.data_size[0]
             )
-            # raise Exception("bbbbb", state.time_budget, budget)
-
-            if _is_nlp_task(state.task):
-                state.fit_kwargs["X_val"] = state.X_val
-                state.fit_kwargs["y_val"] = state.y_val

         (
             trained_estimator,
@@ -344,7 +339,7 @@ class AutoMLState:
             estimator_class=self.learner_classes.get(estimator),
             budget=budget,
             fit_kwargs=self.fit_kwargs,
-            eval_metric="train_time",
+            eval_metric=self.metric if hasattr(self, "metric") else "train_time",
         )

         if sampled_weight is not None:
@@ -699,6 +694,16 @@ class AutoML(BaseEstimator):
         """Time taken to find best model in seconds."""
         return self.__dict__.get("_time_taken_best_iter")

+    def score(self, X: pd.DataFrame, y: pd.Series, **kwargs):
+        estimator = getattr(self, "_trained_estimator", None)
+        if estimator is None:
+            logger.warning(
+                "No estimator is trained. Please run fit with enough budget."
+            )
+            return None
+        X = self._preprocess(X)
+        return estimator.score(X, y, **kwargs)
+
     def predict(
         self,
         X: Union[np.array, pd.DataFrame, List[str], List[List[str]]],
@@ -1259,7 +1264,7 @@ class AutoML(BaseEstimator):
             record_id: An integer of the record ID in the file,
                 0 corresponds to the first trial.
             task: A string of the task type,
-                'binary', 'multi', 'regression', 'ts_forecast', 'rank'.
+                'binary', 'multiclass', 'regression', 'ts_forecast', 'rank'.

         Returns:
             An estimator object for the given configuration.
@@ -1645,8 +1650,12 @@ class AutoML(BaseEstimator):
         estimator_to_training_function = {}
         for estimator in self.estimator_list:
             search_state = self._search_states[estimator]
-            estimator_to_training_function[estimator] = search_state.training_function
-            del search_state.training_function
+            if hasattr(search_state, "training_function"):
+                estimator_to_training_function[
+                    estimator
+                ] = search_state.training_function
+                del search_state.training_function
+
         with open(output_file_name, "wb") as f:
             pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)

@@ -1781,7 +1790,7 @@ class AutoML(BaseEstimator):
                 'f1', 'micro_f1', 'macro_f1', 'log_loss', 'mae', 'mse', 'r2',
                 'mape'. Default is 'auto'.
                 If passing a customized metric function, the function needs to
-                have the follwing signature:
+                have the following signature:
                 ```python
                 def custom_metric(
                     X_test, y_test, estimator, labels,
@@ -2114,7 +2123,7 @@ class AutoML(BaseEstimator):
                 metric = load_default_huggingface_metric_for_task(self._state.task)
             elif "binary" in self._state.task:
                 metric = "roc_auc"
-            elif "multi" in self._state.task:
+            elif "multiclass" in self._state.task:
                 metric = "log_loss"
             elif self._state.task in TS_FORECAST:
                 metric = "mape"
@@ -2838,7 +2847,7 @@ class AutoML(BaseEstimator):
         estimators = []
         if self._ensemble and self._state.task in (
             "binary",
-            "multi",
+            "multiclass",
             "regression",
         ):
             search_states = list(
diff --git a/flaml/data.py b/flaml/data.py
index 90162ff2f..be7a067e6 100644
--- a/flaml/data.py
+++ b/flaml/data.py
@@ -18,7 +18,7 @@ MULTICHOICECLASSIFICATION = "multichoice-classification"
 TOKENCLASSIFICATION = "token-classification"
 CLASSIFICATION = (
     "binary",
-    "multi",
+    "multiclass",
     "classification",
     SEQCLASSIFICATION,
     MULTICHOICECLASSIFICATION,
diff --git a/flaml/default/suggest.py b/flaml/default/suggest.py
index ec7a71dfe..cd244a4e2 100644
--- a/flaml/default/suggest.py
+++ b/flaml/default/suggest.py
@@ -25,7 +25,7 @@ def load_config_predictor(estimator_name, task, location=None):
     predictor = CONFIG_PREDICTORS.get(key)
     if predictor:
         return predictor
-    task = "multiclass" if task == "multi" else task
+    task = "multiclass" if task == "multi" else task  # TODO: multi -> multiclass?
     try:
         location = location or LOCATION
         with open(f"{location}/{estimator_name}/{task}.json", "r") as f:
diff --git a/flaml/ml.py b/flaml/ml.py
index 5e5c82d06..25c7b123d 100644
--- a/flaml/ml.py
+++ b/flaml/ml.py
@@ -219,6 +219,13 @@ def is_in_sklearn_metric_name_set(metric_name):
     return metric_name.startswith("ndcg") or metric_name in sklearn_metric_name_set


+def is_min_metric(metric_name):
+    return (
+        metric_name in ["rmse", "mae", "mse", "log_loss", "mape"]
+        or huggingface_metric_to_mode.get(metric_name, None) == "min"
+    )
+
+
 def sklearn_metric_loss_score(
     metric_name,
     y_predict,
@@ -565,6 +572,8 @@ def compute_estimator(

     if isinstance(estimator, TransformersEstimator):
         fit_kwargs["metric"] = eval_metric
+        fit_kwargs["X_val"] = X_val
+        fit_kwargs["y_val"] = y_val

     if "holdout" == eval_method:
         val_loss, metric_for_logging, train_time, pred_time = get_val_loss(
@@ -633,7 +642,7 @@ def get_classification_objective(num_labels: int) -> str:
     if num_labels == 2:
         objective_name = "binary"
     else:
-        objective_name = "multi"
+        objective_name = "multiclass"
     return objective_name


diff --git a/flaml/model.py b/flaml/model.py
index d0067ddf3..7c4b8e861 100644
--- a/flaml/model.py
+++ b/flaml/model.py
@@ -88,7 +88,9 @@ class BaseEstimator:

         Args:
             task: A string of the task type, one of
-                'binary', 'multi', 'regression', 'rank', 'forecast'.
+                'binary', 'multiclass', 'regression', 'rank', 'seq-classification',
+                'seq-regression', 'token-classification', 'multichoice-classification',
+                'summarization', 'ts_forecast', 'ts_forecast_classification'.
             config: A dictionary containing the hyperparameter names, 'n_jobs' as keys.
                 n_jobs is the number of parallel threads.
         """
@@ -234,6 +236,56 @@ class BaseEstimator:
         X = self._preprocess(X)
         return self._model.predict_proba(X)

+    def score(self, X_val: DataFrame, y_val: Series, **kwargs):
+        """Report the evaluation score of a trained estimator.
+
+
+        Args:
+            X_val: A pandas dataframe of the validation input data.
+            y_val: A pandas series of the validation label.
+            kwargs: keyword argument of the evaluation function, for example:
+                - metric: A string of the metric name or a function
+                    e.g., 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo',
+                    'f1', 'micro_f1', 'macro_f1', 'log_loss', 'mae', 'mse', 'r2',
+                    'mape'. Default is 'auto'.
+                    If metric is given, the score will report the user specified metric.
+                    If metric is not given, the metric is set to accuracy for classification
+                    and r2 for regression.
+                    You can also pass a customized metric function, for examples on how to pass a
+                    customized metric function, please check
+                    [test/nlp/test_autohf_custom_metric.py](https://github.com/microsoft/FLAML/blob/main/test/nlp/test_autohf_custom_metric.py) and
+                    [test/automl/test_multiclass.py](https://github.com/microsoft/FLAML/blob/main/test/automl/test_multiclass.py).
+
+        ```
+
+        Returns:
+            The evaluation score on the validation dataset.
+        """
+        from .ml import metric_loss_score
+        from .ml import is_min_metric
+
+        if self._model is not None:
+            if self._task == "rank":
+                raise NotImplementedError(
+                    "AutoML.score() is not implemented for ranking"
+                )
+            else:
+                X_val = self._preprocess(X_val)
+                metric = kwargs.get("metric", None)
+                if metric:
+                    y_pred = self.predict(X_val, **kwargs)
+                    if is_min_metric(metric):
+                        return metric_loss_score(metric, y_pred, y_val)
+                    else:
+                        return 1.0 - metric_loss_score(metric, y_pred, y_val)
+                else:
+                    return self._model.score(X_val, y_val, **kwargs)
+        else:
+            logger.warning(
+                "Estimator is not fit yet. Please run fit() before score()."
+            )
+            return 0.0
+
     def cleanup(self):
         del self._model
         self._model = None

@@ -244,7 +296,7 @@ class BaseEstimator:

         Args:
             data_size: A tuple of two integers, number of rows and columns.
-            task: A str of the task type, e.g., "binary", "multi", "regression".
+            task: A str of the task type, e.g., "binary", "multiclass", "regression".

         Returns:
             A dictionary of the search space.
@@ -518,7 +570,6 @@ class TransformersEstimator(BaseEstimator):
             else self.hf_args.model_path,
             self._task,
         )
-
         self._metric = kwargs["metric"]

         try:
@@ -720,15 +771,11 @@ class TransformersEstimator(BaseEstimator):
             metric_dict["automl_metric"] = loss
         return metric_dict

-    def _init_model_for_predict(self, X_test):
-        from datasets import Dataset
+    def _init_model_for_predict(self):
         from .nlp.huggingface.trainer import TrainerForAuto
         from .nlp.huggingface.data_collator import DataCollatorForPredict
         from .nlp.utils import load_model

-        X_test, _ = self._preprocess(X_test, **self._kwargs)
-        test_dataset = Dataset.from_pandas(X_test)
-
         this_model = load_model(
             checkpoint_path=self._checkpoint_path,
             task=self._task,
@@ -750,25 +797,56 @@ class TransformersEstimator(BaseEstimator):
         )
         if self._task in NLG_TASKS:
             setattr(new_trainer, "_is_seq2seq", True)
-        return new_trainer, test_dataset, training_args
+        return new_trainer, training_args

     def predict_proba(self, X, **kwargs):
+        from datasets import Dataset
+
         self._update_hf_args(kwargs)
         assert (
             self._task in CLASSIFICATION
         ), "predict_proba() only for classification tasks."
-        new_trainer, test_dataset, _ = self._init_model_for_predict(X)
+        X_test, _ = self._preprocess(X, **self._kwargs)
+        test_dataset = Dataset.from_pandas(X_test)
+
+        new_trainer, _ = self._init_model_for_predict()
         predictions = new_trainer.predict(test_dataset)
         return predictions.predictions

+    def score(self, X_val: DataFrame, y_val: Series, **kwargs):
+        import transformers
+        from datasets import Dataset
+
+        transformers.logging.set_verbosity_error()
+
+        self._metric = kwargs["metric"]
+
+        if (self._task not in NLG_TASKS) and (self._task != TOKENCLASSIFICATION):
+            self._X_val, _ = self._preprocess(X=X_val)
+            self._y_val = y_val
+        else:
+            self._X_val, self._y_val = self._preprocess(X=X_val, y=y_val)
+
+        eval_dataset = Dataset.from_pandas(
+            TransformersEstimator._join(self._X_val, self._y_val)
+        )
+
+        new_trainer, training_args = self._init_model_for_predict()
+        return new_trainer.evaluate(eval_dataset)
+
     def predict(self, X, **kwargs):
         import transformers
+        from datasets import Dataset

         transformers.logging.set_verbosity_error()
         self._update_hf_args(kwargs)
-        new_trainer, test_dataset, training_args = self._init_model_for_predict(X)
+
+        X_test, _ = self._preprocess(X, **self._kwargs)
+        test_dataset = Dataset.from_pandas(X_test)
+
+        new_trainer, training_args = self._init_model_for_predict()

         if self._task not in NLG_TASKS:
             predictions = new_trainer.predict(test_dataset)
@@ -1677,6 +1755,17 @@ class Prophet(SKLearnEstimator):
             )
         return np.ones(X.shape[0])

+    def score(self, X_val: DataFrame, y_val: Series, **kwargs):
+        from sklearn.metrics import r2_score
+        from .ml import metric_loss_score
+
+        y_pred = self.predict(X_val)
+        self._metric = kwargs.get("metric", None)
+        if self._metric:
+            return metric_loss_score(self._metric, y_pred, y_val)
+        else:
+            return r2_score(y_val, y_pred)
+

 class ARIMA(Prophet):
     """The class for tuning ARIMA."""
diff --git a/notebook/automl_classification.ipynb b/notebook/automl_classification.ipynb
index 650786ee1..955b5f16c 100644
--- a/notebook/automl_classification.ipynb
+++ b/notebook/automl_classification.ipynb
@@ -128,9 +128,9 @@
     "Requirement already satisfied: parso<0.9.0,>=0.8.0 in /usr/local/lib/python3.9/site-packages (from jedi>=0.16->ipython>=7.23.1->ipykernel->jupyter->flaml[notebook]) (0.8.2)\n",
     "Requirement already satisfied: attrs>=17.4.0 in /usr/local/lib/python3.9/site-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.2.0->ipywidgets->jupyter->flaml[notebook]) (21.2.0)\n",
     "Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /usr/local/lib/python3.9/site-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.2.0->ipywidgets->jupyter->flaml[notebook]) (0.18.0)\n",
-    "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\n",
-    "\u001b[33mWARNING: You are using pip version 21.3; however, version 21.3.1 is available.\n",
-    "You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n"
+    "\u001B[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001B[0m\n",
+    "\u001B[33mWARNING: You are using pip version 21.3; however, version 21.3.1 is available.\n",
+    "You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.\u001B[0m\n"
     ]
    }
   ],
@@ -863,7 +863,7 @@
    "        \n",
    "    Args:\n",
    "        task: A string of the task type, one of\n",
-   "            'binary', 'multi', 'regression'\n",
+   "            'binary', 'multiclass', 'regression'\n",
    "        config: A dictionary containing the hyperparameter names\n",
    "        and 'n_jobs' as keys. n_jobs is the number of parallel threads.\n",
    "    '''\n",
@@ -1283,4 +1283,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 2
-}
+}
\ No newline at end of file
diff --git a/test/automl/test_multiclass.py b/test/automl/test_multiclass.py
index 7dbdfda0e..b7996f999 100644
--- a/test/automl/test_multiclass.py
+++ b/test/automl/test_multiclass.py
@@ -203,7 +203,7 @@ class TestMultiClass(unittest.TestCase):
         print(automl_experiment.best_estimator)
         automl_experiment = AutoML()
         estimator = automl_experiment.get_estimator_from_log(
-            automl_settings["log_file_name"], record_id=0, task="multi"
+            automl_settings["log_file_name"], record_id=0, task="multiclass"
         )
         print(estimator)
         (
diff --git a/test/automl/test_score.py b/test/automl/test_score.py
new file mode 100644
index 000000000..2e62a833c
--- /dev/null
+++ b/test/automl/test_score.py
@@ -0,0 +1,218 @@
+from flaml import AutoML
+import pandas as pd
+from sklearn.datasets import fetch_california_housing, fetch_openml
+
+
+class TestScore:
+    def test_forecast(self, budget=5):
+        import pickle
+
+        # using dataframe
+        import statsmodels.api as sm
+
+        data = sm.datasets.co2.load_pandas().data["co2"].resample("MS").mean()
+        data = (
+            data.fillna(data.bfill())
+            .to_frame()
+            .reset_index()
+            .rename(columns={"index": "ds", "co2": "y"})
+        )
+        num_samples = data.shape[0]
+        time_horizon = 12
+        split_idx = num_samples - time_horizon
+        X_test = data[split_idx:]["ds"]
+        y_test = data[split_idx:]["y"]
+
+        df = data[:split_idx]
+        automl = AutoML()
+        settings = {
+            "time_budget": budget,  # total running time in seconds
+            "metric": "mape",  # primary metric
+            "task": "ts_forecast",  # task type
+            "log_file_name": "test/CO2_forecast.log",  # flaml log file
+            "eval_method": "holdout",
+            "label": "y",
+        }
+        """The main flaml automl API"""
+        try:
+            import prophet
+
+            automl.fit(
+                dataframe=df,
+                estimator_list=["prophet", "arima", "sarimax"],
+                **settings,
+                period=time_horizon,
+            )
+            automl.score(X_test, y_test)
+            automl.pickle("automl.pkl")
+            with open("automl.pkl", "rb") as f:
+                pickle.load(f)
+        except ImportError:
+            print("not using prophet due to ImportError")
+            automl.fit(
+                dataframe=df,
+                **settings,
+                estimator_list=["arima", "sarimax"],
+                period=time_horizon,
+            )
+            automl.score(X_test, y_test)
+            automl.pickle("automl.pkl")
+            with open("automl.pkl", "rb") as f:
+                pickle.load(f)
+
+    def test_classification(self):
+        X = pd.DataFrame(
+            {
+                "f1": [1, -2, 3, -4, 5, -6, -7, 8, -9, -10, -11, -12, -13, -14],
+                "f2": [
+                    3.0,
+                    16.0,
+                    10.0,
+                    12.0,
+                    3.0,
+                    14.0,
+                    11.0,
+                    12.0,
+                    5.0,
+                    14.0,
+                    20.0,
+                    16.0,
+                    15.0,
+                    11.0,
+                ],
+                "f3": [
+                    "a",
+                    "b",
+                    "a",
+                    "c",
+                    "c",
+                    "b",
+                    "b",
+                    "b",
+                    "b",
+                    "a",
+                    "b",
+                    1.0,
+                    1.0,
+                    "a",
+                ],
+                "f4": [
+                    True,
+                    True,
+                    False,
+                    True,
+                    True,
+                    False,
+                    False,
+                    False,
+                    True,
+                    True,
+                    False,
+                    False,
+                    True,
+                    True,
+                ],
+            }
+        )
+        y = pd.Series([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1])
+
+        automl = AutoML()
+        for each_estimator in [
+            "catboost",
+            "lrl2",
+ "lrl1", + "rf", + "lgbm", + "extra_tree", + "kneighbor", + "xgboost", + ]: + automl_settings = { + "time_budget": 6, + "task": "classification", + "n_jobs": 1, + "estimator_list": [each_estimator], + "metric": "accuracy", + "log_training_metric": True, + } + automl.score(X, y) # for covering the case no estimator is trained + + automl.fit(X, y, **automl_settings) + automl.score(X, y) + automl.score(X, y, **{"metric": "accuracy"}) + + automl.pickle("automl.pkl") + + def test_regression(self): + automl_experiment = AutoML() + + X_train, y_train = fetch_california_housing(return_X_y=True) + n = int(len(y_train) * 9 // 10) + + for each_estimator in [ + "lgbm", + "xgboost", + "rf", + "extra_tree", + "catboost", + "kneighbor", + ]: + automl_settings = { + "time_budget": 2, + "task": "regression", + "log_file_name": "test/california.log", + "log_training_metric": True, + "estimator_list": [each_estimator], + "n_jobs": 1, + "model_history": True, + } + automl_experiment.fit( + X_train=X_train[:n], + y_train=y_train[:n], + X_val=X_train[n:], + y_val=y_train[n:], + **automl_settings, + ) + + automl_experiment.score(X_train[n:], y_train[n:], **{"metric": "mse"}) + automl_experiment.pickle("automl.pkl") + + def test_rank(self): + from sklearn.externals._arff import ArffException + + dataset = "credit-g" + + try: + X, y = fetch_openml(name=dataset, return_X_y=True) + y = y.cat.codes + except (ArffException, ValueError): + from sklearn.datasets import load_wine + + X, y = load_wine(return_X_y=True) + + import numpy as np + + automl = AutoML() + n = 500 + + for each_estimator in ["lgbm", "xgboost"]: + automl_settings = { + "time_budget": 2, + "task": "rank", + "log_file_name": "test/{}.log".format(dataset), + "model_history": True, + "groups": np.array([0] * 200 + [1] * 200 + [2] * 100), # group labels + "learner_selector": "roundrobin", + "estimator_list": [each_estimator], + } + automl.fit(X[:n], y[:n], **automl_settings) + try: + automl.score(X[n:], y[n:]) + automl.pickle("automl.pkl") + except NotImplementedError: + pass + + +if __name__ == "__main__": + test = TestScore() + test.test_forecast() diff --git a/test/nlp/test_autohf.py b/test/nlp/test_autohf.py index ba05024ff..b4f76aa1e 100644 --- a/test/nlp/test_autohf.py +++ b/test/nlp/test_autohf.py @@ -102,6 +102,8 @@ def test_hf_data(): y_val=y_val, **automl_settings ) + automl.score(X_val, y_val, **{"metric": "accuracy"}) + automl.pickle("automl.pkl") except requests.exceptions.HTTPError: return @@ -113,10 +115,6 @@ def test_hf_data(): record_id=0, **automl_settings ) - with open("automl.pkl", "wb") as f: - pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL) - with open("automl.pkl", "rb") as f: - automl = pickle.load(f) automl.predict(X_test) automl.predict(["test test", "test test"]) automl.predict( @@ -183,8 +181,6 @@ def _test_custom_data(): ] ) - import pickle - automl.pickle("automl.pkl") with open("automl.pkl", "rb") as f: diff --git a/test/nlp/test_autohf_custom_metric.py b/test/nlp/test_autohf_custom_metric.py index e81ffc78c..0d3afe991 100644 --- a/test/nlp/test_autohf_custom_metric.py +++ b/test/nlp/test_autohf_custom_metric.py @@ -19,7 +19,7 @@ def custom_metric( from flaml.model import TransformersEstimator if estimator._trainer is None: - trainer, _, _ = estimator._init_model_for_predict(X_test) + trainer, _ = estimator._init_model_for_predict() estimator._trainer = None else: trainer = estimator._trainer @@ -93,6 +93,14 @@ def test_custom_metric(): # testing when max_iter=1 and do retrain only without hpo + try: + import ray + + if 
not ray.is_initialized(): + ray.init() + except ImportError: + return + automl_settings = { "gpu_per_trial": 0, "max_iter": 1, @@ -100,6 +108,7 @@ def test_custom_metric(): "task": "seq-classification", "metric": custom_metric, "log_file_name": "seqclass.log", + "use_ray": {"local_dir": "data/outut/"}, } automl_settings["hf_args"] = { @@ -126,6 +135,8 @@ def test_custom_metric(): automl.fit( X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings ) + automl.score(X_val, y_val, **{"metric": custom_metric}) + automl.pickle("automl.pkl") del automl
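
Below is a minimal usage sketch of the `AutoML.score()` API introduced in this diff, modeled on the new `test/automl/test_score.py`. The iris dataset, the 10-second budget, and the single-estimator list are illustrative stand-ins, not part of the change itself.

```python
# Minimal sketch of the AutoML.score() API added in this diff, based on
# test/automl/test_score.py. Dataset and settings below are illustrative.
from flaml import AutoML
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True, as_frame=True)

automl = AutoML()
automl.fit(
    X_train=X,
    y_train=y,
    task="classification",
    metric="accuracy",
    time_budget=10,
    estimator_list=["lgbm"],
)

# With no metric argument, the trained estimator's default scorer is used
# (accuracy for classification, r2 for regression).
print(automl.score(X, y))

# With an explicit metric: minimized metrics (e.g. 'log_loss', 'mse') return the
# loss itself, while maximized metrics return 1 - loss.
print(automl.score(X, y, metric="accuracy"))
```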