From 868e7dd1ca4782d044acb857d183ba651dbcbd15 Mon Sep 17 00:00:00 2001 From: Chi Wang Date: Thu, 21 Sep 2023 23:55:00 -0700 Subject: [PATCH] support xgboost 2.0 (#1219) * support xgboost 2.0 * try classes_ * test version * quote * use_label_encoder * Fix xgboost test error * remove deprecated files * remove deprecated files * remove deprecated import * replace deprecated import in integrate_spark.ipynb * replace deprecated import in automl_lightgbm.ipynb * formatted integrate_spark.ipynb * replace deprecated import * try fix driver python path * Update python-package.yml * replace deprecated reference * move spark python env var to other section * Update setup.py, install xgb<2 for MacOS * Fix typo * assert * Try assert xgboost version * Fail fast * Keep all test/spark to try fail fast * No need to skip spark test in Mac or Win * Remove assert xgb version * Remove fail fast * Found root cause, fix test_sparse_matrix_xgboost * Revert "No need to skip spark test in Mac or Win" This reverts commit a09034817fda738ff2b8ed4bfd7352c02fb453d6. * remove assertion --------- Co-authored-by: Li Jiang Co-authored-by: levscaut <57213911+levscaut@users.noreply.github.com> Co-authored-by: levscaut Co-authored-by: Li Jiang --- .github/workflows/python-package.yml | 4 +- flaml/automl/automl.py | 2 +- flaml/automl/model.py | 23 +- flaml/automl/time_series/ts_model.py | 4 +- flaml/data.py | 9 - flaml/default/estimator.py | 7 +- flaml/model.py | 9 - flaml/version.py | 2 +- notebook/automl_classification.ipynb | 6 +- notebook/automl_flight_delays.ipynb | 381 +------------- notebook/automl_lightgbm.ipynb | 8 +- notebook/automl_nlp.ipynb | 10 +- notebook/automl_time_series_forecast.ipynb | 4 +- notebook/automl_xgboost.ipynb | 14 +- notebook/integrate_azureml.ipynb | 4 +- notebook/integrate_sklearn.ipynb | 14 +- notebook/integrate_spark.ipynb | 494 +++++++++++++++++- notebook/zeroshot_lightgbm.ipynb | 4 +- test/automl/test_classification.py | 5 +- test/automl/test_split.py | 2 +- website/docs/Examples/Default-Flamlized.md | 2 +- .../docs/Use-Cases/Task-Oriented-AutoML.md | 8 +- 22 files changed, 576 insertions(+), 440 deletions(-) delete mode 100644 flaml/data.py delete mode 100644 flaml/model.py diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index bc90024c1..be6863123 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -64,10 +64,12 @@ jobs: if: matrix.os == 'ubuntu-latest' run: | pip install "ray[tune]<2.5.0" - - name: If mac, install ray + - name: If mac, install ray and xgboost 1 if: matrix.os == 'macOS-latest' run: | pip install -e .[ray] + # use macOS to test xgboost 1, but macOS also supports xgboost 2 + pip install "xgboost<2" - name: If linux or mac, install prophet on python < 3.9 if: (matrix.os == 'macOS-latest' || matrix.os == 'ubuntu-latest') && matrix.python-version != '3.9' && matrix.python-version != '3.10' run: | diff --git a/flaml/automl/automl.py b/flaml/automl/automl.py index af4159f90..200970802 100644 --- a/flaml/automl/automl.py +++ b/flaml/automl/automl.py @@ -606,7 +606,7 @@ class AutoML(BaseEstimator): Args: learner_name: A string of the learner's name. - learner_class: A subclass of flaml.model.BaseEstimator. + learner_class: A subclass of flaml.automl.model.BaseEstimator. 
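 Example (a minimal sketch, not part of the patch; it assumes `MyLGBM` is a
 user-defined subclass of `flaml.automl.model.LGBMEstimator`, as defined in
 the Spark notebook later in this patch):

 ```python
 automl = AutoML()
 automl.add_learner(learner_name="my_lgbm", learner_class=MyLGBM)
 # the registered name can then be referenced in estimator_list
 automl.fit(X_train=X_train, y_train=y_train,
            task="regression", time_budget=30,
            estimator_list=["my_lgbm"])
 ```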
""" self._state.learner_classes[learner_name] = learner_class diff --git a/flaml/automl/model.py b/flaml/automl/model.py index 6a0a0aa80..901237ac1 100644 --- a/flaml/automl/model.py +++ b/flaml/automl/model.py @@ -32,6 +32,7 @@ try: from sklearn.ensemble import ExtraTreesRegressor, ExtraTreesClassifier from sklearn.linear_model import LogisticRegression from sklearn.dummy import DummyClassifier, DummyRegressor + from xgboost import __version__ as xgboost_version except ImportError: pass @@ -212,10 +213,10 @@ class BaseEstimator: model = self.estimator_class(**self.params) if logger.level == logging.DEBUG: # xgboost 1.6 doesn't display all the params in the model str - logger.debug(f"flaml.model - {model} fit started with params {self.params}") + logger.debug(f"flaml.automl.model - {model} fit started with params {self.params}") model.fit(X_train, y_train, **kwargs) if logger.level == logging.DEBUG: - logger.debug(f"flaml.model - {model} fit finished") + logger.debug(f"flaml.automl.model - {model} fit finished") train_time = time.time() - current_time self._model = model return train_time @@ -455,10 +456,10 @@ class SparkEstimator(BaseEstimator): current_time = time.time() pipeline_model = self.estimator_class(**self.params, **kwargs) if logger.level == logging.DEBUG: - logger.debug(f"flaml.model - {pipeline_model} fit started with params {self.params}") + logger.debug(f"flaml.automl.model - {pipeline_model} fit started with params {self.params}") pipeline_model.fit(df_train) if logger.level == logging.DEBUG: - logger.debug(f"flaml.model - {pipeline_model} fit finished") + logger.debug(f"flaml.automl.model - {pipeline_model} fit finished") train_time = time.time() - current_time self._model = pipeline_model return train_time @@ -690,12 +691,12 @@ class SparkLGBMEstimator(SparkEstimator): current_time = time.time() model = self.estimator_class(**self.params, **kwargs) if logger.level == logging.DEBUG: - logger.debug(f"flaml.model - {model} fit started with params {self.params}") + logger.debug(f"flaml.automl.model - {model} fit started with params {self.params}") self._model = model.fit(df_train) self._model.classes_ = self.model_classes_ self._model.n_classes_ = self.model_n_classes_ if logger.level == logging.DEBUG: - logger.debug(f"flaml.model - {model} fit finished") + logger.debug(f"flaml.automl.model - {model} fit finished") train_time = time.time() - current_time return train_time @@ -1412,7 +1413,7 @@ class LGBMEstimator(BaseEstimator): callbacks = self.params.pop("callbacks") self._model.set_params(callbacks=callbacks[:-1]) best_iteration = ( - self._model.get_booster().best_iteration + getattr(self._model.get_booster(), "best_iteration", None) if isinstance(self, XGBoostSklearnEstimator) else self._model.best_iteration_ ) @@ -1510,8 +1511,6 @@ class XGBoostEstimator(SKLearnEstimator): # params["booster"] = params.get("booster", "gbtree") # use_label_encoder is deprecated in 1.7. 
- from xgboost import __version__ as xgboost_version - if xgboost_version < "1.7.0": params["use_label_encoder"] = params.get("use_label_encoder", False) if "n_jobs" in config: @@ -1559,7 +1558,7 @@ class XGBoostEstimator(SKLearnEstimator): obj=obj, callbacks=callbacks, ) - self.params["n_estimators"] = self._model.best_iteration + 1 + self.params["n_estimators"] = getattr(self._model, "best_iteration", _n_estimators - 1) + 1 else: self._model = xgb.train(self.params, dtrain, _n_estimators, obj=obj) self.params["n_estimators"] = _n_estimators @@ -1620,7 +1619,9 @@ class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator): if max_depth == 0: params["grow_policy"] = params.get("grow_policy", "lossguide") params["tree_method"] = params.get("tree_method", "hist") - params["use_label_encoder"] = params.get("use_label_encoder", False) + # use_label_encoder is deprecated in 1.7. + if xgboost_version < "1.7.0": + params["use_label_encoder"] = params.get("use_label_encoder", False) return params def __init__( diff --git a/flaml/automl/time_series/ts_model.py b/flaml/automl/time_series/ts_model.py index da1bfcbaf..f507cb7c1 100644 --- a/flaml/automl/time_series/ts_model.py +++ b/flaml/automl/time_series/ts_model.py @@ -22,7 +22,7 @@ except ImportError: import numpy as np from flaml import tune -from flaml.model import ( +from flaml.automl.model import ( suppress_stdout_stderr, SKLearnEstimator, logger, @@ -33,7 +33,7 @@ from flaml.model import ( XGBoostLimitDepthEstimator, CatBoostEstimator, ) -from flaml.data import TS_TIMESTAMP_COL, TS_VALUE_COL +from flaml.automl.data import TS_TIMESTAMP_COL, TS_VALUE_COL from flaml.automl.time_series.ts_data import ( TimeSeriesDataset, enrich_dataset, diff --git a/flaml/data.py b/flaml/data.py deleted file mode 100644 index 522b47fe0..000000000 --- a/flaml/data.py +++ /dev/null @@ -1,9 +0,0 @@ -import warnings - -from flaml.automl.data import * - - -warnings.warn( - "Importing from `flaml.data` is deprecated. Please use `flaml.automl.data`.", - DeprecationWarning, -) diff --git a/flaml/default/estimator.py b/flaml/default/estimator.py index d8aaa989f..5354a7c97 100644 --- a/flaml/default/estimator.py +++ b/flaml/default/estimator.py @@ -105,7 +105,12 @@ def flamlize_estimator(super_class, name: str, task: str, alternatives=None): # if hasattr(self, "_classes"): # self._classes = self._label_transformer.classes_ # else: - self.classes_ = self._label_transformer.classes_ + try: + self.classes_ = self._label_transformer.classes_ + except AttributeError: + # xgboost 2: AttributeError: can't set attribute + if "xgb" not in estimator_name: + raise if "xgb" not in estimator_name: # rf and et would do inverse transform automatically; xgb doesn't self._label_transformer = None diff --git a/flaml/model.py b/flaml/model.py deleted file mode 100644 index b780a67d1..000000000 --- a/flaml/model.py +++ /dev/null @@ -1,9 +0,0 @@ -import warnings - -from flaml.automl.model import * - - -warnings.warn( - "Importing from `flaml.model` is deprecated. 
Please use `flaml.automl.model`.", - DeprecationWarning, -) diff --git a/flaml/version.py b/flaml/version.py index 9aa3f9036..58039f505 100644 --- a/flaml/version.py +++ b/flaml/version.py @@ -1 +1 @@ -__version__ = "2.1.0" +__version__ = "2.1.1" diff --git a/notebook/automl_classification.ipynb b/notebook/automl_classification.ipynb index d143e63d5..0aefd0135 100644 --- a/notebook/automl_classification.ipynb +++ b/notebook/automl_classification.ipynb @@ -80,7 +80,7 @@ ], "source": [ "from minio.error import ServerError\n", - "from flaml.data import load_openml_dataset\n", + "from flaml.automl.data import load_openml_dataset\n", "\n", "try:\n", " X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=1169, data_dir='./')\n", @@ -1252,7 +1252,7 @@ } ], "source": [ - "from flaml.data import get_output_from_log\n", + "from flaml.automl.data import get_output_from_log\n", "time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \\\n", " get_output_from_log(filename=settings['log_file_name'], time_budget=240)\n", "for config in config_history:\n", @@ -1540,7 +1540,7 @@ "outputs": [], "source": [ "''' SKLearnEstimator is the super class for a sklearn learner '''\n", - "from flaml.model import SKLearnEstimator\n", + "from flaml.automl.model import SKLearnEstimator\n", "from flaml import tune\n", "from flaml.automl.task.task import CLASSIFICATION\n", "\n", diff --git a/notebook/automl_flight_delays.ipynb b/notebook/automl_flight_delays.ipynb index 2edd20abb..b30ee1336 100644 --- a/notebook/automl_flight_delays.ipynb +++ b/notebook/automl_flight_delays.ipynb @@ -37,383 +37,20 @@ "\n", "In this notebook, we use one real data example (binary classification) to showcase how to use FLAML library.\n", "\n", - "FLAML requires `Python>=3.7`. To run this notebook example, please install the following packages." + "FLAML requires `Python>=3.8`. To run this notebook example, please install the following packages." 
 ]
 },
 {
 "cell_type": "code",
- "execution_count": 39,
+ "execution_count": null,
 "metadata": {
 "jupyter": {
 "outputs_hidden": true
 }
 },
- "outputs": [
- {
- "data": {
- "application/vnd.livy.statement-meta+json": {
- "execution_finish_time": "2023-04-09T03:11:05.782522Z",
- "execution_start_time": "2023-04-09T03:11:05.7822033Z",
- "livy_statement_state": "available",
- "parent_msg_id": "18b2ee64-09c4-4ceb-8975-e4ed43d7c41a",
- "queued_time": "2023-04-09T03:10:33.571519Z",
- "session_id": "7",
- "session_start_time": null,
- "spark_jobs": null,
- "spark_pool": null,
- "state": "finished",
- "statement_id": -1
- },
- "text/plain": [
- "StatementMeta(, 7, -1, Finished, Available)"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {},
- "execution_count": 39,
- "metadata": {},
- "output_type": "execute_result"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- ... (several hundred lines of pip install/uninstall log output elided) ...
- "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n"
- ]
- },
- {
- "data": {},
- "execution_count": 39,
- 
"metadata": {}, - "output_type": "execute_result" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Warning: PySpark kernel has been restarted to use updated packages.\n", - "\n" - ] - } - ], + "outputs": [], "source": [ - "%pip install flaml[synapse]==1.1.3 xgboost==1.6.1 pandas==1.5.1 numpy==1.23.4 openml --force-reinstall" + "%pip install flaml[automl,synapse] xgboost==1.6.1 pandas==1.5.1 numpy==1.23.4 openml --force-reinstall" ] }, { @@ -480,7 +117,7 @@ } ], "source": [ - "from flaml.data import load_openml_dataset\n", + "from flaml.automl.data import load_openml_dataset\n", "X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=1169, data_dir='./')" ] }, @@ -1389,7 +1026,7 @@ } ], "source": [ - "from flaml.data import get_output_from_log\n", + "from flaml.automl.data import get_output_from_log\n", "time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \\\n", " get_output_from_log(filename=settings['log_file_name'], time_budget=240)\n", "for config in config_history:\n", @@ -1861,7 +1498,7 @@ } ], "source": [ - "!pip install rgf-python " + "%pip install rgf-python " ] }, { @@ -1898,9 +1535,9 @@ ], "source": [ "''' SKLearnEstimator is the super class for a sklearn learner '''\n", - "from flaml.model import SKLearnEstimator\n", + "from flaml.automl.model import SKLearnEstimator\n", "from flaml import tune\n", - "from flaml.data import CLASSIFICATION\n", + "from flaml.automl.data import CLASSIFICATION\n", "\n", "\n", "class MyRegularizedGreedyForest(SKLearnEstimator):\n", diff --git a/notebook/automl_lightgbm.ipynb b/notebook/automl_lightgbm.ipynb index e8c7abe02..b59b1ba57 100644 --- a/notebook/automl_lightgbm.ipynb +++ b/notebook/automl_lightgbm.ipynb @@ -28,7 +28,7 @@ "\n", "In this notebook, we demonstrate how to use FLAML library to tune hyperparameters of LightGBM with a regression example.\n", "\n", - "FLAML requires `Python>=3.7`. To run this notebook example, please install flaml with the `automl` option (this option is introduced from version 2, for version 1 it is installed by default):\n", + "FLAML requires `Python>=3.8`. To run this notebook example, please install flaml with the `automl` option (this option is introduced from version 2, for version 1 it is installed by default):\n", "```bash\n", "pip install flaml[automl]\n", "```" @@ -87,7 +87,7 @@ } ], "source": [ - "from flaml.data import load_openml_dataset\n", + "from flaml.automl.data import load_openml_dataset\n", "X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=537, data_dir='./')" ] }, @@ -509,7 +509,7 @@ } ], "source": [ - "from flaml.data import get_output_from_log\n", + "from flaml.automl.data import get_output_from_log\n", "time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \\\n", " get_output_from_log(filename=settings['log_file_name'], time_budget=60)\n", "\n", @@ -852,7 +852,7 @@ " coef[0] * hess + coef[1] * hess_rmse + coef[2] * hess_mae\n", "\n", "\n", - "from flaml.model import LGBMEstimator\n", + "from flaml.automl.model import LGBMEstimator\n", "\n", "''' create a customized LightGBM learner class with your objective function '''\n", "class MyLGBM(LGBMEstimator):\n", diff --git a/notebook/automl_nlp.ipynb b/notebook/automl_nlp.ipynb index d46d3493f..c1c20ad51 100644 --- a/notebook/automl_nlp.ipynb +++ b/notebook/automl_nlp.ipynb @@ -21,7 +21,7 @@ "\n", "In this notebook, we demonstrate how to use the FLAML library to fine tune an NLP language model with hyperparameter search. 
We will use [flaml.tune](https://microsoft.github.io/FLAML/docs/Use-Cases/Tune-User-Defined-Function) with the built in GPU in colab for the tuning. However, if you have a machine with more than 1 GPU, you can also use FLAML's [parallel tuning](https://microsoft.github.io/FLAML/docs/Use-Cases/Task-Oriented-AutoML#parallel-tuning) with the ray tune option. \n", "\n", - "FLAML requires `Python>=3.7`. To run this notebook example, please install flaml with the `[automl,hf,blendsearch]` option:\n", + "FLAML requires `Python>=3.8`. To run this notebook example, please install flaml with the `[automl,hf,blendsearch]` option:\n", "```bash\n", "pip install flaml[automl,hf,blendsearch]; \n", "```" @@ -2107,7 +2107,7 @@ } ], "source": [ - "from flaml.data import get_output_from_log\n", + "from flaml.automl.data import get_output_from_log\n", "time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \\\n", " get_output_from_log(filename=automl_settings['log_file_name'], time_budget=3000)\n", "for config in config_history:\n", @@ -3460,7 +3460,7 @@ } ], "source": [ - "from flaml.data import get_output_from_log\n", + "from flaml.automl.data import get_output_from_log\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "\n", @@ -4098,7 +4098,7 @@ } ], "source": [ - "from flaml.data import get_output_from_log\n", + "from flaml.automl.data import get_output_from_log\n", "time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \\\n", " get_output_from_log(filename=automl_settings['log_file_name'], time_budget=3000)\n", "for config in config_history:\n", @@ -5136,7 +5136,7 @@ ], "source": [ "\n", - "from flaml.data import get_output_from_log\n", + "from flaml.automl.data import get_output_from_log\n", "time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \\\n", " get_output_from_log(filename=automl_settings['log_file_name'], time_budget=3000)\n", "for config in config_history:\n", diff --git a/notebook/automl_time_series_forecast.ipynb b/notebook/automl_time_series_forecast.ipynb index c7cf3b9b5..b56308ea9 100644 --- a/notebook/automl_time_series_forecast.ipynb +++ b/notebook/automl_time_series_forecast.ipynb @@ -22,7 +22,7 @@ "\n", "In this notebook, we demonstrate how to use FLAML library for time series forecasting tasks: univariate time series forecasting (only time), multivariate time series forecasting (with exogneous variables) and forecasting discrete values.\n", "\n", - "FLAML requires Python>=3.7. To run this notebook example, please install flaml with the [automl,ts_forecast] option:\n" + "FLAML requires Python>=3.8. To run this notebook example, please install flaml with the [automl,ts_forecast] option:\n" ] }, { @@ -1518,7 +1518,7 @@ } ], "source": [ - "from flaml.data import get_output_from_log\n", + "from flaml.automl.data import get_output_from_log\n", "time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \\\n", " get_output_from_log(filename=settings['log_file_name'], time_budget=180)\n", "\n", diff --git a/notebook/automl_xgboost.ipynb b/notebook/automl_xgboost.ipynb index a46e520c2..c84e39db5 100644 --- a/notebook/automl_xgboost.ipynb +++ b/notebook/automl_xgboost.ipynb @@ -28,7 +28,7 @@ "\n", "In this notebook, we demonstrate how to use FLAML library to tune hyperparameters of XGBoost with a regression example.\n", "\n", - "FLAML requires `Python>=3.7`. 
To run this notebook example, please install flaml with the `automl` option (this option is introduced from version 2, for version 1 it is installed by default):\n", + "FLAML requires `Python>=3.8`. To run this notebook example, please install flaml with the `automl` option (this option is introduced from version 2, for version 1 it is installed by default):\n", "```bash\n", "pip install flaml[automl]\n", "```" @@ -44,6 +44,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": { "slideshow": { @@ -87,11 +88,12 @@ } ], "source": [ - "from flaml.data import load_openml_dataset\n", + "from flaml.automl.data import load_openml_dataset\n", "X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=537, data_dir='./')" ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": { "slideshow": { @@ -509,6 +511,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": { "slideshow": { @@ -761,7 +764,7 @@ } ], "source": [ - "from flaml.data import get_output_from_log\n", + "from flaml.automl.data import get_output_from_log\n", "time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \\\n", " get_output_from_log(filename=settings['log_file_name'], time_budget=60)\n", "\n", @@ -804,6 +807,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -832,6 +836,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -922,6 +927,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -1839,7 +1845,7 @@ " return grad, hess\n", "\n", "# create customized XGBoost learners class with your objective function\n", - "from flaml.model import XGBoostEstimator\n", + "from flaml.automl.model import XGBoostEstimator\n", "\n", "\n", "class MyXGB1(XGBoostEstimator):\n", diff --git a/notebook/integrate_azureml.ipynb b/notebook/integrate_azureml.ipynb index 88cb7fe04..45dcdba01 100644 --- a/notebook/integrate_azureml.ipynb +++ b/notebook/integrate_azureml.ipynb @@ -28,7 +28,7 @@ "\n", "In this notebook, we use one real data example (binary classification) to showcase how to use FLAML library together with AzureML.\n", "\n", - "FLAML requires `Python>=3.7`. To run this notebook example, please install flaml with the [automl,azureml] option:\n", + "FLAML requires `Python>=3.8`. To run this notebook example, please install flaml with the [automl,azureml] option:\n", "```bash\n", "pip install flaml[automl,azureml]\n", "```" @@ -88,7 +88,7 @@ }, "outputs": [], "source": [ - "from flaml.data import load_openml_dataset\n", + "from flaml.automl.data import load_openml_dataset\n", "X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=1169, data_dir='./')" ] }, diff --git a/notebook/integrate_sklearn.ipynb b/notebook/integrate_sklearn.ipynb index e124ca995..f7cbd1fc1 100644 --- a/notebook/integrate_sklearn.ipynb +++ b/notebook/integrate_sklearn.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -12,6 +13,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -40,7 +42,7 @@ "\n", "In this notebook, we use one real data example (binary classification) to showcase how to use FLAML library.\n", "\n", - "FLAML requires `Python>=3.7`. To run this notebook example, please install flaml with the `[automl]` option (this option is introduced from version 2, for version 1 it is installed by default):\n", + "FLAML requires `Python>=3.8`. 
To run this notebook example, please install flaml with the `[automl]` option (this option is introduced from version 2, for version 1 it is installed by default):\n", "```bash\n", "pip install flaml[automl]\n", "```" @@ -56,6 +58,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -76,6 +79,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -83,6 +87,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -109,7 +114,7 @@ } ], "source": [ - "from flaml.data import load_openml_dataset\n", + "from flaml.automl.data import load_openml_dataset\n", "X_train, X_test, y_train, y_test = load_openml_dataset(\n", " dataset_id=1169, data_dir='./', random_state=1234, dataset_format='array')" ] @@ -135,6 +140,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -232,6 +238,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -449,7 +456,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 10, @@ -462,6 +469,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ diff --git a/notebook/integrate_spark.ipynb b/notebook/integrate_spark.ipynb index 5423a1ad2..70e6d8539 100644 --- a/notebook/integrate_spark.ipynb +++ b/notebook/integrate_spark.ipynb @@ -1 +1,493 @@ -{"cells":[{"attachments":{},"cell_type":"markdown","metadata":{"slideshow":{"slide_type":"slide"}},"source":["Copyright (c) Microsoft Corporation. All rights reserved. \n","\n","Licensed under the MIT License.\n","\n","# Run FLAML Parallel tuning with Spark\n","\n","\n","## 1. Introduction\n","\n","FLAML is a Python library (https://github.com/microsoft/FLAML) designed to automatically produce accurate machine learning models \n","with low computational cost. It is fast and economical. The simple and lightweight design makes it easy \n","to use and extend, such as adding new learners. FLAML can \n","- serve as an economical AutoML engine,\n","- be used as a fast hyperparameter tuning tool, or \n","- be embedded in self-tuning software that requires low latency & resource in repetitive\n"," tuning tasks.\n","\n","In this notebook, we demonstrate how to run FLAML parallel tuning using Spark as the backend.\n","\n","FLAML requires `Python>=3.7`. To run this notebook example, please install flaml with the following options:\n","```bash\n","pip install flaml[automl,spark,blendsearch]\n","```\n","*Spark support is added in v1.1.0*"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:16:51.6335768Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:17:21.9028602Z\",\"execution_finish_time\":\"2022-12-07T08:18:52.3646576Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}"},"outputs":[],"source":["# %pip install flaml[automl,spark,blendsearch] matplotlib openml"]},{"attachments":{},"cell_type":"markdown","metadata":{"slideshow":{"slide_type":"slide"}},"source":["## 2. Regression Example\n","### Load data and preprocess\n","\n","Download [houses dataset](https://www.openml.org/d/537) from OpenML. 
The task is to predict median price of the house in the region based on demographic composition and a state of housing market in the region."]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:53.4783943Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:20:55.7666047Z\",\"execution_finish_time\":\"2022-12-07T08:21:10.9050139Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","slideshow":{"slide_type":"subslide"},"tags":[]},"outputs":[],"source":["from minio.error import ServerError\n","from flaml.data import load_openml_dataset\n","\n","try:\n"," X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=537, data_dir='./')\n","except (ServerError, Exception):\n"," from sklearn.datasets import fetch_california_housing\n"," from sklearn.model_selection import train_test_split\n","\n"," X, y = fetch_california_housing(return_X_y=True)\n"," X_train, X_test, y_train, y_test = train_test_split(X, y)\n"]},{"attachments":{},"cell_type":"markdown","metadata":{"slideshow":{"slide_type":"slide"}},"source":["### Run FLAML\n","In the FLAML automl run configuration, users can specify the task type, time budget, error metric, learner list, whether to subsample, resampling strategy type, and so on. All these arguments have default values which will be used if users do not provide them. \n","\n","Notice that here `use_spark` is set to `True` in order to use Spark as the parallel training backend."]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:53.7001471Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:21:10.9846131Z\",\"execution_finish_time\":\"2022-12-07T08:21:11.3604062Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","slideshow":{"slide_type":"slide"},"tags":[]},"outputs":[],"source":["''' import AutoML class from flaml package '''\n","from flaml import AutoML\n","automl = AutoML()"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:53.8983341Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:21:11.4417491Z\",\"execution_finish_time\":\"2022-12-07T08:21:11.8242955Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","slideshow":{"slide_type":"slide"}},"outputs":[],"source":["settings = {\n"," \"time_budget\": 30, # total running time in seconds\n"," \"metric\": 'r2', # primary metrics for regression can be chosen from: ['mae','mse','r2','rmse','mape']\n"," \"estimator_list\": ['lgbm'], # list of ML learners; we tune lightgbm in this example\n"," \"task\": 'regression', # task type \n"," \"log_file_name\": 'houses_experiment.log', # flaml log file\n"," \"seed\": 7654321, # random seed\n"," \"use_spark\": True, # whether to use Spark for distributed training\n"," \"n_concurrent_trials\": 2, # the maximum number of concurrent trials\n","}"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:54.3953298Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:21:11.9003975Z\",\"execution_finish_time\":\"2022-12-07T08:27:58.525709Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","slideshow":{"slide_type":"slide"},"tags":[]},"outputs":[],"source":["'''The main flaml automl API'''\n","automl.fit(X_train=X_train, y_train=y_train, 
**settings)"]},{"attachments":{},"cell_type":"markdown","metadata":{"slideshow":{"slide_type":"slide"}},"source":["### Best model and metric"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:54.789647Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:27:58.6014435Z\",\"execution_finish_time\":\"2022-12-07T08:27:58.9745212Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","slideshow":{"slide_type":"slide"},"tags":[]},"outputs":[],"source":["''' retrieve best config'''\n","print('Best hyperparmeter config:', automl.best_config)\n","print('Best r2 on validation data: {0:.4g}'.format(1-automl.best_loss))\n","print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:54.9962623Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:27:59.0491242Z\",\"execution_finish_time\":\"2022-12-07T08:27:59.4076477Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","slideshow":{"slide_type":"slide"}},"outputs":[],"source":["automl.model.estimator"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:55.2539877Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:27:59.5247209Z\",\"execution_finish_time\":\"2022-12-07T08:28:00.4849272Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}"},"outputs":[],"source":["import matplotlib.pyplot as plt\n","plt.barh(automl.feature_names_in_, automl.feature_importances_)"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:55.5182783Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:28:00.5644015Z\",\"execution_finish_time\":\"2022-12-07T08:28:01.5531147Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","slideshow":{"slide_type":"slide"}},"outputs":[],"source":["''' pickle and save the automl object '''\n","import pickle\n","with open('automl.pkl', 'wb') as f:\n"," pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:55.803107Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:28:01.6350567Z\",\"execution_finish_time\":\"2022-12-07T08:28:02.5774117Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","slideshow":{"slide_type":"slide"},"tags":[]},"outputs":[],"source":["''' compute predictions of testing dataset ''' \n","y_pred = automl.predict(X_test)\n","print('Predicted labels', y_pred)\n","print('True labels', y_test)"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:56.0585537Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:28:02.6537337Z\",\"execution_finish_time\":\"2022-12-07T08:28:03.0177805Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","slideshow":{"slide_type":"slide"},"tags":[]},"outputs":[],"source":["''' compute different metric values on testing dataset'''\n","from flaml.ml import sklearn_metric_loss_score\n","print('r2', '=', 1 - sklearn_metric_loss_score('r2', y_pred, y_test))\n","print('mse', '=', sklearn_metric_loss_score('mse', y_pred, y_test))\n","print('mae', '=', 
sklearn_metric_loss_score('mae', y_pred, y_test))"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:56.2226463Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:28:03.1150781Z\",\"execution_finish_time\":\"2022-12-07T08:28:03.4858362Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","slideshow":{"slide_type":"subslide"},"tags":[]},"outputs":[],"source":["from flaml.data import get_output_from_log\n","time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \\\n"," get_output_from_log(filename=settings['log_file_name'], time_budget=60)\n","\n","for config in config_history:\n"," print(config)"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:56.4020235Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:28:03.5811012Z\",\"execution_finish_time\":\"2022-12-07T08:28:04.5493292Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","slideshow":{"slide_type":"slide"}},"outputs":[],"source":["import numpy as np\n","\n","plt.title('Learning Curve')\n","plt.xlabel('Wall Clock Time (s)')\n","plt.ylabel('Validation r2')\n","plt.scatter(time_history, 1 - np.array(valid_loss_history))\n","plt.step(time_history, 1 - np.array(best_valid_loss_history), where='post')\n","plt.show()"]},{"attachments":{},"cell_type":"markdown","metadata":{},"source":["## 3. Add a customized LightGBM learner in FLAML\n","The native API of LightGBM allows one to specify a custom objective function in the model constructor. You can easily enable it by adding a customized LightGBM learner in FLAML. In the following example, we show how to add such a customized LightGBM learner with a custom objective function for parallel tuning with Spark.\n","\n","It's a little bit different from adding customized learners for sequential training. In sequential training, we can define the customized learner in a notebook cell. However, in spark training, we have to import it from a file so that Spark can use it in executors. 
We can easily do it by leveraging `broadcast_code` function in `flaml.tune.spark.utils`."]},{"attachments":{},"cell_type":"markdown","metadata":{},"source":["### Create a customized LightGBM learner with a custom objective function"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T09:09:49.540914Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T09:09:49.6259637Z\",\"execution_finish_time\":\"2022-12-07T09:09:50.5841239Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}"},"outputs":[],"source":["custom_code = \"\"\"\n","import numpy as np \n","from flaml.model import LGBMEstimator\n","from flaml import tune\n","\n","\n","''' define your customized objective function '''\n","def my_loss_obj(y_true, y_pred):\n"," c = 0.5\n"," residual = y_pred - y_true\n"," grad = c * residual /(np.abs(residual) + c)\n"," hess = c ** 2 / (np.abs(residual) + c) ** 2\n"," # rmse grad and hess\n"," grad_rmse = residual\n"," hess_rmse = 1.0\n"," \n"," # mae grad and hess\n"," grad_mae = np.array(residual)\n"," grad_mae[grad_mae > 0] = 1.\n"," grad_mae[grad_mae <= 0] = -1.\n"," hess_mae = 1.0\n","\n"," coef = [0.4, 0.3, 0.3]\n"," return coef[0] * grad + coef[1] * grad_rmse + coef[2] * grad_mae, \\\n"," coef[0] * hess + coef[1] * hess_rmse + coef[2] * hess_mae\n","\n","\n","''' create a customized LightGBM learner class with your objective function '''\n","class MyLGBM(LGBMEstimator):\n"," '''LGBMEstimator with my_loss_obj as the objective function\n"," '''\n","\n"," def __init__(self, **config):\n"," super().__init__(objective=my_loss_obj, **config)\n","\"\"\"\n","\n","from flaml.tune.spark.utils import broadcast_code\n","custom_learner_path = broadcast_code(custom_code=custom_code)\n","print(custom_learner_path)\n","from flaml.tune.spark.mylearner import MyLGBM"]},{"attachments":{},"cell_type":"markdown","metadata":{},"source":["### Add the customized learner in FLAML"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T09:14:16.2449566Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T09:14:16.3227204Z\",\"execution_finish_time\":\"2022-12-07T09:16:49.7573919Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","tags":[]},"outputs":[],"source":["automl = AutoML()\n","automl.add_learner(learner_name='my_lgbm', learner_class=MyLGBM)\n","settings = {\n"," \"time_budget\": 30, # total running time in seconds\n"," \"metric\": 'r2', # primary metrics for regression can be chosen from: ['mae','mse','r2']\n"," \"estimator_list\": ['my_lgbm',], # list of ML learners; we tune lightgbm in this example\n"," \"task\": 'regression', # task type \n"," \"log_file_name\": 'houses_experiment_my_lgbm.log', # flaml log file\n"," \"n_concurrent_trials\": 2,\n"," \"use_spark\": True,\n","}\n","automl.fit(X_train=X_train, y_train=y_train, **settings)"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T09:17:06.0159529Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T09:17:06.1042554Z\",\"execution_finish_time\":\"2022-12-07T09:17:06.467989Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","tags":[]},"outputs":[],"source":["print('Best hyperparmeter config:', automl.best_config)\n","print('Best r2 on validation data: {0:.4g}'.format(1-automl.best_loss))\n","print('Training duration of best run: {0:.4g} 
s'.format(automl.best_config_train_time))\n","\n","y_pred = automl.predict(X_test)\n","print('Predicted labels', y_pred)\n","print('True labels', y_test)\n","\n","from flaml.ml import sklearn_metric_loss_score\n","print('r2', '=', 1 - sklearn_metric_loss_score('r2', y_pred, y_test))\n","print('mse', '=', sklearn_metric_loss_score('mse', y_pred, y_test))\n","print('mae', '=', sklearn_metric_loss_score('mae', y_pred, y_test))"]},{"cell_type":"code","execution_count":null,"metadata":{"jupyter":{"outputs_hidden":false,"source_hidden":false},"nteract":{"transient":{"deleting":false}}},"outputs":[],"source":[]}],"metadata":{"kernel_info":{"name":"synapse_pyspark"},"kernelspec":{"display_name":"Python 3.8.13 ('syml-py38')","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.8.13 (default, Oct 21 2022, 23:50:54) \n[GCC 11.2.0]"},"notebook_environment":{},"save_output":true,"spark_compute":{"compute_id":"/trident/default","session_options":{"conf":{"spark.livy.synapse.ipythonInterpreter.enabled":"true"},"enableDebugMode":false,"keepAliveTimeout":30}},"synapse_widget":{"state":{},"version":"0.1"},"trident":{"lakehouse":{}},"vscode":{"interpreter":{"hash":"e3d9487e2ef008ade0db1bc293d3206d35cb2b6081faff9f66b40b257b7398f7"}}},"nbformat":4,"nbformat_minor":0} +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Copyright (c) Microsoft Corporation. All rights reserved. \n", + "\n", + "Licensed under the MIT License.\n", + "\n", + "# Run FLAML Parallel tuning with Spark\n", + "\n", + "\n", + "## 1. Introduction\n", + "\n", + "FLAML is a Python library (https://github.com/microsoft/FLAML) designed to automatically produce accurate machine learning models \n", + "with low computational cost. It is fast and economical. The simple and lightweight design makes it easy \n", + "to use and extend, such as adding new learners. FLAML can \n", + "- serve as an economical AutoML engine,\n", + "- be used as a fast hyperparameter tuning tool, or \n", + "- be embedded in self-tuning software that requires low latency & resource in repetitive\n", + " tuning tasks.\n", + "\n", + "In this notebook, we demonstrate how to run FLAML parallel tuning using Spark as the backend.\n", + "\n", + "FLAML requires `Python>=3.8`. To run this notebook example, please install flaml with the following options:\n", + "```bash\n", + "pip install flaml[automl,spark,blendsearch]\n", + "```\n", + "*Spark support is added in v1.1.0*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellStatus": "{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:16:51.6335768Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:17:21.9028602Z\",\"execution_finish_time\":\"2022-12-07T08:18:52.3646576Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}" + }, + "outputs": [], + "source": [ + "# %pip install flaml[automl,spark,blendsearch] matplotlib openml" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## 2. Regression Example\n", + "### Load data and preprocess\n", + "\n", + "Download [houses dataset](https://www.openml.org/d/537) from OpenML. 
The task is to predict the median house price in a region based on the demographic composition and the state of the housing market in that region."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellStatus": "{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:53.4783943Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:20:55.7666047Z\",\"execution_finish_time\":\"2022-12-07T08:21:10.9050139Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}",
+ "slideshow": {
+ "slide_type": "subslide"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "from flaml.automl.data import load_openml_dataset\n",
+ "\n",
+ "try:\n",
+ " X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=537, data_dir='./')\n",
+ "except Exception:\n",
+ " # fall back to scikit-learn's copy of the California housing data if the OpenML download fails\n",
+ " from sklearn.datasets import fetch_california_housing\n",
+ " from sklearn.model_selection import train_test_split\n",
+ "\n",
+ " X, y = fetch_california_housing(return_X_y=True)\n",
+ " X_train, X_test, y_train, y_test = train_test_split(X, y)\n"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "### Run FLAML\n",
+ "In the FLAML automl run configuration, users can specify the task type, time budget, error metric, learner list, whether to subsample, resampling strategy type, and so on. All these arguments have default values which will be used if users do not provide them. \n",
+ "\n",
+ "Notice that here `use_spark` is set to `True` in order to use Spark as the parallel training backend."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellStatus": "{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:53.7001471Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:21:10.9846131Z\",\"execution_finish_time\":\"2022-12-07T08:21:11.3604062Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}",
+ "slideshow": {
+ "slide_type": "slide"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "''' import AutoML class from flaml package '''\n",
+ "from flaml import AutoML\n",
+ "automl = AutoML()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellStatus": "{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:53.8983341Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:21:11.4417491Z\",\"execution_finish_time\":\"2022-12-07T08:21:11.8242955Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}",
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "settings = {\n",
+ " \"time_budget\": 30, # total running time in seconds\n",
+ " \"metric\": 'r2', # primary metrics for regression can be chosen from: ['mae','mse','r2','rmse','mape']\n",
+ " \"estimator_list\": ['lgbm'], # list of ML learners; we tune lightgbm in this example\n",
+ " \"task\": 'regression', # task type \n",
+ " \"log_file_name\": 'houses_experiment.log', # flaml log file\n",
+ " \"seed\": 7654321, # random seed\n",
+ " \"use_spark\": True, # whether to use Spark for distributed training\n",
+ " \"n_concurrent_trials\": 2, # the maximum number of concurrent trials\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellStatus": "{\"Li 
Jiang\":{\"queued_time\":\"2022-12-07T08:20:54.3953298Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:21:11.9003975Z\",\"execution_finish_time\":\"2022-12-07T08:27:58.525709Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}", + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "'''The main flaml automl API'''\n", + "automl.fit(X_train=X_train, y_train=y_train, **settings)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### Best model and metric" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellStatus": "{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:54.789647Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:27:58.6014435Z\",\"execution_finish_time\":\"2022-12-07T08:27:58.9745212Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}", + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "''' retrieve best config'''\n", + "print('Best hyperparmeter config:', automl.best_config)\n", + "print('Best r2 on validation data: {0:.4g}'.format(1-automl.best_loss))\n", + "print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellStatus": "{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:54.9962623Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:27:59.0491242Z\",\"execution_finish_time\":\"2022-12-07T08:27:59.4076477Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}", + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "automl.model.estimator" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellStatus": "{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:55.2539877Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:27:59.5247209Z\",\"execution_finish_time\":\"2022-12-07T08:28:00.4849272Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}" + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "plt.barh(automl.feature_names_in_, automl.feature_importances_)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellStatus": "{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:55.5182783Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:28:00.5644015Z\",\"execution_finish_time\":\"2022-12-07T08:28:01.5531147Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}", + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "''' pickle and save the automl object '''\n", + "import pickle\n", + "with open('automl.pkl', 'wb') as f:\n", + " pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellStatus": "{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:55.803107Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:28:01.6350567Z\",\"execution_finish_time\":\"2022-12-07T08:28:02.5774117Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}", + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "''' compute predictions of testing dataset ''' \n", + "y_pred = automl.predict(X_test)\n", + 
"print('Predicted labels', y_pred)\n", + "print('True labels', y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellStatus": "{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:56.0585537Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:28:02.6537337Z\",\"execution_finish_time\":\"2022-12-07T08:28:03.0177805Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}", + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "''' compute different metric values on testing dataset'''\n", + "from flaml.ml import sklearn_metric_loss_score\n", + "print('r2', '=', 1 - sklearn_metric_loss_score('r2', y_pred, y_test))\n", + "print('mse', '=', sklearn_metric_loss_score('mse', y_pred, y_test))\n", + "print('mae', '=', sklearn_metric_loss_score('mae', y_pred, y_test))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellStatus": "{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:56.2226463Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:28:03.1150781Z\",\"execution_finish_time\":\"2022-12-07T08:28:03.4858362Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}", + "slideshow": { + "slide_type": "subslide" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "from flaml.automl.data import get_output_from_log\n", + "time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \\\n", + " get_output_from_log(filename=settings['log_file_name'], time_budget=60)\n", + "\n", + "for config in config_history:\n", + " print(config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellStatus": "{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:56.4020235Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:28:03.5811012Z\",\"execution_finish_time\":\"2022-12-07T08:28:04.5493292Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}", + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "plt.title('Learning Curve')\n", + "plt.xlabel('Wall Clock Time (s)')\n", + "plt.ylabel('Validation r2')\n", + "plt.scatter(time_history, 1 - np.array(valid_loss_history))\n", + "plt.step(time_history, 1 - np.array(best_valid_loss_history), where='post')\n", + "plt.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Add a customized LightGBM learner in FLAML\n", + "The native API of LightGBM allows one to specify a custom objective function in the model constructor. You can easily enable it by adding a customized LightGBM learner in FLAML. In the following example, we show how to add such a customized LightGBM learner with a custom objective function for parallel tuning with Spark.\n", + "\n", + "It's a little bit different from adding customized learners for sequential training. In sequential training, we can define the customized learner in a notebook cell. However, in spark training, we have to import it from a file so that Spark can use it in executors. We can easily do it by leveraging `broadcast_code` function in `flaml.tune.spark.utils`." 
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a customized LightGBM learner with a custom objective function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellStatus": "{\"Li Jiang\":{\"queued_time\":\"2022-12-07T09:09:49.540914Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T09:09:49.6259637Z\",\"execution_finish_time\":\"2022-12-07T09:09:50.5841239Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}" + }, + "outputs": [], + "source": [ + "custom_code = \"\"\"\n", + "import numpy as np \n", + "from flaml.automl.model import LGBMEstimator\n", + "from flaml import tune\n", + "\n", + "\n", + "''' define your customized objective function '''\n", + "def my_loss_obj(y_true, y_pred):\n", + " c = 0.5\n", + " residual = y_pred - y_true\n", + " grad = c * residual /(np.abs(residual) + c)\n", + " hess = c ** 2 / (np.abs(residual) + c) ** 2\n", + " # rmse grad and hess\n", + " grad_rmse = residual\n", + " hess_rmse = 1.0\n", + " \n", + " # mae grad and hess\n", + " grad_mae = np.array(residual)\n", + " grad_mae[grad_mae > 0] = 1.\n", + " grad_mae[grad_mae <= 0] = -1.\n", + " hess_mae = 1.0\n", + "\n", + " coef = [0.4, 0.3, 0.3]\n", + " return coef[0] * grad + coef[1] * grad_rmse + coef[2] * grad_mae, \\\n", + " coef[0] * hess + coef[1] * hess_rmse + coef[2] * hess_mae\n", + "\n", + "\n", + "''' create a customized LightGBM learner class with your objective function '''\n", + "class MyLGBM(LGBMEstimator):\n", + " '''LGBMEstimator with my_loss_obj as the objective function\n", + " '''\n", + "\n", + " def __init__(self, **config):\n", + " super().__init__(objective=my_loss_obj, **config)\n", + "\"\"\"\n", + "\n", + "from flaml.tune.spark.utils import broadcast_code\n", + "custom_learner_path = broadcast_code(custom_code=custom_code)\n", + "print(custom_learner_path)\n", + "from flaml.tune.spark.mylearner import MyLGBM" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Add the customized learner in FLAML" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellStatus": "{\"Li Jiang\":{\"queued_time\":\"2022-12-07T09:14:16.2449566Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T09:14:16.3227204Z\",\"execution_finish_time\":\"2022-12-07T09:16:49.7573919Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}", + "tags": [] + }, + "outputs": [], + "source": [ + "automl = AutoML()\n", + "automl.add_learner(learner_name='my_lgbm', learner_class=MyLGBM)\n", + "settings = {\n", + " \"time_budget\": 30, # total running time in seconds\n", + " \"metric\": 'r2', # primary metrics for regression can be chosen from: ['mae','mse','r2']\n", + " \"estimator_list\": ['my_lgbm',], # list of ML learners; we tune lightgbm in this example\n", + " \"task\": 'regression', # task type \n", + " \"log_file_name\": 'houses_experiment_my_lgbm.log', # flaml log file\n", + " \"n_concurrent_trials\": 2,\n", + " \"use_spark\": True,\n", + "}\n", + "automl.fit(X_train=X_train, y_train=y_train, **settings)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellStatus": "{\"Li Jiang\":{\"queued_time\":\"2022-12-07T09:17:06.0159529Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T09:17:06.1042554Z\",\"execution_finish_time\":\"2022-12-07T09:17:06.467989Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}", + "tags": [] + }, + 
"outputs": [], + "source": [ + "print('Best hyperparmeter config:', automl.best_config)\n", + "print('Best r2 on validation data: {0:.4g}'.format(1-automl.best_loss))\n", + "print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))\n", + "\n", + "y_pred = automl.predict(X_test)\n", + "print('Predicted labels', y_pred)\n", + "print('True labels', y_test)\n", + "\n", + "from flaml.ml import sklearn_metric_loss_score\n", + "print('r2', '=', 1 - sklearn_metric_loss_score('r2', y_pred, y_test))\n", + "print('mse', '=', sklearn_metric_loss_score('mse', y_pred, y_test))\n", + "print('mae', '=', sklearn_metric_loss_score('mae', y_pred, y_test))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernel_info": { + "name": "synapse_pyspark" + }, + "kernelspec": { + "display_name": "Python 3.8.13 ('syml-py38')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13 (default, Oct 21 2022, 23:50:54) \n[GCC 11.2.0]" + }, + "notebook_environment": {}, + "save_output": true, + "spark_compute": { + "compute_id": "/trident/default", + "session_options": { + "conf": { + "spark.livy.synapse.ipythonInterpreter.enabled": "true" + }, + "enableDebugMode": false, + "keepAliveTimeout": 30 + } + }, + "vscode": { + "interpreter": { + "hash": "e3d9487e2ef008ade0db1bc293d3206d35cb2b6081faff9f66b40b257b7398f7" + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebook/zeroshot_lightgbm.ipynb b/notebook/zeroshot_lightgbm.ipynb index 32acda41c..ddece6a29 100644 --- a/notebook/zeroshot_lightgbm.ipynb +++ b/notebook/zeroshot_lightgbm.ipynb @@ -28,7 +28,7 @@ "\n", "In this notebook, we demonstrate a basic use case of zero-shot AutoML with FLAML.\n", "\n", - "FLAML requires `Python>=3.7`. To run this notebook example, please install the [autozero] option:" + "FLAML requires `Python>=3.8`. 
To run this notebook example, please install the [autozero] option:"
 ]
 },
 {
@@ -130,7 +130,7 @@
 }
 ],
 "source": [
- "from flaml.data import load_openml_dataset\n",
+ "from flaml.automl.data import load_openml_dataset\n",
 "X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=537, data_dir='./')"
 ]
 },
diff --git a/test/automl/test_classification.py b/test/automl/test_classification.py
index ecec9a6d4..703aa70b9 100644
--- a/test/automl/test_classification.py
+++ b/test/automl/test_classification.py
@@ -277,12 +277,15 @@ class TestClassification(unittest.TestCase):
         import subprocess
         import sys

+        current_xgboost_version = xgb.__version__
         subprocess.check_call([sys.executable, "-m", "pip", "install", "xgboost==1.3.3", "--user"])
         automl = AutoML()
         automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
         print(automl.feature_names_in_)
         print(automl.feature_importances_)
-        subprocess.check_call([sys.executable, "-m", "pip", "install", "-U", "xgboost", "--user"])
+        subprocess.check_call(
+            [sys.executable, "-m", "pip", "install", "-U", f"xgboost=={current_xgboost_version}", "--user"]
+        )

     def test_ray_classification(self):
         X, y = load_breast_cancer(return_X_y=True)
diff --git a/test/automl/test_split.py b/test/automl/test_split.py
index 00990348f..f0e010d95 100644
--- a/test/automl/test_split.py
+++ b/test/automl/test_split.py
@@ -91,7 +91,7 @@ def test_groups():
 def test_stratified_groupkfold():
     from sklearn.model_selection import StratifiedGroupKFold
     from minio.error import ServerError
-    from flaml.data import load_openml_dataset
+    from flaml.automl.data import load_openml_dataset

     try:
         X_train, _, y_train, _ = load_openml_dataset(dataset_id=1169, data_dir="test/")
diff --git a/website/docs/Examples/Default-Flamlized.md b/website/docs/Examples/Default-Flamlized.md
index 4b0f2853f..2ac6cc3ea 100644
--- a/website/docs/Examples/Default-Flamlized.md
+++ b/website/docs/Examples/Default-Flamlized.md
@@ -45,7 +45,7 @@ LGBMRegressor(colsample_bytree=0.7019911744574896,

 ### Suggest hyperparameters without training

 ```
-from flaml.data import load_openml_dataset
+from flaml.automl.data import load_openml_dataset
 from flaml.default import LGBMRegressor
 from flaml.ml import sklearn_metric_loss_score
diff --git a/website/docs/Use-Cases/Task-Oriented-AutoML.md b/website/docs/Use-Cases/Task-Oriented-AutoML.md
index 7df7363f5..13202a4d1 100644
--- a/website/docs/Use-Cases/Task-Oriented-AutoML.md
+++ b/website/docs/Use-Cases/Task-Oriented-AutoML.md
@@ -135,7 +135,7 @@ The estimator list can contain one or more estimator names, each corresponding t

 #### Guidelines on tuning a custom estimator

 To tune a custom estimator that is not built-in, you need to:
-1. Build a custom estimator by inheritting [`flaml.model.BaseEstimator`](/docs/reference/automl/model#baseestimator-objects) or a derived class.
-For example, if you have a estimator class with scikit-learn style `fit()` and `predict()` functions, you only need to set `self.estimator_class` to be that class in your constructor.
+1. Build a custom estimator by inheriting [`flaml.automl.model.BaseEstimator`](/docs/reference/automl/model#baseestimator-objects) or a derived class.
+For example, if you have an estimator class with scikit-learn style `fit()` and `predict()` functions, you only need to set `self.estimator_class` to be that class in your constructor.

 ```python
@@ -177,7 +177,7 @@ class MyRegularizedGreedyForest(SKLearnEstimator):
     return space
 ```

-In the constructor, we set `self.estimator_class` as `RGFClassifier` or `RGFRegressor` according to the task type. 
If the estimator you want to tune does not have a scikit-learn style `fit()` and `predict()` API, you can override the `fit()` and `predict()` function of `flaml.model.BaseEstimator`, like [XGBoostEstimator](/docs/reference/automl/model#xgboostestimator-objects). Importantly, we also add the `task="binary"` parameter in the signature of `__init__` so that it doesn't get grouped together with the `**config` kwargs that determines the parameters with which the underlying estimator (`self.estimator_class`) is constructed. If your estimator doesn't use one of the parameters that it is passed, for example some regressors in `scikit-learn` don't use the `n_jobs` parameter, it is enough to add `n_jobs=None` to the signature so that it is ignored by the `**config` dict.
+In the constructor, we set `self.estimator_class` as `RGFClassifier` or `RGFRegressor` according to the task type. If the estimator you want to tune does not have a scikit-learn style `fit()` and `predict()` API, you can override the `fit()` and `predict()` function of `flaml.automl.model.BaseEstimator`, like [XGBoostEstimator](/docs/reference/automl/model#xgboostestimator-objects). Importantly, we also add the `task="binary"` parameter in the signature of `__init__` so that it doesn't get grouped together with the `**config` kwargs that determine the parameters with which the underlying estimator (`self.estimator_class`) is constructed. If your estimator doesn't use one of the parameters it is passed (for example, some regressors in `scikit-learn` don't use the `n_jobs` parameter), it is enough to add `n_jobs=None` to the signature so that it is ignored by the `**config` dict.

 2. Give the custom estimator a name and add it in AutoML. E.g.,

@@ -524,10 +524,10 @@ The best model can be obtained by the `model` property of an `AutoML` instance. 

 ```python
 automl.fit(X_train, y_train, task="regression")
 print(automl.model)
-# <flaml.model.LGBMEstimator object at 0x...>
+# <flaml.automl.model.LGBMEstimator object at 0x...>
 ```

-[`flaml.model.LGBMEstimator`](/docs/reference/automl/model#lgbmestimator-objects) is a wrapper class for LightGBM models. To access the underlying model, use the `estimator` property of the `flaml.model.LGBMEstimator` instance.
+[`flaml.automl.model.LGBMEstimator`](/docs/reference/automl/model#lgbmestimator-objects) is a wrapper class for LightGBM models. To access the underlying model, use the `estimator` property of the `flaml.automl.model.LGBMEstimator` instance.

 ```python
 print(automl.model.estimator)
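+# A sketch (assuming the regression fit above): the printed object is the underlying
+# lightgbm.LGBMRegressor, so its scikit-learn API can be called on it directly:
+automl.model.estimator.predict(X_train)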