From c26720c2999fa39de1f6f5581f5b52c8fb2c1f82 Mon Sep 17 00:00:00 2001
From: Chi Wang
Date: Fri, 11 Jun 2021 10:25:45 -0700
Subject: [PATCH] api doc for chacha (#105)

* api doc for chacha

* update params

* link to paper

* update dataset id

Co-authored-by: Chi Wang (MSR)
Co-authored-by: Qingyun Wu
---
 README.md                   |  8 ++---
 docs/index.rst              | 11 +++++--
 flaml/model.py              | 65 ++++++++++++++++++-------------------
 flaml/onlineml/README.md    | 10 +++++-
 notebook/flaml_autovw.ipynb |  4 +--
 test/test_automl.py         |  1 -
 test/test_autovw.py         |  2 +-
 7 files changed, 56 insertions(+), 45 deletions(-)

diff --git a/README.md b/README.md
index 593a6a566..094cf7d33 100644
--- a/README.md
+++ b/README.md
@@ -87,7 +87,7 @@ automl_settings = {
 X_train, y_train = load_iris(return_X_y=True)
 # Train with labeled input data
 automl.fit(X_train=X_train, y_train=y_train,
-            **automl_settings)
+           **automl_settings)
 # Predict
 print(automl.predict_proba(X_train))
 # Export the best model
@@ -111,7 +111,7 @@ automl_settings = {
 X_train, y_train = load_boston(return_X_y=True)
 # Train with labeled input data
 automl.fit(X_train=X_train, y_train=y_train,
-            **automl_settings)
+           **automl_settings)
 # Predict
 print(automl.predict(X_train))
 # Export the best model
@@ -140,7 +140,7 @@ For more technical details, please check our papers.
 ```
 * [Frugal Optimization for Cost-related Hyperparameters](https://arxiv.org/abs/2005.01571). Qingyun Wu, Chi Wang, Silu Huang. AAAI 2021.
 * [Economical Hyperparameter Optimization With Blended Search Strategy](https://www.microsoft.com/en-us/research/publication/economical-hyperparameter-optimization-with-blended-search-strategy/). Chi Wang, Qingyun Wu, Silu Huang, Amin Saied. ICLR 2021.
-* ChaCha for online AutoML. Qingyun Wu, Chi Wang, John Langford, Paul Mineiro and Marco Rossi. To appear in ICML 2021.
+* [ChaCha for Online AutoML](https://arxiv.org/pdf/2106.04815.pdf). Qingyun Wu, Chi Wang, John Langford, Paul Mineiro and Marco Rossi. To appear in ICML 2021.
 
 ## Contributing
 
@@ -180,7 +180,7 @@ If all the tests are passed, please also test run notebook/flaml_automl to make
 * Chi Wang
 * Qingyun Wu
 
-Contributors (alphabetical order): Sebastien Bubeck, Surajit Chaudhuri, Nadiia Chepurko, Ofer Dekel, Alex Deng, Anshuman Dutt, Nicolo Fusi, Jianfeng Gao, Johannes Gehrke, Silu Huang, Dongwoo Kim, Christian Konig, John Langford, Amin Saied, Neil Tenenholtz, Markus Weimer, Haozhe Zhang, Erkang Zhu.
+Contributors (alphabetical order): Sebastien Bubeck, Surajit Chaudhuri, Nadiia Chepurko, Ofer Dekel, Alex Deng, Anshuman Dutt, Nicolo Fusi, Jianfeng Gao, Johannes Gehrke, Silu Huang, Dongwoo Kim, Christian Konig, John Langford, Xueqing Liu, Paul Mineiro, Amin Saied, Neil Tenenholtz, Olga Vrousgou, Markus Weimer, Haozhe Zhang, Erkang Zhu.
 
 ## License
 
diff --git a/docs/index.rst b/docs/index.rst
index c4ae47a98..5b297675a 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -22,7 +22,7 @@ AutoML
 
 
 Tune
-------
+----
 
 .. autofunction:: flaml.tune.run
 
@@ -38,8 +38,15 @@ Tune
     :members:
 
 
+Online AutoML
+-------------
+
+.. autoclass:: flaml.AutoVW
+    :members:
+
+
 NLP
-------
+---
 
 .. autoclass:: flaml.nlp.AutoTransformers
     :members:
 
diff --git a/flaml/model.py b/flaml/model.py
index 8d32f3f38..57fd34ca9 100644
--- a/flaml/model.py
+++ b/flaml/model.py
@@ -171,6 +171,9 @@ class BaseEstimator:
 
 class SKLearnEstimator(BaseEstimator):
 
+    def __init__(self, task='binary:logistic', **params):
+        super().__init__(task, **params)
+
     def _preprocess(self, X):
         if isinstance(X, pd.DataFrame):
             X = X.copy()
@@ -231,12 +234,7 @@ class LGBMEstimator(BaseEstimator):
         n_estimators = int(round(config['n_estimators']))
         return (num_leaves * 3 + (num_leaves - 1) * 4 + 1.0) * n_estimators * 8
 
-    def __init__(
-        self, task='binary:logistic', n_jobs=1,
-        n_estimators=2, num_leaves=2, min_child_samples=20, learning_rate=0.1,
-        subsample=1.0, reg_lambda=1.0, reg_alpha=0.0,
-        colsample_bytree=1.0, log_max_bin=8, **params
-    ):
+    def __init__(self, task='binary:logistic', log_max_bin=8, **params):
         super().__init__(task, **params)
         # Default: ‘regression’ for LGBMRegressor,
         # ‘binary’ or ‘multiclass’ for LGBMClassifier
@@ -248,20 +246,16 @@ class LGBMEstimator(BaseEstimator):
             objective = 'multiclass'
         else:
             objective = 'regression'
-        self.params = {
-            "n_estimators": int(round(n_estimators)),
-            "num_leaves": int(round(num_leaves)),
-            'objective': params.get("objective", objective),
-            'n_jobs': n_jobs,
-            'learning_rate': float(learning_rate),
-            'reg_alpha': float(reg_alpha),
-            'reg_lambda': float(reg_lambda),
-            'min_child_samples': int(round(min_child_samples)),
-            'colsample_bytree': float(colsample_bytree),
-            'subsample': float(subsample),
-        }
-        self.params['max_bin'] = params['max_bin'] if 'max_bin' in params else (
-            1 << int(round(log_max_bin))) - 1
+        if "n_estimators" in self.params:
+            self.params["n_estimators"] = int(round(self.params["n_estimators"]))
+        if "num_leaves" in self.params:
+            self.params["num_leaves"] = int(round(self.params["num_leaves"]))
+        if "min_child_samples" in self.params:
+            self.params["min_child_samples"] = int(round(self.params["min_child_samples"]))
+        if "objective" not in self.params:
+            self.params["objective"] = objective
+        if "max_bin" not in self.params:
+            self.params['max_bin'] = 1 << int(round(log_max_bin)) - 1
         if 'regression' in task:
             self.estimator_class = LGBMRegressor
         else:
@@ -369,7 +363,7 @@ class XGBoostEstimator(SKLearnEstimator):
     ):
         super().__init__(task, **params)
         self._n_estimators = int(round(n_estimators))
-        self.params = {
+        self.params.update({
             'max_leaves': int(round(max_leaves)),
             'max_depth': params.get('max_depth', 0),
             'grow_policy': params.get("grow_policy", 'lossguide'),
@@ -385,7 +379,7 @@ class XGBoostEstimator(SKLearnEstimator):
             'colsample_bylevel': float(colsample_bylevel),
             'colsample_bytree': float(colsample_bytree),
             'objective': params.get("objective")
-        }
+        })
 
         if all_thread:
             del self.params['nthread']
@@ -445,7 +439,8 @@ class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator):
         **params
     ):
         super().__init__(task, **params)
-        self.params = {
+        self.params = params
+        self.params.update({
             "n_estimators": int(round(n_estimators)),
             'max_leaves': int(round(max_leaves)),
             'max_depth': 0,
@@ -462,7 +457,7 @@ class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator):
             'colsample_bylevel': float(colsample_bylevel),
             'colsample_bytree': float(colsample_bytree),
             'use_label_encoder': params.get('use_label_encoder', False),
-        }
+        })
 
         if 'regression' in task:
             self.estimator_class = xgb.XGBRegressor
@@ -513,11 +508,12 @@ class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
         n_estimators=4, max_features=1.0, criterion='gini', **params
     ):
         super().__init__(task, **params)
-        self.params = {
+        self.params = params
+        self.params.update({
             "n_estimators": int(round(n_estimators)),
             "n_jobs": n_jobs,
             'max_features': float(max_features),
-        }
+        })
         if 'regression' in task:
             self.estimator_class = RandomForestRegressor
         else:
@@ -565,13 +561,13 @@ class LRL1Classifier(SKLearnEstimator):
         **params
     ):
         super().__init__(task, **params)
-        self.params = {
+        self.params.update({
             'penalty': params.get("penalty", 'l1'),
             'tol': float(tol),
             'C': float(C),
             'solver': params.get("solver", 'saga'),
             'n_jobs': n_jobs,
-        }
+        })
         if 'regression' in task:
             self.estimator_class = None
             raise NotImplementedError('LR does not support regression task')
@@ -594,13 +590,13 @@ class LRL2Classifier(SKLearnEstimator):
         **params
     ):
         super().__init__(task, **params)
-        self.params = {
+        self.params.update({
             'penalty': params.get("penalty", 'l2'),
             'tol': float(tol),
             'C': float(C),
             'solver': params.get("solver", 'lbfgs'),
             'n_jobs': n_jobs,
-        }
+        })
         if 'regression' in task:
             self.estimator_class = None
             raise NotImplementedError('LR does not support regression task')
@@ -648,14 +644,14 @@ class CatBoostEstimator(BaseEstimator):
         n_estimators=8192, learning_rate=0.1, early_stopping_rounds=4, **params
     ):
         super().__init__(task, **params)
-        self.params = {
+        self.params.update({
             "early_stopping_rounds": int(round(early_stopping_rounds)),
             "n_estimators": n_estimators,
             'learning_rate': learning_rate,
             'thread_count': n_jobs,
             'verbose': params.get('verbose', False),
             'random_seed': params.get("random_seed", 10242048),
-        }
+        })
         if 'regression' in task:
             from catboost import CatBoostRegressor
             self.estimator_class = CatBoostRegressor
@@ -759,11 +755,12 @@ class KNeighborsEstimator(BaseEstimator):
         self, task='binary:logistic', n_jobs=1, n_neighbors=5, **params
     ):
         super().__init__(task, **params)
-        self.params = {
+        self.params = params
+        self.params.update({
             'n_neighbors': int(round(n_neighbors)),
             'weights': params.get('weights', 'distance'),
             'n_jobs': n_jobs,
-        }
+        })
         if 'regression' in task:
             from sklearn.neighbors import KNeighborsRegressor
             self.estimator_class = KNeighborsRegressor
diff --git a/flaml/onlineml/README.md b/flaml/onlineml/README.md
index 926721055..4544f8fa6 100644
--- a/flaml/onlineml/README.md
+++ b/flaml/onlineml/README.md
@@ -4,7 +4,15 @@ FLAML includes *ChaCha* which is an automatic hyperparameter tuning solution for
 
 For more technical details about *ChaCha*, please check our paper.
 
-* ChaCha for online AutoML. Qingyun Wu, Chi Wang, John Langford, Paul Mineiro and Marco Rossi. To appear in ICML 2021.
+* [ChaCha for Online AutoML](https://arxiv.org/pdf/2106.04815.pdf). Qingyun Wu, Chi Wang, John Langford, Paul Mineiro and Marco Rossi. To appear in ICML 2021.
+```
+@inproceedings{wu2021chacha,
+    title={ChaCha for online AutoML},
+    author={Qingyun Wu and Chi Wang and John Langford and Paul Mineiro and Marco Rossi},
+    year={2021},
+    booktitle={ICML},
+}
+```
 
 ## `AutoVW`
 
diff --git a/notebook/flaml_autovw.ipynb b/notebook/flaml_autovw.ipynb
index 843b9157d..508628e93 100644
--- a/notebook/flaml_autovw.ipynb
+++ b/notebook/flaml_autovw.ipynb
@@ -48,7 +48,7 @@
     "## 2. Online regression with AutoVW\n",
     "### Load data from openml and preprocess\n",
     "\n",
-    "Download [dataset_sales](https://www.openml.org/d/42183) from OpenML."
+    "Download [NewFuelCar](https://www.openml.org/d/41506) from OpenML."
    ]
   },
   {
@@ -412,4 +412,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 2
-}
+}
\ No newline at end of file
diff --git a/test/test_automl.py b/test/test_automl.py
index 7e3bfdc51..6916ba9ec 100644
--- a/test/test_automl.py
+++ b/test/test_automl.py
@@ -273,7 +273,6 @@ class TestAutoML(unittest.TestCase):
         automl_experiment = AutoML()
         automl_settings = {
             "time_budget": 2,
-            "metric": 'mse',
             "task": 'regression',
             "log_file_name": "test/boston.log",
             "log_training_metric": True,
diff --git a/test/test_autovw.py b/test/test_autovw.py
index bc7524e22..3f294535e 100644
--- a/test/test_autovw.py
+++ b/test/test_autovw.py
@@ -5,7 +5,6 @@ import scipy.sparse
 import pandas as pd
 from sklearn.metrics import mean_squared_error, mean_absolute_error
 
-import time
 import logging
 from flaml.tune import loguniform, polynomial_expansion_set
 from vowpalwabbit import pyvw
@@ -13,6 +12,7 @@ from flaml import AutoVW
 import string
 import os
 import openml
+
 VW_DS_DIR = 'test/data/'
 NS_LIST = list(string.ascii_lowercase) + list(string.ascii_uppercase)
 logger = logging.getLogger(__name__)
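
For readers landing on this patch without the notebook, here is a minimal usage sketch of the `flaml.AutoVW` class that the new `Online AutoML` docs entry and `flaml/onlineml/README.md` describe. It is illustrative only, not part of the patch: the `max_live_model_num`/`search_space` constructor arguments and the `predict`/`learn` methods follow the referenced `notebook/flaml_autovw.ipynb`, the tuning helpers are the same `loguniform` and `polynomial_expansion_set` that `test/test_autovw.py` imports above, and the namespaces and VW-format examples are made up for the sketch.

```python
from flaml import AutoVW
from flaml.tune import loguniform, polynomial_expansion_set

# Search space over VW namespace interactions and the learning rate.
# The 'a' and 'b' namespaces below are placeholders for this sketch.
search_space = {
    'interactions': polynomial_expansion_set(
        init_monomials={'a', 'b'}, highest_poly_order=2, allow_self_inter=False),
    'learning_rate': loguniform(lower=2e-10, upper=1.0),
}

# ChaCha keeps at most `max_live_model_num` VW models alive and decides
# which hyperparameter configurations to train, challenge, and promote.
autovw = AutoVW(max_live_model_num=5, search_space=search_space)

# Online loop: predict on each example first, then feed it back for learning.
vw_examples = [
    '0.23 |a sqft:0.8 age:0.4 |b price:0.5',
    '0.41 |a sqft:0.3 age:0.9 |b price:0.2',
]
for example in vw_examples:
    y_pred = autovw.predict(example)  # prediction from the current best live model
    autovw.learn(example)             # update the live models with this example
```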