From c26720c2999fa39de1f6f5581f5b52c8fb2c1f82 Mon Sep 17 00:00:00 2001
From: Chi Wang
Date: Fri, 11 Jun 2021 10:25:45 -0700
Subject: [PATCH] api doc for chacha (#105)

* api doc for chacha

* update params

* link to paper

* update dataset id

Co-authored-by: Chi Wang (MSR)
Co-authored-by: Qingyun Wu
---
 README.md                   |  8 ++---
 docs/index.rst              | 11 +++++--
 flaml/model.py              | 65 ++++++++++++++++++-------------------
 flaml/onlineml/README.md    | 10 +++++-
 notebook/flaml_autovw.ipynb |  4 +--
 test/test_automl.py         |  1 -
 test/test_autovw.py         |  2 +-
 7 files changed, 56 insertions(+), 45 deletions(-)

diff --git a/README.md b/README.md
index 593a6a566..094cf7d33 100644
--- a/README.md
+++ b/README.md
@@ -87,7 +87,7 @@ automl_settings = {
 X_train, y_train = load_iris(return_X_y=True)
 # Train with labeled input data
 automl.fit(X_train=X_train, y_train=y_train,
-            **automl_settings)
+           **automl_settings)
 # Predict
 print(automl.predict_proba(X_train))
 # Export the best model
@@ -111,7 +111,7 @@ automl_settings = {
 X_train, y_train = load_boston(return_X_y=True)
 # Train with labeled input data
 automl.fit(X_train=X_train, y_train=y_train,
-            **automl_settings)
+           **automl_settings)
 # Predict
 print(automl.predict(X_train))
 # Export the best model
@@ -140,7 +140,7 @@ For more technical details, please check our papers.
 ```
 * [Frugal Optimization for Cost-related Hyperparameters](https://arxiv.org/abs/2005.01571). Qingyun Wu, Chi Wang, Silu Huang. AAAI 2021.
 * [Economical Hyperparameter Optimization With Blended Search Strategy](https://www.microsoft.com/en-us/research/publication/economical-hyperparameter-optimization-with-blended-search-strategy/). Chi Wang, Qingyun Wu, Silu Huang, Amin Saied. ICLR 2021.
-* ChaCha for online AutoML. Qingyun Wu, Chi Wang, John Langford, Paul Mineiro and Marco Rossi. To appear in ICML 2021.
+* [ChaCha for Online AutoML](https://arxiv.org/pdf/2106.04815.pdf). Qingyun Wu, Chi Wang, John Langford, Paul Mineiro and Marco Rossi. To appear in ICML 2021.
 
 ## Contributing
 
@@ -180,7 +180,7 @@ If all the tests are passed, please also test run notebook/flaml_automl to make
 * Chi Wang
 * Qingyun Wu
 
-Contributors (alphabetical order): Sebastien Bubeck, Surajit Chaudhuri, Nadiia Chepurko, Ofer Dekel, Alex Deng, Anshuman Dutt, Nicolo Fusi, Jianfeng Gao, Johannes Gehrke, Silu Huang, Dongwoo Kim, Christian Konig, John Langford, Amin Saied, Neil Tenenholtz, Markus Weimer, Haozhe Zhang, Erkang Zhu.
+Contributors (alphabetical order): Sebastien Bubeck, Surajit Chaudhuri, Nadiia Chepurko, Ofer Dekel, Alex Deng, Anshuman Dutt, Nicolo Fusi, Jianfeng Gao, Johannes Gehrke, Silu Huang, Dongwoo Kim, Christian Konig, John Langford, Xueqing Liu, Paul Mineiro, Amin Saied, Neil Tenenholtz, Olga Vrousgou, Markus Weimer, Haozhe Zhang, Erkang Zhu.
 
 ## License
 
diff --git a/docs/index.rst b/docs/index.rst
index c4ae47a98..5b297675a 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -22,7 +22,7 @@ AutoML
 
 
 Tune
-------
+----
 
 .. autofunction:: flaml.tune.run
 
@@ -38,8 +38,15 @@ Tune
     :members:
 
 
+Online AutoML
+-------------
+
+.. autoclass:: flaml.AutoVW
+    :members:
+
+
 NLP
-------
+---
 
 .. autoclass:: flaml.nlp.AutoTransformers
     :members:
 
diff --git a/flaml/model.py b/flaml/model.py
index 8d32f3f38..57fd34ca9 100644
--- a/flaml/model.py
+++ b/flaml/model.py
@@ -171,6 +171,9 @@ class BaseEstimator:
 
 class SKLearnEstimator(BaseEstimator):
 
+    def __init__(self, task='binary:logistic', **params):
+        super().__init__(task, **params)
+
     def _preprocess(self, X):
         if isinstance(X, pd.DataFrame):
             X = X.copy()
@@ -231,12 +234,7 @@ class LGBMEstimator(BaseEstimator):
         n_estimators = int(round(config['n_estimators']))
         return (num_leaves * 3 + (num_leaves - 1) * 4 + 1.0) * n_estimators * 8
 
-    def __init__(
-        self, task='binary:logistic', n_jobs=1,
-        n_estimators=2, num_leaves=2, min_child_samples=20, learning_rate=0.1,
-        subsample=1.0, reg_lambda=1.0, reg_alpha=0.0,
-        colsample_bytree=1.0, log_max_bin=8, **params
-    ):
+    def __init__(self, task='binary:logistic', log_max_bin=8, **params):
         super().__init__(task, **params)
         # Default: ‘regression’ for LGBMRegressor,
         # ‘binary’ or ‘multiclass’ for LGBMClassifier
@@ -248,20 +246,16 @@ class LGBMEstimator(BaseEstimator):
             objective = 'multiclass'
         else:
             objective = 'regression'
-        self.params = {
-            "n_estimators": int(round(n_estimators)),
-            "num_leaves": int(round(num_leaves)),
-            'objective': params.get("objective", objective),
-            'n_jobs': n_jobs,
-            'learning_rate': float(learning_rate),
-            'reg_alpha': float(reg_alpha),
-            'reg_lambda': float(reg_lambda),
-            'min_child_samples': int(round(min_child_samples)),
-            'colsample_bytree': float(colsample_bytree),
-            'subsample': float(subsample),
-        }
-        self.params['max_bin'] = params['max_bin'] if 'max_bin' in params else (
-            1 << int(round(log_max_bin))) - 1
+        if "n_estimators" in self.params:
+            self.params["n_estimators"] = int(round(self.params["n_estimators"]))
+        if "num_leaves" in self.params:
+            self.params["num_leaves"] = int(round(self.params["num_leaves"]))
+        if "min_child_samples" in self.params:
+            self.params["min_child_samples"] = int(round(self.params["min_child_samples"]))
+        if "objective" not in self.params:
+            self.params["objective"] = objective
+        if "max_bin" not in self.params:
+            self.params['max_bin'] = 1 << int(round(log_max_bin)) - 1
         if 'regression' in task:
             self.estimator_class = LGBMRegressor
         else:
@@ -369,7 +363,7 @@ class XGBoostEstimator(SKLearnEstimator):
     ):
         super().__init__(task, **params)
         self._n_estimators = int(round(n_estimators))
-        self.params = {
+        self.params.update({
             'max_leaves': int(round(max_leaves)),
             'max_depth': params.get('max_depth', 0),
             'grow_policy': params.get("grow_policy", 'lossguide'),
@@ -385,7 +379,7 @@ class XGBoostEstimator(SKLearnEstimator):
             'colsample_bylevel': float(colsample_bylevel),
             'colsample_bytree': float(colsample_bytree),
             'objective': params.get("objective")
-        }
+        })
 
         if all_thread:
             del self.params['nthread']
@@ -445,7 +439,8 @@ class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator):
         **params
     ):
         super().__init__(task, **params)
-        self.params = {
+        self.params = params
+        self.params.update({
             "n_estimators": int(round(n_estimators)),
             'max_leaves': int(round(max_leaves)),
             'max_depth': 0,
@@ -462,7 +457,7 @@ class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator):
             'colsample_bylevel': float(colsample_bylevel),
             'colsample_bytree': float(colsample_bytree),
             'use_label_encoder': params.get('use_label_encoder', False),
-        }
+        })
 
         if 'regression' in task:
             self.estimator_class = xgb.XGBRegressor
@@ -513,11 +508,12 @@ class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
         n_estimators=4, max_features=1.0, criterion='gini', **params
     ):
         super().__init__(task, **params)
-        self.params = {
+        self.params = params
+        self.params.update({
             "n_estimators": int(round(n_estimators)),
             "n_jobs": n_jobs,
             'max_features': float(max_features),
-        }
+        })
         if 'regression' in task:
             self.estimator_class = RandomForestRegressor
         else:
@@ -565,13 +561,13 @@ class LRL1Classifier(SKLearnEstimator):
         **params
     ):
         super().__init__(task, **params)
-        self.params = {
+        self.params.update({
             'penalty': params.get("penalty", 'l1'),
             'tol': float(tol),
             'C': float(C),
             'solver': params.get("solver", 'saga'),
             'n_jobs': n_jobs,
-        }
+        })
         if 'regression' in task:
             self.estimator_class = None
             raise NotImplementedError('LR does not support regression task')
@@ -594,13 +590,13 @@ class LRL2Classifier(SKLearnEstimator):
         **params
     ):
         super().__init__(task, **params)
-        self.params = {
+        self.params.update({
             'penalty': params.get("penalty", 'l2'),
             'tol': float(tol),
             'C': float(C),
             'solver': params.get("solver", 'lbfgs'),
             'n_jobs': n_jobs,
-        }
+        })
         if 'regression' in task:
             self.estimator_class = None
             raise NotImplementedError('LR does not support regression task')
@@ -648,14 +644,14 @@ class CatBoostEstimator(BaseEstimator):
         n_estimators=8192, learning_rate=0.1, early_stopping_rounds=4, **params
     ):
         super().__init__(task, **params)
-        self.params = {
+        self.params.update({
             "early_stopping_rounds": int(round(early_stopping_rounds)),
             "n_estimators": n_estimators,
             'learning_rate': learning_rate,
             'thread_count': n_jobs,
             'verbose': params.get('verbose', False),
             'random_seed': params.get("random_seed", 10242048),
-        }
+        })
         if 'regression' in task:
             from catboost import CatBoostRegressor
             self.estimator_class = CatBoostRegressor
@@ -759,11 +755,12 @@ class KNeighborsEstimator(BaseEstimator):
         self, task='binary:logistic', n_jobs=1, n_neighbors=5, **params
     ):
         super().__init__(task, **params)
-        self.params = {
+        self.params = params
+        self.params.update({
             'n_neighbors': int(round(n_neighbors)),
             'weights': params.get('weights', 'distance'),
             'n_jobs': n_jobs,
-        }
+        })
         if 'regression' in task:
             from sklearn.neighbors import KNeighborsRegressor
             self.estimator_class = KNeighborsRegressor
diff --git a/flaml/onlineml/README.md b/flaml/onlineml/README.md
index 926721055..4544f8fa6 100644
--- a/flaml/onlineml/README.md
+++ b/flaml/onlineml/README.md
@@ -4,7 +4,15 @@ FLAML includes *ChaCha* which is an automatic hyperparameter tuning solution for
 
 For more technical details about *ChaCha*, please check our paper.
 
-* ChaCha for online AutoML. Qingyun Wu, Chi Wang, John Langford, Paul Mineiro and Marco Rossi. To appear in ICML 2021.
+* [ChaCha for Online AutoML](https://arxiv.org/pdf/2106.04815.pdf). Qingyun Wu, Chi Wang, John Langford, Paul Mineiro and Marco Rossi. To appear in ICML 2021.
+```
+@inproceedings{wu2021chacha,
+    title={ChaCha for online AutoML},
+    author={Qingyun Wu and Chi Wang and John Langford and Paul Mineiro and Marco Rossi},
+    year={2021},
+    booktitle={ICML},
+}
+```
 
 ## `AutoVW`
 
diff --git a/notebook/flaml_autovw.ipynb b/notebook/flaml_autovw.ipynb
index 843b9157d..508628e93 100644
--- a/notebook/flaml_autovw.ipynb
+++ b/notebook/flaml_autovw.ipynb
@@ -48,7 +48,7 @@
     "## 2. Online regression with AutoVW\n",
     "### Load data from openml and preprocess\n",
     "\n",
-    "Download [dataset_sales](https://www.openml.org/d/42183) from OpenML."
+    "Download [NewFuelCar](https://www.openml.org/d/41506) from OpenML."
    ]
   },
   {
@@ -412,4 +412,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 2
-}
+}
\ No newline at end of file
diff --git a/test/test_automl.py b/test/test_automl.py
index 7e3bfdc51..6916ba9ec 100644
--- a/test/test_automl.py
+++ b/test/test_automl.py
@@ -273,7 +273,6 @@ class TestAutoML(unittest.TestCase):
         automl_experiment = AutoML()
         automl_settings = {
             "time_budget": 2,
-            "metric": 'mse',
             "task": 'regression',
             "log_file_name": "test/boston.log",
             "log_training_metric": True,
diff --git a/test/test_autovw.py b/test/test_autovw.py
index bc7524e22..3f294535e 100644
--- a/test/test_autovw.py
+++ b/test/test_autovw.py
@@ -5,7 +5,6 @@ import scipy.sparse
 import pandas as pd
 from sklearn.metrics import mean_squared_error, mean_absolute_error
 
-import time
 import logging
 from flaml.tune import loguniform, polynomial_expansion_set
 from vowpalwabbit import pyvw
@@ -13,6 +12,7 @@ from flaml import AutoVW
 import string
 import os
 import openml
+
 VW_DS_DIR = 'test/data/'
 NS_LIST = list(string.ascii_lowercase) + list(string.ascii_uppercase)
 logger = logging.getLogger(__name__)
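
For readers landing on this patch without the notebook, here is a minimal usage sketch of the `flaml.AutoVW` class that the new `Online AutoML` docs entry and `flaml/onlineml/README.md` describe. It is illustrative only, not part of the patch: the `max_live_model_num`/`search_space` constructor arguments and the `predict`/`learn` methods follow the referenced `notebook/flaml_autovw.ipynb`, the tuning helpers are the same `loguniform` and `polynomial_expansion_set` that `test/test_autovw.py` imports above, and the namespaces and VW-format examples are made up for the sketch.

```python
from flaml import AutoVW
from flaml.tune import loguniform, polynomial_expansion_set

# Search space over VW namespace interactions and the learning rate.
# The 'a' and 'b' namespaces below are placeholders for this sketch.
search_space = {
    'interactions': polynomial_expansion_set(
        init_monomials={'a', 'b'}, highest_poly_order=2, allow_self_inter=False),
    'learning_rate': loguniform(lower=2e-10, upper=1.0),
}

# ChaCha keeps at most `max_live_model_num` VW models alive and decides
# which hyperparameter configurations to train, challenge, and promote.
autovw = AutoVW(max_live_model_num=5, search_space=search_space)

# Online loop: predict on each example first, then feed it back for learning.
vw_examples = [
    '0.23 |a sqft:0.8 age:0.4 |b price:0.5',
    '0.41 |a sqft:0.3 age:0.9 |b price:0.2',
]
for example in vw_examples:
    y_pred = autovw.predict(example)  # prediction from the current best live model
    autovw.learn(example)             # update the live models with this example
```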