API doc for ChaCha (#105)

* API doc for ChaCha

* Update params

* Link to paper

* Update dataset ID

Co-authored-by: Chi Wang (MSR) <chiw@microsoft.com>
Co-authored-by: Qingyun Wu <qiw@microsoft.com>
This commit is contained in:
Chi Wang
2021-06-11 10:25:45 -07:00
committed by GitHub
parent a4049ad9b6
commit c26720c299
7 changed files with 56 additions and 45 deletions

View File

@@ -87,7 +87,7 @@ automl_settings = {
X_train, y_train = load_iris(return_X_y=True)
# Train with labeled input data
automl.fit(X_train=X_train, y_train=y_train,
**automl_settings)
**automl_settings)
# Predict
print(automl.predict_proba(X_train))
# Export the best model
@@ -111,7 +111,7 @@ automl_settings = {
X_train, y_train = load_boston(return_X_y=True)
# Train with labeled input data
automl.fit(X_train=X_train, y_train=y_train,
**automl_settings)
**automl_settings)
# Predict
print(automl.predict(X_train))
# Export the best model
@@ -140,7 +140,7 @@ For more technical details, please check our papers.
```
* [Frugal Optimization for Cost-related Hyperparameters](https://arxiv.org/abs/2005.01571). Qingyun Wu, Chi Wang, Silu Huang. AAAI 2021.
* [Economical Hyperparameter Optimization With Blended Search Strategy](https://www.microsoft.com/en-us/research/publication/economical-hyperparameter-optimization-with-blended-search-strategy/). Chi Wang, Qingyun Wu, Silu Huang, Amin Saied. ICLR 2021.
* ChaCha for online AutoML. Qingyun Wu, Chi Wang, John Langford, Paul Mineiro and Marco Rossi. To appear in ICML 2021.
* [ChaCha for Online AutoML](https://arxiv.org/pdf/2106.04815.pdf). Qingyun Wu, Chi Wang, John Langford, Paul Mineiro and Marco Rossi. To appear in ICML 2021.
## Contributing
@@ -180,7 +180,7 @@ If all the tests are passed, please also test run notebook/flaml_automl to make
* Chi Wang
* Qingyun Wu
Contributors (alphabetical order): Sebastien Bubeck, Surajit Chaudhuri, Nadiia Chepurko, Ofer Dekel, Alex Deng, Anshuman Dutt, Nicolo Fusi, Jianfeng Gao, Johannes Gehrke, Silu Huang, Dongwoo Kim, Christian Konig, John Langford, Amin Saied, Neil Tenenholtz, Markus Weimer, Haozhe Zhang, Erkang Zhu.
Contributors (alphabetical order): Sebastien Bubeck, Surajit Chaudhuri, Nadiia Chepurko, Ofer Dekel, Alex Deng, Anshuman Dutt, Nicolo Fusi, Jianfeng Gao, Johannes Gehrke, Silu Huang, Dongwoo Kim, Christian Konig, John Langford, Xueqing Liu, Paul Mineiro, Amin Saied, Neil Tenenholtz, Olga Vrousgou, Markus Weimer, Haozhe Zhang, Erkang Zhu.
## License

View File

@@ -22,7 +22,7 @@ AutoML
Tune
------
----
.. autofunction:: flaml.tune.run
@@ -38,8 +38,15 @@ Tune
:members:
Online AutoML
-------------
.. autoclass:: flaml.AutoVW
:members:
NLP
------
---
.. autoclass:: flaml.nlp.AutoTransformers
:members:

View File

@@ -171,6 +171,9 @@ class BaseEstimator:
class SKLearnEstimator(BaseEstimator):
def __init__(self, task='binary:logistic', **params):
super().__init__(task, **params)
def _preprocess(self, X):
if isinstance(X, pd.DataFrame):
X = X.copy()
@@ -231,12 +234,7 @@ class LGBMEstimator(BaseEstimator):
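# Rough size estimate: ~3 stored values per leaf plus ~4 per internal split
# (num_leaves - 1 of them), 8 bytes each, summed over n_estimators trees.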
n_estimators = int(round(config['n_estimators']))
return (num_leaves * 3 + (num_leaves - 1) * 4 + 1.0) * n_estimators * 8
def __init__(
self, task='binary:logistic', n_jobs=1,
n_estimators=2, num_leaves=2, min_child_samples=20, learning_rate=0.1,
subsample=1.0, reg_lambda=1.0, reg_alpha=0.0,
colsample_bytree=1.0, log_max_bin=8, **params
):
def __init__(self, task='binary:logistic', log_max_bin=8, **params):
super().__init__(task, **params)
# Default: regression for LGBMRegressor,
# binary or multiclass for LGBMClassifier
@@ -248,20 +246,16 @@ class LGBMEstimator(BaseEstimator):
objective = 'multiclass'
else:
objective = 'regression'
self.params = {
"n_estimators": int(round(n_estimators)),
"num_leaves": int(round(num_leaves)),
'objective': params.get("objective", objective),
'n_jobs': n_jobs,
'learning_rate': float(learning_rate),
'reg_alpha': float(reg_alpha),
'reg_lambda': float(reg_lambda),
'min_child_samples': int(round(min_child_samples)),
'colsample_bytree': float(colsample_bytree),
'subsample': float(subsample),
}
self.params['max_bin'] = params['max_bin'] if 'max_bin' in params else (
1 << int(round(log_max_bin))) - 1
if "n_estimators" in self.params:
self.params["n_estimators"] = int(round(self.params["n_estimators"]))
if "num_leaves" in self.params:
self.params["num_leaves"] = int(round(self.params["num_leaves"]))
if "min_child_samples" in self.params:
self.params["min_child_samples"] = int(round(self.params["min_child_samples"]))
if "objective" not in self.params:
self.params["objective"] = objective
if "max_bin" not in self.params:
self.params['max_bin'] = (1 << int(round(log_max_bin))) - 1
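# e.g. log_max_bin=8 gives max_bin=255, LightGBM's default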
if 'regression' in task:
self.estimator_class = LGBMRegressor
else:
@@ -369,7 +363,7 @@ class XGBoostEstimator(SKLearnEstimator):
):
super().__init__(task, **params)
self._n_estimators = int(round(n_estimators))
self.params = {
self.params.update({
'max_leaves': int(round(max_leaves)),
'max_depth': params.get('max_depth', 0),
'grow_policy': params.get("grow_policy", 'lossguide'),
@@ -385,7 +379,7 @@ class XGBoostEstimator(SKLearnEstimator):
'colsample_bylevel': float(colsample_bylevel),
'colsample_bytree': float(colsample_bytree),
'objective': params.get("objective")
}
})
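# dropping 'nthread' lets XGBoost fall back to using all available threads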
if all_thread:
del self.params['nthread']
@@ -445,7 +439,8 @@ class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator):
**params
):
super().__init__(task, **params)
self.params = {
self.params = params
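# keep any extra user-supplied params, then overlay the tuned hyperparameters below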
self.params.update({
"n_estimators": int(round(n_estimators)),
'max_leaves': int(round(max_leaves)),
'max_depth': 0,
@@ -462,7 +457,7 @@ class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator):
'colsample_bylevel': float(colsample_bylevel),
'colsample_bytree': float(colsample_bytree),
'use_label_encoder': params.get('use_label_encoder', False),
}
})
if 'regression' in task:
self.estimator_class = xgb.XGBRegressor
@@ -513,11 +508,12 @@ class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
n_estimators=4, max_features=1.0, criterion='gini', **params
):
super().__init__(task, **params)
self.params = {
self.params = params
self.params.update({
"n_estimators": int(round(n_estimators)),
"n_jobs": n_jobs,
'max_features': float(max_features),
}
})
if 'regression' in task:
self.estimator_class = RandomForestRegressor
else:
@@ -565,13 +561,13 @@ class LRL1Classifier(SKLearnEstimator):
**params
):
super().__init__(task, **params)
self.params = {
self.params.update({
'penalty': params.get("penalty", 'l1'),
'tol': float(tol),
'C': float(C),
'solver': params.get("solver", 'saga'),
'n_jobs': n_jobs,
}
})
if 'regression' in task:
self.estimator_class = None
raise NotImplementedError('LR does not support regression task')
@@ -594,13 +590,13 @@ class LRL2Classifier(SKLearnEstimator):
**params
):
super().__init__(task, **params)
self.params = {
self.params.update({
'penalty': params.get("penalty", 'l2'),
'tol': float(tol),
'C': float(C),
'solver': params.get("solver", 'lbfgs'),
'n_jobs': n_jobs,
}
})
if 'regression' in task:
self.estimator_class = None
raise NotImplementedError('LR does not support regression task')
@@ -648,14 +644,14 @@ class CatBoostEstimator(BaseEstimator):
n_estimators=8192, learning_rate=0.1, early_stopping_rounds=4, **params
):
super().__init__(task, **params)
self.params = {
self.params.update({
"early_stopping_rounds": int(round(early_stopping_rounds)),
"n_estimators": n_estimators,
'learning_rate': learning_rate,
'thread_count': n_jobs,
'verbose': params.get('verbose', False),
'random_seed': params.get("random_seed", 10242048),
}
})
if 'regression' in task:
from catboost import CatBoostRegressor
self.estimator_class = CatBoostRegressor
@@ -759,11 +755,12 @@ class KNeighborsEstimator(BaseEstimator):
self, task='binary:logistic', n_jobs=1, n_neighbors=5, **params
):
super().__init__(task, **params)
self.params = {
self.params = params
self.params.update({
'n_neighbors': int(round(n_neighbors)),
'weights': params.get('weights', 'distance'),
'n_jobs': n_jobs,
}
})
if 'regression' in task:
from sklearn.neighbors import KNeighborsRegressor
self.estimator_class = KNeighborsRegressor

View File

@@ -4,7 +4,15 @@ FLAML includes *ChaCha* which is an automatic hyperparameter tuning solution for
For more technical details about *ChaCha*, please check our paper.
* ChaCha for online AutoML. Qingyun Wu, Chi Wang, John Langford, Paul Mineiro and Marco Rossi. To appear in ICML 2021.
* [ChaCha for Online AutoML](https://arxiv.org/pdf/2106.04815.pdf). Qingyun Wu, Chi Wang, John Langford, Paul Mineiro and Marco Rossi. To appear in ICML 2021.
```
@inproceedings{wu2021chacha,
title={ChaCha for online AutoML},
author={Qingyun Wu and Chi Wang and John Langford and Paul Mineiro and Marco Rossi},
year={2021},
booktitle={ICML},
}
```
## `AutoVW`
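As a minimal sketch of the intended usage (the constructor arguments `max_live_model_num` and `search_space`, the search-space keys, and the `predict`/`learn` methods are assumed here and may differ in detail from the released API; `AutoVW`, `loguniform`, and `polynomial_expansion_set` are the names imported in this commit's tests):

```python
from flaml import AutoVW
from flaml.tune import loguniform, polynomial_expansion_set

# Assumed search space: which VW namespace interactions to expand, plus VW's learning rate.
search_space = {
    'interactions': polynomial_expansion_set(
        init_monomials={'a', 'b', 'c'}, highest_poly_order=2),
    'learning_rate': loguniform(lower=2e-10, upper=1.0),
}
# Assumed constructor: keep at most 5 live VW models while tuning online.
autovw = AutoVW(max_live_model_num=5, search_space=search_space)

data_stream = []  # placeholder: fill with VW-format example strings
for data_sample in data_stream:
    prediction = autovw.predict(data_sample)  # predict before the label is revealed
    autovw.learn(data_sample)                 # then update with the observed example
```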

View File

@@ -48,7 +48,7 @@
"## 2. Online regression with AutoVW\n",
"### Load data from openml and preprocess\n",
"\n",
"Download [dataset_sales](https://www.openml.org/d/42183) from OpenML."
"Download [NewFuelCar](https://www.openml.org/d/41506) from OpenML."
]
},
{
@@ -412,4 +412,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}
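The notebook step above downloads the NewFuelCar dataset (OpenML id 41506). A minimal sketch of fetching it with the openml-python package, assuming its standard dataset API (the feature/target split shown is illustrative):

```python
import openml

# Fetch OpenML dataset 41506 (NewFuelCar) and extract features/target as DataFrames.
dataset = openml.datasets.get_dataset(41506)
X, y, _, _ = dataset.get_data(
    target=dataset.default_target_attribute, dataset_format='dataframe')
print(X.shape, y.shape)
```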

View File

@@ -273,7 +273,6 @@ class TestAutoML(unittest.TestCase):
automl_experiment = AutoML()
automl_settings = {
"time_budget": 2,
"metric": 'mse',
"task": 'regression',
"log_file_name": "test/boston.log",
"log_training_metric": True,

View File

@@ -5,7 +5,6 @@ import scipy.sparse
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
import time
import logging
from flaml.tune import loguniform, polynomial_expansion_set
from vowpalwabbit import pyvw
@@ -13,6 +12,7 @@ from flaml import AutoVW
import string
import os
import openml
VW_DS_DIR = 'test/data/'
NS_LIST = list(string.ascii_lowercase) + list(string.ascii_uppercase)
logger = logging.getLogger(__name__)