API doc for ChaCha (#105)

* API doc for ChaCha

* Update params

* Link to paper

* Update dataset ID

Co-authored-by: Chi Wang (MSR) <chiw@microsoft.com>
Co-authored-by: Qingyun Wu <qiw@microsoft.com>
This commit is contained in:
Chi Wang
2021-06-11 10:25:45 -07:00
committed by GitHub
parent a4049ad9b6
commit c26720c299
7 changed files with 56 additions and 45 deletions

View File

@@ -87,7 +87,7 @@ automl_settings = {
X_train, y_train = load_iris(return_X_y=True)
# Train with labeled input data
automl.fit(X_train=X_train, y_train=y_train,
**automl_settings)
**automl_settings)
# Predict
print(automl.predict_proba(X_train))
# Export the best model
@@ -111,7 +111,7 @@ automl_settings = {
X_train, y_train = load_boston(return_X_y=True)
# Train with labeled input data
automl.fit(X_train=X_train, y_train=y_train,
**automl_settings)
**automl_settings)
# Predict
print(automl.predict(X_train))
# Export the best model
@@ -140,7 +140,7 @@ For more technical details, please check our papers.
```
* [Frugal Optimization for Cost-related Hyperparameters](https://arxiv.org/abs/2005.01571). Qingyun Wu, Chi Wang, Silu Huang. AAAI 2021.
* [Economical Hyperparameter Optimization With Blended Search Strategy](https://www.microsoft.com/en-us/research/publication/economical-hyperparameter-optimization-with-blended-search-strategy/). Chi Wang, Qingyun Wu, Silu Huang, Amin Saied. ICLR 2021.
* ChaCha for online AutoML. Qingyun Wu, Chi Wang, John Langford, Paul Mineiro and Marco Rossi. To appear in ICML 2021.
* [ChaCha for Online AutoML](https://arxiv.org/pdf/2106.04815.pdf). Qingyun Wu, Chi Wang, John Langford, Paul Mineiro and Marco Rossi. To appear in ICML 2021.
## Contributing
@@ -180,7 +180,7 @@ If all the tests are passed, please also test run notebook/flaml_automl to make
* Chi Wang
* Qingyun Wu
Contributors (alphabetical order): Sebastien Bubeck, Surajit Chaudhuri, Nadiia Chepurko, Ofer Dekel, Alex Deng, Anshuman Dutt, Nicolo Fusi, Jianfeng Gao, Johannes Gehrke, Silu Huang, Dongwoo Kim, Christian Konig, John Langford, Amin Saied, Neil Tenenholtz, Markus Weimer, Haozhe Zhang, Erkang Zhu.
Contributors (alphabetical order): Sebastien Bubeck, Surajit Chaudhuri, Nadiia Chepurko, Ofer Dekel, Alex Deng, Anshuman Dutt, Nicolo Fusi, Jianfeng Gao, Johannes Gehrke, Silu Huang, Dongwoo Kim, Christian Konig, John Langford, Xueqing Liu, Paul Mineiro, Amin Saied, Neil Tenenholtz, Olga Vrousgou, Markus Weimer, Haozhe Zhang, Erkang Zhu.
## License

View File

@@ -22,7 +22,7 @@ AutoML
Tune
------
----
.. autofunction:: flaml.tune.run
@@ -38,8 +38,15 @@ Tune
:members:
Online AutoML
-------------
.. autoclass:: flaml.AutoVW
:members:
NLP
------
---
.. autoclass:: flaml.nlp.AutoTransformers
:members:

View File

@@ -171,6 +171,9 @@ class BaseEstimator:
class SKLearnEstimator(BaseEstimator):
def __init__(self, task='binary:logistic', **params):
super().__init__(task, **params)
def _preprocess(self, X):
if isinstance(X, pd.DataFrame):
X = X.copy()
@@ -231,12 +234,7 @@ class LGBMEstimator(BaseEstimator):
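# Rough size estimate: ~3 stored values per leaf plus ~4 per internal split
# (num_leaves - 1 of them), 8 bytes each, summed over n_estimators trees.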
n_estimators = int(round(config['n_estimators']))
return (num_leaves * 3 + (num_leaves - 1) * 4 + 1.0) * n_estimators * 8
def __init__(
self, task='binary:logistic', n_jobs=1,
n_estimators=2, num_leaves=2, min_child_samples=20, learning_rate=0.1,
subsample=1.0, reg_lambda=1.0, reg_alpha=0.0,
colsample_bytree=1.0, log_max_bin=8, **params
):
def __init__(self, task='binary:logistic', log_max_bin=8, **params):
super().__init__(task, **params)
# Default: regression for LGBMRegressor,
# binary or multiclass for LGBMClassifier
@@ -248,20 +246,16 @@ class LGBMEstimator(BaseEstimator):
objective = 'multiclass'
else:
objective = 'regression'
self.params = {
"n_estimators": int(round(n_estimators)),
"num_leaves": int(round(num_leaves)),
'objective': params.get("objective", objective),
'n_jobs': n_jobs,
'learning_rate': float(learning_rate),
'reg_alpha': float(reg_alpha),
'reg_lambda': float(reg_lambda),
'min_child_samples': int(round(min_child_samples)),
'colsample_bytree': float(colsample_bytree),
'subsample': float(subsample),
}
self.params['max_bin'] = params['max_bin'] if 'max_bin' in params else (
1 << int(round(log_max_bin))) - 1
if "n_estimators" in self.params:
self.params["n_estimators"] = int(round(self.params["n_estimators"]))
if "num_leaves" in self.params:
self.params["num_leaves"] = int(round(self.params["num_leaves"]))
if "min_child_samples" in self.params:
self.params["min_child_samples"] = int(round(self.params["min_child_samples"]))
if "objective" not in self.params:
self.params["objective"] = objective
if "max_bin" not in self.params:
self.params['max_bin'] = (1 << int(round(log_max_bin))) - 1
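# e.g. log_max_bin=8 gives max_bin=255, LightGBM's default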
if 'regression' in task:
self.estimator_class = LGBMRegressor
else:
@@ -369,7 +363,7 @@ class XGBoostEstimator(SKLearnEstimator):
):
super().__init__(task, **params)
self._n_estimators = int(round(n_estimators))
self.params = {
self.params.update({
'max_leaves': int(round(max_leaves)),
'max_depth': params.get('max_depth', 0),
'grow_policy': params.get("grow_policy", 'lossguide'),
@@ -385,7 +379,7 @@ class XGBoostEstimator(SKLearnEstimator):
'colsample_bylevel': float(colsample_bylevel),
'colsample_bytree': float(colsample_bytree),
'objective': params.get("objective")
}
})
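# dropping 'nthread' lets XGBoost fall back to using all available threads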
if all_thread:
del self.params['nthread']
@@ -445,7 +439,8 @@ class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator):
**params
):
super().__init__(task, **params)
self.params = {
self.params = params
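# keep any extra user-supplied params, then overlay the tuned hyperparameters below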
self.params.update({
"n_estimators": int(round(n_estimators)),
'max_leaves': int(round(max_leaves)),
'max_depth': 0,
@@ -462,7 +457,7 @@ class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator):
'colsample_bylevel': float(colsample_bylevel),
'colsample_bytree': float(colsample_bytree),
'use_label_encoder': params.get('use_label_encoder', False),
}
})
if 'regression' in task:
self.estimator_class = xgb.XGBRegressor
@@ -513,11 +508,12 @@ class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
n_estimators=4, max_features=1.0, criterion='gini', **params
):
super().__init__(task, **params)
self.params = {
self.params = params
self.params.update({
"n_estimators": int(round(n_estimators)),
"n_jobs": n_jobs,
'max_features': float(max_features),
}
})
if 'regression' in task:
self.estimator_class = RandomForestRegressor
else:
@@ -565,13 +561,13 @@ class LRL1Classifier(SKLearnEstimator):
**params
):
super().__init__(task, **params)
self.params = {
self.params.update({
'penalty': params.get("penalty", 'l1'),
'tol': float(tol),
'C': float(C),
'solver': params.get("solver", 'saga'),
'n_jobs': n_jobs,
}
})
if 'regression' in task:
self.estimator_class = None
raise NotImplementedError('LR does not support regression task')
@@ -594,13 +590,13 @@ class LRL2Classifier(SKLearnEstimator):
**params
):
super().__init__(task, **params)
self.params = {
self.params.update({
'penalty': params.get("penalty", 'l2'),
'tol': float(tol),
'C': float(C),
'solver': params.get("solver", 'lbfgs'),
'n_jobs': n_jobs,
}
})
if 'regression' in task:
self.estimator_class = None
raise NotImplementedError('LR does not support regression task')
@@ -648,14 +644,14 @@ class CatBoostEstimator(BaseEstimator):
n_estimators=8192, learning_rate=0.1, early_stopping_rounds=4, **params
):
super().__init__(task, **params)
self.params = {
self.params.update({
"early_stopping_rounds": int(round(early_stopping_rounds)),
"n_estimators": n_estimators,
'learning_rate': learning_rate,
'thread_count': n_jobs,
'verbose': params.get('verbose', False),
'random_seed': params.get("random_seed", 10242048),
}
})
if 'regression' in task:
from catboost import CatBoostRegressor
self.estimator_class = CatBoostRegressor
@@ -759,11 +755,12 @@ class KNeighborsEstimator(BaseEstimator):
self, task='binary:logistic', n_jobs=1, n_neighbors=5, **params
):
super().__init__(task, **params)
self.params = {
self.params = params
self.params.update({
'n_neighbors': int(round(n_neighbors)),
'weights': params.get('weights', 'distance'),
'n_jobs': n_jobs,
}
})
if 'regression' in task:
from sklearn.neighbors import KNeighborsRegressor
self.estimator_class = KNeighborsRegressor

View File

@@ -4,7 +4,15 @@ FLAML includes *ChaCha* which is an automatic hyperparameter tuning solution for
For more technical details about *ChaCha*, please check our paper.
* ChaCha for online AutoML. Qingyun Wu, Chi Wang, John Langford, Paul Mineiro and Marco Rossi. To appear in ICML 2021.
* [ChaCha for Online AutoML](https://arxiv.org/pdf/2106.04815.pdf). Qingyun Wu, Chi Wang, John Langford, Paul Mineiro and Marco Rossi. To appear in ICML 2021.
```
@inproceedings{wu2021chacha,
title={ChaCha for online AutoML},
author={Qingyun Wu and Chi Wang and John Langford and Paul Mineiro and Marco Rossi},
year={2021},
booktitle={ICML},
}
```
## `AutoVW`
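As a minimal sketch of the intended usage (the constructor arguments `max_live_model_num` and `search_space`, the search-space keys, and the `predict`/`learn` methods are assumed here and may differ in detail from the released API; `AutoVW`, `loguniform`, and `polynomial_expansion_set` are the names imported in this commit's tests):

```python
from flaml import AutoVW
from flaml.tune import loguniform, polynomial_expansion_set

# Assumed search space: which VW namespace interactions to expand, plus VW's learning rate.
search_space = {
    'interactions': polynomial_expansion_set(
        init_monomials={'a', 'b', 'c'}, highest_poly_order=2),
    'learning_rate': loguniform(lower=2e-10, upper=1.0),
}
# Assumed constructor: keep at most 5 live VW models while tuning online.
autovw = AutoVW(max_live_model_num=5, search_space=search_space)

data_stream = []  # placeholder: fill with VW-format example strings
for data_sample in data_stream:
    prediction = autovw.predict(data_sample)  # predict before the label is revealed
    autovw.learn(data_sample)                 # then update with the observed example
```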

View File

@@ -48,7 +48,7 @@
"## 2. Online regression with AutoVW\n",
"### Load data from openml and preprocess\n",
"\n",
"Download [dataset_sales](https://www.openml.org/d/42183) from OpenML."
"Download [NewFuelCar](https://www.openml.org/d/41506) from OpenML."
]
},
{
@@ -412,4 +412,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}
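The notebook step above downloads the NewFuelCar dataset (OpenML id 41506). A minimal sketch of fetching it with the openml-python package, assuming its standard dataset API (the feature/target split shown is illustrative):

```python
import openml

# Fetch OpenML dataset 41506 (NewFuelCar) and extract features/target as DataFrames.
dataset = openml.datasets.get_dataset(41506)
X, y, _, _ = dataset.get_data(
    target=dataset.default_target_attribute, dataset_format='dataframe')
print(X.shape, y.shape)
```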

View File

@@ -273,7 +273,6 @@ class TestAutoML(unittest.TestCase):
automl_experiment = AutoML()
automl_settings = {
"time_budget": 2,
"metric": 'mse',
"task": 'regression',
"log_file_name": "test/boston.log",
"log_training_metric": True,

View File

@@ -5,7 +5,6 @@ import scipy.sparse
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
import time
import logging
from flaml.tune import loguniform, polynomial_expansion_set
from vowpalwabbit import pyvw
@@ -13,6 +12,7 @@ from flaml import AutoVW
import string
import os
import openml
VW_DS_DIR = 'test/data/'
NS_LIST = list(string.ascii_lowercase) + list(string.ascii_uppercase)
logger = logging.getLogger(__name__)