mirror of
https://github.com/microsoft/FLAML.git
synced 2026-02-15 05:09:16 +08:00
Compare commits
13 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2ba5f8bed1 | ||
|
|
d0a11958a5 | ||
|
|
0ef9b00a75 | ||
|
|
840f76e5e5 | ||
|
|
d8b7d25b80 | ||
|
|
6d53929803 | ||
|
|
c038fbca07 | ||
|
|
6a99202492 | ||
|
|
42d1dcfa0e | ||
|
|
b83c8a7d3b | ||
|
|
b9194cdcf2 | ||
|
|
9a1f6b0291 | ||
|
|
07f4413aae |
2
.github/workflows/python-package.yml
vendored
2
.github/workflows/python-package.yml
vendored
@@ -85,7 +85,7 @@ jobs:
|
||||
- name: Test with pytest
|
||||
if: matrix.python-version != '3.10'
|
||||
run: |
|
||||
pytest test
|
||||
pytest test/
|
||||
- name: Coverage
|
||||
if: matrix.python-version == '3.10'
|
||||
run: |
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import logging
|
||||
import warnings
|
||||
|
||||
try:
|
||||
from flaml.automl import AutoML, logger_formatter
|
||||
@@ -12,7 +13,8 @@ from flaml.version import __version__
|
||||
|
||||
# Set the root logger.
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.setLevel(logging.INFO)
|
||||
if logger.level == logging.NOTSET:
|
||||
logger.setLevel(logging.INFO)
|
||||
|
||||
if not has_automl:
|
||||
logger.warning("flaml.automl is not available. Please install flaml[automl] to enable AutoML functionalities.")
|
||||
warnings.warn("flaml.automl is not available. Please install flaml[automl] to enable AutoML functionalities.")
|
||||
|
||||
@@ -203,7 +203,7 @@ class AutoML(BaseEstimator):
|
||||
* Valid str options depend on different tasks.
|
||||
For classification tasks, valid choices are
|
||||
["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
|
||||
For regression tasks, valid choices are ["auto", 'uniform', 'time'].
|
||||
For regression tasks, valid choices are ["auto", 'uniform', 'time', 'group'].
|
||||
"auto" -> uniform.
|
||||
For time series forecast tasks, must be "auto" or 'time'.
|
||||
For ranking task, must be "auto" or 'group'.
|
||||
@@ -739,7 +739,7 @@ class AutoML(BaseEstimator):
|
||||
* Valid str options depend on different tasks.
|
||||
For classification tasks, valid choices are
|
||||
["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
|
||||
For regression tasks, valid choices are ["auto", 'uniform', 'time'].
|
||||
For regression tasks, valid choices are ["auto", 'uniform', 'time', 'group'].
|
||||
"auto" -> uniform.
|
||||
For time series forecast tasks, must be "auto" or 'time'.
|
||||
For ranking task, must be "auto" or 'group'.
|
||||
@@ -1358,7 +1358,7 @@ class AutoML(BaseEstimator):
|
||||
* Valid str options depend on different tasks.
|
||||
For classification tasks, valid choices are
|
||||
["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
|
||||
For regression tasks, valid choices are ["auto", 'uniform', 'time'].
|
||||
For regression tasks, valid choices are ["auto", 'uniform', 'time', 'group'].
|
||||
"auto" -> uniform.
|
||||
For time series forecast tasks, must be "auto" or 'time'.
|
||||
For ranking task, must be "auto" or 'group'.
|
||||
|
||||
@@ -9,6 +9,7 @@ import os
|
||||
import shutil
|
||||
import signal
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
import warnings
|
||||
from contextlib import contextmanager
|
||||
@@ -89,21 +90,25 @@ def limit_resource(memory_limit, time_limit):
|
||||
except ValueError:
|
||||
# According to https://bugs.python.org/issue40518, it's a mac-specific error.
|
||||
pass
|
||||
main_thread = False
|
||||
if time_limit is not None:
|
||||
alarm_set = False
|
||||
if time_limit is not None and threading.current_thread() is threading.main_thread():
|
||||
try:
|
||||
signal.signal(signal.SIGALRM, TimeoutHandler)
|
||||
signal.alarm(int(time_limit) or 1)
|
||||
main_thread = True
|
||||
alarm_set = True
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
if main_thread:
|
||||
if alarm_set:
|
||||
signal.alarm(0)
|
||||
if memory_limit > 0:
|
||||
resource.setrlimit(resource.RLIMIT_AS, (soft, hard))
|
||||
try:
|
||||
resource.setrlimit(resource.RLIMIT_AS, (soft, hard))
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
|
||||
class BaseEstimator:
|
||||
@@ -130,7 +135,7 @@ class BaseEstimator:
|
||||
self._task = task if isinstance(task, Task) else task_factory(task, None, None)
|
||||
self.params = self.config2params(config)
|
||||
self.estimator_class = self._model = None
|
||||
if "_estimator_type" in config:
|
||||
if "_estimator_type" in self.params:
|
||||
self._estimator_type = self.params.pop("_estimator_type")
|
||||
else:
|
||||
self._estimator_type = "classifier" if self._task.is_classification() else "regressor"
|
||||
@@ -1691,7 +1696,7 @@ class XGBoostEstimator(SKLearnEstimator):
|
||||
# use_label_encoder is deprecated in 1.7.
|
||||
if xgboost_version < "1.7.0":
|
||||
params["use_label_encoder"] = params.get("use_label_encoder", False)
|
||||
if "n_jobs" in config:
|
||||
if "n_jobs" in params:
|
||||
params["nthread"] = params.pop("n_jobs")
|
||||
return params
|
||||
|
||||
@@ -1891,7 +1896,7 @@ class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
|
||||
params = super().config2params(config)
|
||||
if "max_leaves" in params:
|
||||
params["max_leaf_nodes"] = params.get("max_leaf_nodes", params.pop("max_leaves"))
|
||||
if not self._task.is_classification() and "criterion" in config:
|
||||
if not self._task.is_classification() and "criterion" in params:
|
||||
params.pop("criterion")
|
||||
if "random_state" not in params:
|
||||
params["random_state"] = 12032022
|
||||
@@ -2066,8 +2071,8 @@ class CatBoostEstimator(BaseEstimator):
|
||||
self.estimator_class = CatBoostRegressor
|
||||
|
||||
def fit(self, X_train, y_train, budget=None, free_mem_ratio=0, **kwargs):
|
||||
if "is_retrain" in kwargs:
|
||||
kwargs.pop("is_retrain")
|
||||
kwargs.pop("is_retrain", None)
|
||||
kwargs.pop("groups", None)
|
||||
start_time = time.time()
|
||||
deadline = start_time + budget if budget else np.inf
|
||||
train_dir = f"catboost_{str(start_time)}"
|
||||
@@ -2344,7 +2349,7 @@ class SGDEstimator(SKLearnEstimator):
|
||||
params["loss"] = params.get("loss", None)
|
||||
if params["loss"] is None and self._task.is_classification():
|
||||
params["loss"] = "log_loss" if SKLEARN_VERSION >= "1.1" else "log"
|
||||
if not self._task.is_classification():
|
||||
if not self._task.is_classification() and "n_jobs" in params:
|
||||
params.pop("n_jobs")
|
||||
|
||||
if params.get("penalty") != "elasticnet":
|
||||
|
||||
@@ -442,8 +442,8 @@ class GenericTask(Task):
|
||||
X_train_all, y_train_all = shuffle(X_train_all, y_train_all, random_state=RANDOM_SEED)
|
||||
if data_is_df:
|
||||
X_train_all.reset_index(drop=True, inplace=True)
|
||||
if isinstance(y_train_all, pd.Series):
|
||||
y_train_all.reset_index(drop=True, inplace=True)
|
||||
if isinstance(y_train_all, pd.Series):
|
||||
y_train_all.reset_index(drop=True, inplace=True)
|
||||
|
||||
X_train, y_train = X_train_all, y_train_all
|
||||
state.groups_all = state.groups
|
||||
@@ -769,10 +769,10 @@ class GenericTask(Task):
|
||||
if not is_spark_dataframe:
|
||||
y_train, y_val = y_train_split[train_index], y_train_split[val_index]
|
||||
if weight is not None:
|
||||
fit_kwargs["sample_weight"], weight_val = (
|
||||
weight[train_index],
|
||||
weight[val_index],
|
||||
fit_kwargs["sample_weight"] = (
|
||||
weight[train_index] if isinstance(weight, np.ndarray) else weight.iloc[train_index]
|
||||
)
|
||||
weight_val = weight[val_index] if isinstance(weight, np.ndarray) else weight.iloc[val_index]
|
||||
if groups is not None:
|
||||
fit_kwargs["groups"] = (
|
||||
groups[train_index] if isinstance(groups, np.ndarray) else groups.iloc[train_index]
|
||||
|
||||
@@ -192,7 +192,7 @@ class Task(ABC):
|
||||
* Valid str options depend on different tasks.
|
||||
For classification tasks, valid choices are
|
||||
["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
|
||||
For regression tasks, valid choices are ["auto", 'uniform', 'time'].
|
||||
For regression tasks, valid choices are ["auto", 'uniform', 'time', 'group'].
|
||||
"auto" -> uniform.
|
||||
For time series forecast tasks, must be "auto" or 'time'.
|
||||
For ranking task, must be "auto" or 'group'.
|
||||
|
||||
@@ -197,9 +197,16 @@ def report(_metric=None, **kwargs):
|
||||
global _training_iteration
|
||||
if _use_ray:
|
||||
try:
|
||||
from ray import tune
|
||||
from ray import __version__ as ray_version
|
||||
|
||||
return tune.report(_metric, **kwargs)
|
||||
if ray_version.startswith("1."):
|
||||
from ray import tune
|
||||
|
||||
return tune.report(_metric, **kwargs)
|
||||
else: # ray>=2
|
||||
from ray.air import session
|
||||
|
||||
return session.report(metrics={"metric": _metric, **kwargs})
|
||||
except ImportError:
|
||||
# calling tune.report() outside tune.run()
|
||||
return
|
||||
@@ -260,6 +267,8 @@ def run(
|
||||
mlflow_exp_name: Optional[str] = None,
|
||||
automl_info: Optional[Tuple[float]] = None,
|
||||
extra_tag: Optional[dict] = None,
|
||||
cost_attr: Optional[str] = "auto",
|
||||
cost_budget: Optional[float] = None,
|
||||
**ray_args,
|
||||
):
|
||||
"""The function-based way of performing HPO.
|
||||
@@ -462,6 +471,12 @@ def run(
|
||||
overwritten by the value of `n_concurrent_trials` in AutoML. When <= 0, the concurrent trials
|
||||
will be set to the number of executors.
|
||||
extra_tag: dict, default=None | Extra tags to be added to the mlflow runs created by autologging.
|
||||
cost_attr: None or str to specify the attribute to evaluate the cost of different trials.
|
||||
Default is "auto", which means that we will automatically choose the cost attribute to use (depending
|
||||
on the nature of the resource budget). When cost_attr is set to None, cost differences between different trials will be omitted
|
||||
in our search algorithm. When cost_attr is set to a str different from "auto" and "time_total_s",
|
||||
this cost_attr must be available in the result dict of the trial.
|
||||
cost_budget: A float of the cost budget. Only valid when cost_attr is a str different from "auto" and "time_total_s".
|
||||
**ray_args: keyword arguments to pass to ray.tune.run().
|
||||
Only valid when use_ray=True.
|
||||
"""
|
||||
@@ -600,6 +615,8 @@ def run(
|
||||
metric_constraints=metric_constraints,
|
||||
use_incumbent_result_in_evaluation=use_incumbent_result_in_evaluation,
|
||||
lexico_objectives=lexico_objectives,
|
||||
cost_attr=cost_attr,
|
||||
cost_budget=cost_budget,
|
||||
)
|
||||
else:
|
||||
if metric is None or mode is None:
|
||||
|
||||
@@ -1 +1 @@
|
||||
__version__ = "2.3.2"
|
||||
__version__ = "2.3.4"
|
||||
|
||||
@@ -143,4 +143,5 @@ def test_prep():
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_lrl2()
|
||||
test_prep()
|
||||
@@ -1,4 +1,6 @@
|
||||
from sklearn.datasets import fetch_openml
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.datasets import fetch_openml, load_iris
|
||||
from sklearn.metrics import accuracy_score
|
||||
from sklearn.model_selection import GroupKFold, KFold, train_test_split
|
||||
|
||||
@@ -48,7 +50,7 @@ def test_time():
|
||||
_test(split_type="time")
|
||||
|
||||
|
||||
def test_groups():
|
||||
def test_groups_for_classification_task():
|
||||
from sklearn.externals._arff import ArffException
|
||||
|
||||
try:
|
||||
@@ -58,8 +60,6 @@ def test_groups():
|
||||
|
||||
X, y = load_wine(return_X_y=True)
|
||||
|
||||
import numpy as np
|
||||
|
||||
automl = AutoML()
|
||||
automl_settings = {
|
||||
"time_budget": 2,
|
||||
@@ -68,7 +68,7 @@ def test_groups():
|
||||
"model_history": True,
|
||||
"eval_method": "cv",
|
||||
"groups": np.random.randint(low=0, high=10, size=len(y)),
|
||||
"estimator_list": ["lgbm", "rf", "xgboost", "kneighbor"],
|
||||
"estimator_list": ["catboost", "lgbm", "rf", "xgboost", "kneighbor"],
|
||||
"learner_selector": "roundrobin",
|
||||
}
|
||||
automl.fit(X, y, **automl_settings)
|
||||
@@ -88,6 +88,72 @@ def test_groups():
|
||||
automl.fit(X, y, **automl_settings)
|
||||
|
||||
|
||||
def test_groups_for_regression_task():
|
||||
"""Append nonsensical groups to iris dataset and use it to test that GroupKFold works for regression tasks"""
|
||||
iris_dict_data = load_iris(as_frame=True) # numpy arrays
|
||||
iris_data = iris_dict_data["frame"] # pandas dataframe data + target
|
||||
|
||||
rng = np.random.default_rng(42)
|
||||
iris_data["cluster"] = rng.integers(
|
||||
low=0, high=5, size=iris_data.shape[0]
|
||||
) # np.random.randint(0, 5, iris_data.shape[0])
|
||||
|
||||
automl = AutoML()
|
||||
X = iris_data[["sepal length (cm)", "sepal width (cm)", "petal length (cm)"]].to_numpy()
|
||||
y = iris_data["petal width (cm)"]
|
||||
X_train, X_test, y_train, y_test, groups_train, groups_test = train_test_split(
|
||||
X, y, iris_data["cluster"], random_state=42
|
||||
)
|
||||
automl_settings = {
|
||||
"max_iter": 5,
|
||||
"time_budget": -1,
|
||||
"metric": "r2",
|
||||
"task": "regression",
|
||||
"estimator_list": ["lgbm", "rf", "xgboost", "kneighbor"],
|
||||
"eval_method": "cv",
|
||||
"split_type": "uniform",
|
||||
"groups": groups_train,
|
||||
}
|
||||
automl.fit(X_train, y_train, **automl_settings)
|
||||
|
||||
|
||||
def test_groups_with_sample_weights():
|
||||
"""Verifies that sample weights can be used with group splits i.e. that https://github.com/microsoft/FLAML/issues/1396 remains fixed"""
|
||||
iris_dict_data = load_iris(as_frame=True) # numpy arrays
|
||||
iris_data = iris_dict_data["frame"] # pandas dataframe data + target
|
||||
iris_data["cluster"] = np.random.randint(0, 5, iris_data.shape[0])
|
||||
automl = AutoML()
|
||||
|
||||
X = iris_data[["sepal length (cm)", "sepal width (cm)", "petal length (cm)"]].to_numpy()
|
||||
y = iris_data["petal width (cm)"]
|
||||
sample_weight = pd.Series(np.random.rand(X.shape[0]))
|
||||
(
|
||||
X_train,
|
||||
X_test,
|
||||
y_train,
|
||||
y_test,
|
||||
groups_train,
|
||||
groups_test,
|
||||
sample_weight_train,
|
||||
sample_weight_test,
|
||||
) = train_test_split(X, y, iris_data["cluster"], sample_weight, random_state=42)
|
||||
automl_settings = {
|
||||
"max_iter": 5,
|
||||
"time_budget": -1,
|
||||
"metric": "r2",
|
||||
"task": "regression",
|
||||
"log_file_name": "error.log",
|
||||
"log_type": "all",
|
||||
"estimator_list": ["lgbm"],
|
||||
"eval_method": "cv",
|
||||
"split_type": "group",
|
||||
"groups": groups_train,
|
||||
"sample_weight": sample_weight_train,
|
||||
}
|
||||
automl.fit(X_train, y_train, **automl_settings)
|
||||
assert automl.model is not None
|
||||
|
||||
|
||||
def test_stratified_groupkfold():
|
||||
from minio.error import ServerError
|
||||
from sklearn.model_selection import StratifiedGroupKFold
|
||||
@@ -108,6 +174,7 @@ def test_stratified_groupkfold():
|
||||
"split_type": splitter,
|
||||
"groups": X_train["Airline"],
|
||||
"estimator_list": [
|
||||
"catboost",
|
||||
"lgbm",
|
||||
"rf",
|
||||
"xgboost",
|
||||
@@ -203,4 +270,4 @@ def test_object():
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_groups()
|
||||
test_groups_for_classification_task()
|
||||
|
||||
@@ -59,6 +59,17 @@ def _test_hf_data():
|
||||
except requests.exceptions.ConnectionError:
|
||||
return
|
||||
|
||||
# Tests will only run if there is a GPU available
|
||||
try:
|
||||
import ray
|
||||
|
||||
pg = ray.util.placement_group([{"CPU": 1, "GPU": 1}])
|
||||
|
||||
if not pg.wait(timeout_seconds=10): # Wait 10 seconds for resources
|
||||
raise RuntimeError("No available node types can fulfill resource request!")
|
||||
except RuntimeError:
|
||||
return
|
||||
|
||||
custom_sent_keys = ["sentence1", "sentence2"]
|
||||
label_key = "label"
|
||||
|
||||
|
||||
@@ -3371,9 +3371,9 @@ cross-fetch@^3.1.5:
|
||||
node-fetch "2.6.7"
|
||||
|
||||
cross-spawn@^7.0.3:
|
||||
version "7.0.3"
|
||||
resolved "https://registry.npmmirror.com/cross-spawn/-/cross-spawn-7.0.3.tgz#f73a85b9d5d41d045551c177e2882d4ac85728a6"
|
||||
integrity sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w==
|
||||
version "7.0.6"
|
||||
resolved "https://registry.yarnpkg.com/cross-spawn/-/cross-spawn-7.0.6.tgz#8a58fe78f00dcd70c370451759dfbfaf03e8ee9f"
|
||||
integrity sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==
|
||||
dependencies:
|
||||
path-key "^3.1.0"
|
||||
shebang-command "^2.0.0"
|
||||
@@ -5709,9 +5709,9 @@ multicast-dns@^7.2.5:
|
||||
thunky "^1.0.2"
|
||||
|
||||
nanoid@^3.3.6:
|
||||
version "3.3.6"
|
||||
resolved "https://registry.yarnpkg.com/nanoid/-/nanoid-3.3.6.tgz#443380c856d6e9f9824267d960b4236ad583ea4c"
|
||||
integrity sha512-BGcqMMJuToF7i1rt+2PWSNVnWIkGCU78jBG3RxO/bZlnZPK2Cmi2QaffxGO/2RvWi9sL+FAiRiXMgsyxQ1DIDA==
|
||||
version "3.3.8"
|
||||
resolved "https://registry.yarnpkg.com/nanoid/-/nanoid-3.3.8.tgz#b1be3030bee36aaff18bacb375e5cce521684baf"
|
||||
integrity sha512-WNLf5Sd8oZxOm+TzppcYk8gVOgP+l58xNy58D0nbUnOxOWRWvlcCV4kUF7ltmI6PsrLl/BgKEyS4mqsGChFN0w==
|
||||
|
||||
negotiator@0.6.3:
|
||||
version "0.6.3"
|
||||
@@ -7272,14 +7272,7 @@ send@0.19.0:
|
||||
range-parser "~1.2.1"
|
||||
statuses "2.0.1"
|
||||
|
||||
serialize-javascript@^6.0.0:
|
||||
version "6.0.0"
|
||||
resolved "https://registry.npmmirror.com/serialize-javascript/-/serialize-javascript-6.0.0.tgz#efae5d88f45d7924141da8b5c3a7a7e663fefeb8"
|
||||
integrity sha512-Qr3TosvguFt8ePWqsvRfrKyQXIiW+nGbYpy8XK24NQHE83caxWt+mIymTT19DGFbNWNLfEwsrkSmN64lVWB9ag==
|
||||
dependencies:
|
||||
randombytes "^2.1.0"
|
||||
|
||||
serialize-javascript@^6.0.1:
|
||||
serialize-javascript@^6.0.0, serialize-javascript@^6.0.1:
|
||||
version "6.0.2"
|
||||
resolved "https://registry.yarnpkg.com/serialize-javascript/-/serialize-javascript-6.0.2.tgz#defa1e055c83bf6d59ea805d8da862254eb6a6c2"
|
||||
integrity sha512-Saa1xPByTTq2gdeFZYLLo+RFE35NHZkAbqZeWNd3BpzppeVisAqpDjcp8dyf6uIvEqJRd46jemmyA4iFIeVk8g==
|
||||
|
||||
Reference in New Issue
Block a user