mirror of
https://github.com/microsoft/FLAML.git
synced 2026-02-16 05:32:24 +08:00
Compare commits
12 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5a74227bc3 | ||
|
|
7644958e21 | ||
|
|
a316f84fe1 | ||
|
|
72881d3a2b | ||
|
|
69da685d1e | ||
|
|
c01c3910eb | ||
|
|
98d3fd2f48 | ||
|
|
9724c626cc | ||
|
|
0d92400200 | ||
|
|
d224218ecf | ||
|
|
a2a5e1abb9 | ||
|
|
5c0f18b7bc |
@@ -293,7 +293,7 @@ class DataTransformer:
|
||||
y = y.rename(TS_VALUE_COL)
|
||||
for column in X.columns:
|
||||
# sklearn\utils\validation.py needs int/float values
|
||||
if X[column].dtype.name in ("object", "category"):
|
||||
if X[column].dtype.name in ("object", "category", "string"):
|
||||
if X[column].nunique() == 1 or X[column].nunique(dropna=True) == n - X[column].isnull().sum():
|
||||
X.drop(columns=column, inplace=True)
|
||||
drop = True
|
||||
|
||||
@@ -157,7 +157,25 @@ class BaseEstimator:
|
||||
|
||||
@property
|
||||
def estimator(self):
|
||||
"""Trained model after fit() is called, or None before fit() is called."""
|
||||
"""
|
||||
Get the best trained estimator model.
|
||||
|
||||
Returns:
|
||||
object or None: The trained model obtained after calling the `fit()` method,
|
||||
representing the best estimator found during the training process. If `fit()` has
|
||||
not been called yet, it returns `None`.
|
||||
|
||||
Examples:
|
||||
>>> from flaml import AutoML
|
||||
>>> automl = AutoML()
|
||||
>>> automl.fit(X_train, y_train)
|
||||
>>> best_estimator = automl.model.estimator
|
||||
>>> print(best_estimator)
|
||||
RandomForestClassifier()
|
||||
|
||||
Note:
|
||||
To access the best estimator, use `automl.model.estimator`.
|
||||
"""
|
||||
return self._model
|
||||
|
||||
@property
|
||||
@@ -249,9 +267,11 @@ class BaseEstimator:
|
||||
mem = psutil.virtual_memory() if psutil is not None else None
|
||||
try:
|
||||
with limit_resource(
|
||||
mem.available * (1 - free_mem_ratio) + psutil.Process(os.getpid()).memory_info().rss
|
||||
if mem is not None
|
||||
else -1,
|
||||
(
|
||||
mem.available * (1 - free_mem_ratio) + psutil.Process(os.getpid()).memory_info().rss
|
||||
if mem is not None
|
||||
else -1
|
||||
),
|
||||
budget,
|
||||
):
|
||||
train_time = self._fit(X_train, y_train, **kwargs)
|
||||
@@ -1534,12 +1554,16 @@ class LGBMEstimator(BaseEstimator):
|
||||
if n_iter > 1:
|
||||
max_iter = min(
|
||||
n_iter,
|
||||
int((budget - time.time() + start_time - self._t1) / self._time_per_iter + 1)
|
||||
if budget is not None
|
||||
else n_iter,
|
||||
int((1 - free_mem_ratio) * mem0 / self._mem_per_iter)
|
||||
if psutil is not None and self._mem_per_iter > 0
|
||||
else n_iter,
|
||||
(
|
||||
int((budget - time.time() + start_time - self._t1) / self._time_per_iter + 1)
|
||||
if budget is not None
|
||||
else n_iter
|
||||
),
|
||||
(
|
||||
int((1 - free_mem_ratio) * mem0 / self._mem_per_iter)
|
||||
if psutil is not None and self._mem_per_iter > 0
|
||||
else n_iter
|
||||
),
|
||||
)
|
||||
if trained and max_iter <= self.params[self.ITER_HP]:
|
||||
return time.time() - start_time
|
||||
@@ -1561,18 +1585,17 @@ class LGBMEstimator(BaseEstimator):
|
||||
callbacks = None
|
||||
if callbacks is None:
|
||||
self._fit(X_train, y_train, **kwargs)
|
||||
else:
|
||||
self._fit(X_train, y_train, callbacks=callbacks, **kwargs)
|
||||
if callbacks is None:
|
||||
# for xgboost>=1.6.0, pop callbacks to enable pickle
|
||||
callbacks = self.params.pop("callbacks")
|
||||
self._model.set_params(callbacks=callbacks[:-1])
|
||||
else:
|
||||
self._fit(X_train, y_train, callbacks=callbacks, **kwargs)
|
||||
best_iteration = (
|
||||
getattr(self._model.get_booster(), "best_iteration", None)
|
||||
if isinstance(self, XGBoostSklearnEstimator)
|
||||
else self._model.best_iteration_
|
||||
)
|
||||
if best_iteration is not None:
|
||||
if best_iteration is not None and best_iteration > 0:
|
||||
self._model.set_params(n_estimators=best_iteration + 1)
|
||||
else:
|
||||
self._fit(X_train, y_train, **kwargs)
|
||||
@@ -2090,7 +2113,8 @@ class CatBoostEstimator(BaseEstimator):
|
||||
if weight is not None:
|
||||
kwargs["sample_weight"] = weight
|
||||
self._model = model
|
||||
self.params[self.ITER_HP] = self._model.tree_count_
|
||||
# Commented-out line below incorrectly assigned n_estimators - see https://github.com/microsoft/FLAML/pull/1364
|
||||
# self.params[self.ITER_HP] = self._model.tree_count_
|
||||
train_time = time.time() - start_time
|
||||
return train_time
|
||||
|
||||
@@ -2184,6 +2208,11 @@ class SVCEstimator(SKLearnEstimator):
|
||||
|
||||
def __init__(self, task="binary", **config):
|
||||
super().__init__(task, **config)
|
||||
self.params.update(
|
||||
{
|
||||
"random_state": config.get("random_seed", 10242048),
|
||||
}
|
||||
)
|
||||
assert self._task.is_classification(), "LinearSVC for classification task only"
|
||||
self.estimator_class = LinearSVC
|
||||
|
||||
@@ -2428,6 +2457,11 @@ class ElasticNetEstimator(SKLearnEstimator):
|
||||
|
||||
def __init__(self, task="regression", **config):
|
||||
super().__init__(task, **config)
|
||||
self.params.update(
|
||||
{
|
||||
"random_state": config.get("random_seed", 10242048),
|
||||
}
|
||||
)
|
||||
assert self._task.is_regression(), "ElasticNet for regression task only"
|
||||
self.estimator_class = ElasticNet
|
||||
|
||||
@@ -2752,7 +2786,7 @@ class BaseResourceLimit:
|
||||
def check_resource_limits(self, current_time, current_iteration, mllib):
|
||||
if (mllib == "xgb" and current_iteration == 0) or (mllib == "cat" and current_iteration == 1):
|
||||
self._time_per_iter = current_time - self.start_time
|
||||
if current_time + self._time_per_iter > self.deadline:
|
||||
if mllib != "cat" and current_time + self._time_per_iter > self.deadline:
|
||||
return False
|
||||
if psutil is not None and self.free_mem_ratio is not None:
|
||||
mem = psutil.virtual_memory()
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import json
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
@@ -9,7 +10,7 @@ from pyspark.ml.evaluation import (
|
||||
RegressionEvaluator,
|
||||
)
|
||||
|
||||
from flaml.automl.spark import F, psSeries
|
||||
from flaml.automl.spark import F, T, psDataFrame, psSeries, sparkDataFrame
|
||||
|
||||
|
||||
def ps_group_counts(groups: Union[psSeries, np.ndarray]) -> np.ndarray:
|
||||
@@ -36,6 +37,16 @@ def _compute_label_from_probability(df, probability_col, prediction_col):
|
||||
return df
|
||||
|
||||
|
||||
def string_to_array(s):
    """Decode a JSON-formatted string into the Python value it describes.

    Args:
        s: Text expected to hold a JSON array, for example ``"[0.1, 0.9]"``.

    Returns:
        Whatever ``json.loads`` decodes from ``s``. An empty list is returned
        when ``s`` is not valid JSON, or when it is not a string at all
        (for example ``None``).
    """
    try:
        return json.loads(s)
    except (json.JSONDecodeError, TypeError):
        # json.loads raises TypeError (not JSONDecodeError) for None and other
        # non-string input; treat that like unparsable text instead of crashing.
        return []
|
||||
|
||||
|
||||
# UDF wrapper around string_to_array; its declared return type is an array of doubles.
# NOTE(review): F and T are presumably the pyspark functions/types modules re-exported
# by flaml.automl.spark - confirm against that module.
string_to_array_udf = F.udf(string_to_array, T.ArrayType(T.DoubleType()))
|
||||
|
||||
|
||||
def spark_metric_loss_score(
|
||||
metric_name: str,
|
||||
y_predict: psSeries,
|
||||
@@ -135,6 +146,11 @@ def spark_metric_loss_score(
|
||||
)
|
||||
elif metric_name == "log_loss":
|
||||
# For log_loss, prediction_col should be probability, and we need to convert it to label
|
||||
# handle data like "{'type': '1', 'values': '[1, 2, 3]'}"
|
||||
# Fix cannot resolve "array_max(prediction)" due to data type mismatch: Parameter 1 requires the "ARRAY" type,
|
||||
# however "prediction" has the type "STRUCT<type: TINYINT, size: INT, indices: ARRAY<INT>, values: ARRAY<DOUBLE>>"
|
||||
df = df.withColumn(prediction_col, df[prediction_col].cast(T.StringType()))
|
||||
df = df.withColumn(prediction_col, string_to_array_udf(df[prediction_col]))
|
||||
df = _compute_label_from_probability(df, prediction_col, prediction_col + "_label")
|
||||
evaluator = MulticlassClassificationEvaluator(
|
||||
metricName="logLoss",
|
||||
|
||||
@@ -87,7 +87,6 @@ class GenericTask(Task):
|
||||
"transformer": TransformersEstimator,
|
||||
"transformer_ms": TransformersEstimatorModelSelection,
|
||||
"histgb": HistGradientBoostingEstimator,
|
||||
# Above are open-source, below are internal
|
||||
"svc": SVCEstimator,
|
||||
"sgd": SGDEstimator,
|
||||
"nb_spark": SparkNaiveBayesEstimator,
|
||||
@@ -706,7 +705,6 @@ class GenericTask(Task):
|
||||
fit_kwargs = {}
|
||||
if cv_score_agg_func is None:
|
||||
cv_score_agg_func = default_cv_score_agg_func
|
||||
start_time = time.time()
|
||||
val_loss_folds = []
|
||||
log_metric_folds = []
|
||||
metric = None
|
||||
@@ -813,8 +811,6 @@ class GenericTask(Task):
|
||||
if is_spark_dataframe:
|
||||
X_train.spark.unpersist() # uncache data to free memory
|
||||
X_val.spark.unpersist() # uncache data to free memory
|
||||
if budget and time.time() - start_time >= budget:
|
||||
break
|
||||
val_loss, metric = cv_score_agg_func(val_loss_folds, log_metric_folds)
|
||||
n = total_fold_num
|
||||
pred_time /= n
|
||||
|
||||
@@ -393,7 +393,7 @@ class DataTransformerTS:
|
||||
|
||||
for column in X.columns:
|
||||
# sklearn/utils/validation.py needs int/float values
|
||||
if X[column].dtype.name in ("object", "category"):
|
||||
if X[column].dtype.name in ("object", "category", "string"):
|
||||
if (
|
||||
# drop columns where all values are the same
|
||||
X[column].nunique() == 1
|
||||
|
||||
@@ -127,6 +127,13 @@ def _get_notebook_name():
|
||||
return None
|
||||
|
||||
|
||||
def safe_json_dumps(obj):
    """Serialize ``obj`` to a JSON string.

    Values the JSON encoder cannot serialize natively are written as their
    ``str()`` representation instead of raising.
    """
    return json.dumps(obj, default=str)
|
||||
|
||||
|
||||
class MLflowIntegration:
|
||||
def __init__(self, experiment_type="automl", mlflow_exp_name=None, extra_tag=None):
|
||||
try:
|
||||
@@ -438,7 +445,7 @@ class MLflowIntegration:
|
||||
"flaml.meric": automl_metric_name,
|
||||
"flaml.run_source": "flaml-automl",
|
||||
"flaml.log_type": self.log_type,
|
||||
"flaml.automl_user_configurations": json.dumps(automl._automl_user_configurations),
|
||||
"flaml.automl_user_configurations": safe_json_dumps(automl._automl_user_configurations),
|
||||
},
|
||||
"params": {
|
||||
"sample_size": search_state.sample_size,
|
||||
|
||||
@@ -1 +1 @@
|
||||
__version__ = "2.3.1"
|
||||
__version__ = "2.3.2"
|
||||
|
||||
@@ -1,11 +1,15 @@
|
||||
import unittest
|
||||
from datetime import datetime
|
||||
from test.conftest import evaluate_cv_folds_with_underlying_model
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pytest
|
||||
import scipy.sparse
|
||||
from sklearn.datasets import load_breast_cancer
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.model_selection import (
|
||||
train_test_split,
|
||||
)
|
||||
|
||||
from flaml import AutoML, tune
|
||||
from flaml.automl.model import LGBMEstimator
|
||||
@@ -420,6 +424,122 @@ class TestClassification(unittest.TestCase):
|
||||
print(automl_experiment.best_estimator)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "estimator",
    [
        "catboost",
        "extra_tree",
        "histgb",
        "kneighbor",
        "lgbm",
        # "lrl1",
        "lrl2",
        "rf",
        "svc",
        "xgboost",
        "xgb_limitdepth",
    ],
)
def test_reproducibility_of_classification_models(estimator: str):
    """Verify that FLAML's best classifier reproduces the validation loss FLAML reported.

    Background: https://github.com/microsoft/FLAML/issues/1317 describes cases where the
    model handed back to the user did not match the reported result. Here the best model
    is re-evaluated with the same CV splitter and the loss is compared with the one
    recorded during the search.
    """
    features, labels = load_breast_cancer(return_X_y=True, as_frame=True)
    automl = AutoML()
    automl.fit(
        X_train=features,
        y_train=labels,
        max_iter=5,
        time_budget=-1,
        task="classification",
        n_jobs=1,
        estimator_list=[estimator],
        eval_method="cv",
        n_splits=10,
        metric="f1",
        keep_search_state=True,
        skip_transform=True,
    )
    winner = automl.model
    assert winner is not None
    winner_config = winner.get_params()
    flaml_loss = automl.best_result["val_loss"]

    # Re-run cross validation for the winning model and compare the losses
    state = automl._state
    rerun_loss, _metric_for_logging, _train_time, _pred_time = state.task.evaluate_model_CV(
        config=winner_config,
        estimator=winner,
        X_train_all=state.X_train_all,
        y_train_all=state.y_train_all,
        budget=None,
        kf=state.kf,
        eval_metric="f1",
        best_val_loss=None,
        cv_score_agg_func=None,
        log_training_metric=False,
        fit_kwargs=None,
        free_mem_ratio=0,
    )
    assert pytest.approx(flaml_loss) == rerun_loss
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "estimator",
    [
        "catboost",
        "extra_tree",
        "histgb",
        "kneighbor",
        "lgbm",
        # "lrl1",
        "lrl2",
        "svc",
        "rf",
        "xgboost",
        "xgb_limitdepth",
    ],
)
def test_reproducibility_of_underlying_classification_models(estimator: str):
    """Verify that the model wrapped by FLAML's best classifier reproduces FLAML's loss.

    Background: https://github.com/microsoft/FLAML/issues/1317.
    FLAML estimators wrap an underlying (SKLearn/XGBoost/CatBoost) model. Fitted on the
    same data with no budget, the wrapper and the wrapped model should score the same.
    The underlying model is taken from the best estimator, refitted and scored on the
    same folds, and its mean loss is compared with the loss FLAML recorded.
    """
    features, labels = load_breast_cancer(return_X_y=True, as_frame=True)
    automl = AutoML()
    automl.fit(
        X_train=features,
        y_train=labels,
        max_iter=5,
        time_budget=-1,
        task="classification",
        n_jobs=1,
        estimator_list=[estimator],
        eval_method="cv",
        n_splits=10,
        metric="f1",
        keep_search_state=True,
        skip_transform=True,
    )
    winner = automl.model
    assert winner is not None
    flaml_loss = automl.best_result["val_loss"]

    state = automl._state
    fold_losses = evaluate_cv_folds_with_underlying_model(
        state.X_train_all, state.y_train_all, state.kf, winner.model, "classification"
    )
    underlying_loss = np.mean(fold_losses)

    assert pytest.approx(flaml_loss) == underlying_loss
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test = TestClassification()
|
||||
test.test_preprocess()
|
||||
|
||||
@@ -187,7 +187,6 @@ class TestMultiClass(unittest.TestCase):
|
||||
def test_custom_metric(self):
|
||||
df, y = load_iris(return_X_y=True, as_frame=True)
|
||||
df["label"] = y
|
||||
automl = AutoML()
|
||||
settings = {
|
||||
"dataframe": df,
|
||||
"label": "label",
|
||||
@@ -204,7 +203,8 @@ class TestMultiClass(unittest.TestCase):
|
||||
"pred_time_limit": 1e-5,
|
||||
"ensemble": True,
|
||||
}
|
||||
automl.fit(**settings)
|
||||
automl = AutoML(**settings) # test safe_json_dumps
|
||||
automl.fit(dataframe=df, label="label")
|
||||
print(automl.classes_)
|
||||
print(automl.model)
|
||||
print(automl.config_history)
|
||||
|
||||
@@ -1,9 +1,12 @@
|
||||
import unittest
|
||||
from test.conftest import evaluate_cv_folds_with_underlying_model
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import scipy.sparse
|
||||
from sklearn.datasets import (
|
||||
fetch_california_housing,
|
||||
make_regression,
|
||||
)
|
||||
|
||||
from flaml import AutoML
|
||||
@@ -205,7 +208,6 @@ class TestRegression(unittest.TestCase):
|
||||
|
||||
|
||||
def test_multioutput():
|
||||
from sklearn.datasets import make_regression
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.multioutput import MultiOutputRegressor, RegressorChain
|
||||
|
||||
@@ -230,5 +232,210 @@ def test_multioutput():
|
||||
print(model.predict(X_test))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "estimator",
    [
        "catboost",
        "enet",
        "extra_tree",
        "histgb",
        "kneighbor",
        "lgbm",
        "rf",
        "xgboost",
        "xgb_limitdepth",
    ],
)
def test_reproducibility_of_regression_models(estimator: str):
    """Verify that FLAML's best regressor reproduces the validation loss FLAML reported.

    Background: https://github.com/microsoft/FLAML/issues/1317 describes cases where the
    model handed back to the user did not match the reported result. Here the best
    regression model is re-evaluated with the same CV splitter and the loss is compared
    with the one recorded during the search.
    """
    features, target = fetch_california_housing(return_X_y=True, as_frame=True)
    automl = AutoML()
    automl.fit(
        X_train=features,
        y_train=target,
        max_iter=2,
        time_budget=-1,
        task="regression",
        n_jobs=1,
        estimator_list=[estimator],
        eval_method="cv",
        n_splits=3,
        metric="r2",
        keep_search_state=True,
        skip_transform=True,
        retrain_full=True,
    )
    winner = automl.model
    assert winner is not None
    winner_config = winner.get_params()
    flaml_loss = automl.best_result["val_loss"]

    # Re-run cross validation for the winning model and compare the losses
    state = automl._state
    rerun_loss, _metric_for_logging, _train_time, _pred_time = state.task.evaluate_model_CV(
        config=winner_config,
        estimator=winner,
        X_train_all=state.X_train_all,
        y_train_all=state.y_train_all,
        budget=None,
        kf=state.kf,
        eval_metric="r2",
        best_val_loss=None,
        cv_score_agg_func=None,
        log_training_metric=False,
        fit_kwargs=None,
        free_mem_ratio=0,
    )
    assert pytest.approx(flaml_loss) == rerun_loss
|
||||
|
||||
|
||||
def test_reproducibility_of_catboost_regression_model():
    """Verify that FLAML's best catboost regressor reproduces the reported validation loss.

    Background: catboost-related reports in https://github.com/microsoft/FLAML/issues/1317.
    The search runs under a time budget; the best model is then re-evaluated with the
    same CV splitter and its loss compared with the one recorded during the search.
    """
    features, target = fetch_california_housing(return_X_y=True, as_frame=True)
    automl = AutoML()
    automl.fit(
        X_train=features,
        y_train=target,
        time_budget=7,
        task="regression",
        n_jobs=1,
        estimator_list=["catboost"],
        eval_method="cv",
        n_splits=10,
        metric="r2",
        keep_search_state=True,
        skip_transform=True,
        retrain_full=True,
    )
    winner = automl.model
    assert winner is not None
    winner_config = winner.get_params()
    flaml_loss = automl.best_result["val_loss"]

    # Re-run cross validation for the winning model and compare the losses
    state = automl._state
    rerun_loss, _metric_for_logging, _train_time, _pred_time = state.task.evaluate_model_CV(
        config=winner_config,
        estimator=winner,
        X_train_all=state.X_train_all,
        y_train_all=state.y_train_all,
        budget=None,
        kf=state.kf,
        eval_metric="r2",
        best_val_loss=None,
        cv_score_agg_func=None,
        log_training_metric=False,
        fit_kwargs=None,
        free_mem_ratio=0,
    )
    assert pytest.approx(flaml_loss) == rerun_loss
|
||||
|
||||
|
||||
def test_reproducibility_of_lgbm_regression_model():
    """Verify that FLAML's best LGBM regressor is at least as good as the reported loss.

    Background: LGBM-related reports in https://github.com/microsoft/FLAML/issues/1368.
    The search runs under a time budget; the best model is then re-evaluated with the
    same CV splitter. The test passes when the re-evaluated loss matches the recorded
    loss or is lower than it.
    """
    features, target = fetch_california_housing(return_X_y=True, as_frame=True)
    automl = AutoML()
    automl.fit(
        X_train=features,
        y_train=target,
        time_budget=3,
        task="regression",
        n_jobs=1,
        estimator_list=["lgbm"],
        eval_method="cv",
        n_splits=9,
        metric="r2",
        keep_search_state=True,
        skip_transform=True,
        retrain_full=True,
    )
    winner = automl.model
    assert winner is not None
    winner_config = winner.get_params()
    flaml_loss = automl.best_result["val_loss"]

    # Re-run cross validation for the winning model and compare the losses
    state = automl._state
    rerun_loss, _metric_for_logging, _train_time, _pred_time = state.task.evaluate_model_CV(
        config=winner_config,
        estimator=winner,
        X_train_all=state.X_train_all,
        y_train_all=state.y_train_all,
        budget=None,
        kf=state.kf,
        eval_metric="r2",
        best_val_loss=None,
        cv_score_agg_func=None,
        log_training_metric=False,
        fit_kwargs=None,
        free_mem_ratio=0,
    )
    assert pytest.approx(flaml_loss) == rerun_loss or flaml_loss > rerun_loss
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "estimator",
    [
        "catboost",
        "enet",
        "extra_tree",
        "histgb",
        "kneighbor",
        "lgbm",
        "rf",
        "xgboost",
        "xgb_limitdepth",
    ],
)
def test_reproducibility_of_underlying_regression_models(estimator: str):
    """Verify that the model wrapped by FLAML's best regressor reproduces FLAML's loss.

    Background: https://github.com/microsoft/FLAML/issues/1317.
    FLAML estimators wrap an underlying (SKLearn/XGBoost/CatBoost) model. Fitted on the
    same data with no budget, the wrapper and the wrapped model should score the same.
    The underlying model is taken from the best estimator, refitted and scored on the
    same folds, and its mean loss is compared with the loss FLAML recorded.
    """
    features, target = fetch_california_housing(return_X_y=True, as_frame=True)
    automl = AutoML()
    automl.fit(
        X_train=features,
        y_train=target,
        max_iter=5,
        time_budget=-1,
        task="regression",
        n_jobs=1,
        estimator_list=[estimator],
        eval_method="cv",
        n_splits=10,
        metric="r2",
        keep_search_state=True,
        skip_transform=True,
        retrain_full=False,
    )
    winner = automl.model
    assert winner is not None
    flaml_loss = automl.best_result["val_loss"]

    state = automl._state
    fold_losses = evaluate_cv_folds_with_underlying_model(
        state.X_train_all, state.y_train_all, state.kf, winner.model, "regression"
    )
    underlying_loss = np.mean(fold_losses)
    assert pytest.approx(flaml_loss) == underlying_loss
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
42
test/conftest.py
Normal file
42
test/conftest.py
Normal file
@@ -0,0 +1,42 @@
|
||||
from typing import Any, Dict, List, Union
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
|
||||
from sklearn.metrics import f1_score, r2_score
|
||||
|
||||
|
||||
def evaluate_cv_folds_with_underlying_model(X_train_all, y_train_all, kf, model: Any, task: str) -> List[float]:
    """Mimic the FLAML CV process to calculate the metrics across each fold.

    :param X_train_all: X training data
    :param y_train_all: y training data
    :param kf: The splitter object to use to generate the folds
    :param model: The estimator to fit to the data during the CV process
    :param task: classification or regression
    :return: A list with one loss per fold: ``1 - f1`` when task is "classification",
        ``1 - r2`` otherwise
    """

    def _rows(data, selector):
        # The fold indices are treated as row positions. Pandas objects are therefore
        # indexed through .iloc, so a non-default index is never read as labels;
        # anything without .iloc is indexed directly.
        return data.iloc[selector] if hasattr(data, "iloc") else data[selector]

    rng = np.random.RandomState(2020)
    model_type = type(model)
    is_catboost = model_type is CatBoostClassifier or model_type is CatBoostRegressor
    all_fold_metrics: List[float] = []
    for train_index, val_index in kf.split(X_train_all, y_train_all):
        # Shuffle the training rows of this fold with the fixed-seed generator
        train_index = rng.permutation(train_index)
        X_train, y_train = _rows(X_train_all, train_index), _rows(y_train_all, train_index)
        X_val, y_val = _rows(X_train_all, val_index), _rows(y_train_all, val_index)
        if is_catboost:
            # The first n rows are used for fitting; the remaining tail is passed to
            # catboost as the eval set for use_best_model.
            n = max(int(len(y_train) * 0.9), len(y_train) - 1000)
            head, tail = slice(None, n), slice(n, None)
            eval_set = Pool(data=_rows(X_train, tail), label=_rows(y_train, tail), cat_features=[])
            model.fit(_rows(X_train, head), _rows(y_train, head), eval_set=eval_set, use_best_model=True)
        else:
            model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        if task == "classification":
            reproduced_metric = 1 - f1_score(y_val, y_pred)
        else:
            reproduced_metric = 1 - r2_score(y_val, y_pred)
        all_fold_metrics.append(reproduced_metric)
    return all_fold_metrics
|
||||
@@ -32,6 +32,8 @@ print(automl.predict_proba(X_train))
|
||||
print(automl.model.estimator)
|
||||
```
|
||||
|
||||
**Note**: You can access the best model's estimator using `automl.model.estimator`.
|
||||
|
||||
#### Sample of output
|
||||
|
||||
```
|
||||
|
||||
@@ -47,6 +47,8 @@ if os.path.exists("data/output/"):
|
||||
shutil.rmtree("data/output/")
|
||||
```
|
||||
|
||||
**Note**: You can access the best model's estimator using `automl.model.estimator`.
|
||||
|
||||
#### Sample output
|
||||
|
||||
```
|
||||
|
||||
@@ -28,6 +28,8 @@ automl.fit(
|
||||
)
|
||||
```
|
||||
|
||||
**Note**: You can access the best model's estimator using `automl.model.estimator`.
|
||||
|
||||
#### Sample output
|
||||
|
||||
```
|
||||
|
||||
@@ -32,6 +32,8 @@ print(automl.predict(X_train))
|
||||
print(automl.model.estimator)
|
||||
```
|
||||
|
||||
**Note**: You can access the best model's estimator using `automl.model.estimator`.
|
||||
|
||||
#### Sample output
|
||||
|
||||
```
|
||||
|
||||
@@ -29,6 +29,8 @@ automl.fit(
|
||||
print(automl.predict(X_train[84:]))
|
||||
```
|
||||
|
||||
**Note**: You can access the best model's estimator using `automl.model.estimator`.
|
||||
|
||||
#### Sample output
|
||||
|
||||
```
|
||||
|
||||
@@ -29,6 +29,8 @@ settings = {
|
||||
automl.fit(X_train=X_train, y_train=y_train, **settings)
|
||||
```
|
||||
|
||||
**Note**: You can access the best model's estimator using `automl.model.estimator`.
|
||||
|
||||
#### Sample output
|
||||
|
||||
```
|
||||
|
||||
@@ -31,6 +31,8 @@ settings = {
|
||||
automl.fit(X_train=X_train, y_train=y_train, **settings)
|
||||
```
|
||||
|
||||
**Note**: You can access the best model's estimator using `automl.model.estimator`.
|
||||
|
||||
#### Sample output
|
||||
|
||||
```
|
||||
|
||||
@@ -393,7 +393,7 @@ For holdout, you can also set:
|
||||
|
||||
- `split_ratio`: the fraction for validation data, 0.1 by default.
|
||||
- `X_val`, `y_val`: a separate validation dataset. When they are passed, the validation metrics will be computed against this given validation dataset. If they are not passed, then a validation dataset will be split from the training data and held out from training during the model search. After the model search, flaml will retrain the model with best configuration on the full training data.
|
||||
You can set`retrain_full` to be `False` to skip the final retraining or "budget" to ask flaml to do its best to retrain within the time budget.
|
||||
You can set `retrain_full` to be `False` to skip the final retraining or "budget" to ask flaml to do its best to retrain within the time budget. When `retrain_full` is set to `True`, the user-provided validation data is not used in the final retraining of the model.
|
||||
|
||||
For cross validation, you can also set `n_splits` of the number of folds. By default it is 5.
|
||||
|
||||
|
||||
@@ -4830,9 +4830,9 @@ http-parser-js@>=0.5.1:
|
||||
integrity sha512-SGeBX54F94Wgu5RH3X5jsDtf4eHyRogWX1XGT3b4HuW3tQPM4AaBzoUji/4AAJNXCEOWZ5O0DgZmJw1947gD5Q==
|
||||
|
||||
http-proxy-middleware@^2.0.3:
|
||||
version "2.0.6"
|
||||
resolved "https://registry.npmmirror.com/http-proxy-middleware/-/http-proxy-middleware-2.0.6.tgz#e1a4dd6979572c7ab5a4e4b55095d1f32a74963f"
|
||||
integrity sha512-ya/UeJ6HVBYxrgYotAZo1KvPWlgB48kUJLDePFeneHsVujFaW5WNj2NgWCAE//B1Dl02BIfYlpNgBy8Kf8Rjmw==
|
||||
version "2.0.7"
|
||||
resolved "https://registry.yarnpkg.com/http-proxy-middleware/-/http-proxy-middleware-2.0.7.tgz#915f236d92ae98ef48278a95dedf17e991936ec6"
|
||||
integrity sha512-fgVY8AV7qU7z/MmXJ/rxwbrtQH4jBQ9m7kp3llF0liB7glmFeVZFBepQb32T3y8n8k2+AEYuMPCpinYW+/CuRA==
|
||||
dependencies:
|
||||
"@types/http-proxy" "^1.17.8"
|
||||
http-proxy "^1.18.1"
|
||||
|
||||
Reference in New Issue
Block a user