mirror of
https://github.com/microsoft/FLAML.git
synced 2026-02-09 02:09:16 +08:00
* Initial plan * Initial analysis - reproduced eval_set preprocessing bug Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> * Fix eval_set preprocessing for XGBoost estimators with categorical features Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> * Add eval_set tests to test_xgboost function Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> * Fix linting issues with ruff and black Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> Co-authored-by: Li Jiang <bnujli@gmail.com>
286 lines
11 KiB
Python
286 lines
11 KiB
Python
import pickle
|
|
import sys
|
|
|
|
import pandas as pd
|
|
from sklearn.datasets import fetch_california_housing, load_breast_cancer, load_iris
|
|
from sklearn.model_selection import train_test_split
|
|
|
|
from flaml import AutoML
|
|
from flaml.default import (
|
|
portfolio,
|
|
preprocess_and_suggest_hyperparams,
|
|
regret,
|
|
suggest_hyperparams,
|
|
suggest_learner,
|
|
)
|
|
|
|
|
|
def test_greedy_feedback(path="test/default", strategy="greedy-feedback"):
    """Build a regression portfolio with the greedy-feedback strategy via the portfolio CLI."""
    # NOTE(review): the binary/multiclass runs below are commented out in the
    # original — presumably disabled deliberately (e.g. for speed); confirm before re-enabling.
    # sys.argv = f"portfolio.py --output {path} --input {path} --metafeatures {path}/all/metafeatures.csv --task binary --estimator lgbm xgboost xgb_limitdepth rf extra_tree --strategy {strategy}".split()
    # portfolio.main()
    # sys.argv = f"portfolio.py --output {path} --input {path} --metafeatures {path}/all/metafeatures.csv --task multiclass --estimator lgbm xgboost xgb_limitdepth rf extra_tree --strategy {strategy}".split()
    # portfolio.main()
    command = (
        f"portfolio.py --output {path} --input {path} "
        f"--metafeatures {path}/all/metafeatures.csv "
        f"--task regression --estimator lgbm --strategy {strategy}"
    )
    sys.argv = command.split()
    portfolio.main()
|
|
|
|
|
|
def test_build_portfolio(path="test/default", strategy="greedy"):
    """Build portfolios for the binary, multiclass, and regression tasks via the portfolio CLI."""
    # Identical invocation for each task; only the --task argument changes.
    for task in ("binary", "multiclass", "regression"):
        sys.argv = (
            f"portfolio.py --output {path} --input {path} "
            f"--metafeatures {path}/all/metafeatures.csv "
            f"--task {task} --estimator lgbm xgboost xgb_limitdepth rf extra_tree "
            f"--strategy {strategy}"
        ).split()
        portfolio.main()
|
|
|
|
|
|
def test_iris(as_frame=True):
    """Run AutoML on iris twice: with "data" starting points, then with an
    explicit portfolio directory appended ("data:test/default")."""
    settings = {
        "time_budget": 2,
        "metric": "accuracy",
        "task": "classification",
        "log_file_name": "test/iris.log",
        "n_jobs": 1,
        "starting_points": "data",
    }
    features, labels = load_iris(return_X_y=True, as_frame=as_frame)
    learner = AutoML()
    learner.fit(features, labels, **settings)
    # Second run: point starting_points at an explicit portfolio location.
    settings["starting_points"] = "data:test/default"
    learner.fit(features, labels, **settings)
|
|
|
|
|
|
def test_housing(as_frame=True):
    """Smoke-test zero-shot starting points on California housing regression (max_iter=0)."""
    settings = {
        "time_budget": 2,
        "task": "regression",
        "estimator_list": ["xgboost", "lgbm"],
        "log_file_name": "test/housing.log",
        "n_jobs": 1,
        "starting_points": "data",
        "max_iter": 0,  # zero-shot only: no search iterations
    }
    features, labels = fetch_california_housing(return_X_y=True, as_frame=as_frame, data_home="test")
    learner = AutoML()
    learner.fit(features, labels, **settings)
|
|
|
|
|
|
def test_regret():
    """Compute regret for the lgbm binary results via the regret CLI entry point."""
    command = (
        "regret.py --result_csv test/default/lgbm/results.csv "
        "--task_type binary --output test/default/lgbm/binary_regret.csv"
    )
    sys.argv = command.split()
    regret.main()
|
|
|
|
|
|
def test_suggest_classification():
    """Exercise the hyperparameter-suggestion APIs on classification data.

    Covers suggest_hyperparams, preprocess_and_suggest_hyperparams (including an
    end-to-end fit/predict using the returned transformers), and suggest_learner.
    """
    location = "test/default"

    # One suggestion per estimator family on the breast-cancer data.
    X_train, y_train = load_breast_cancer(return_X_y=True, as_frame=True)
    config = suggest_hyperparams("classification", X_train, y_train, "lgbm", location=location)
    print(config)
    config = preprocess_and_suggest_hyperparams("classification", X_train, y_train, "xgboost", location=location)
    print(config)
    config = suggest_hyperparams("classification", X_train, y_train, "xgb_limitdepth", location=location)
    print(config)

    # End-to-end: train with the suggested config and apply the returned
    # feature/label transformers to held-out iris data.
    X, y = load_iris(return_X_y=True, as_frame=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    hyperparams, estimator_class, X, y, feature_transformer, label_transformer = preprocess_and_suggest_hyperparams(
        "classification", X_train, y_train, "lgbm", location=location
    )
    with open("test/default/feature_transformer", "wb") as f:
        pickle.dump(feature_transformer, f, pickle.HIGHEST_PROTOCOL)
    model = estimator_class(**hyperparams)  # estimator_class is LGBMClassifier
    model.fit(X, y)
    X_test = feature_transformer.transform(X_test)
    y_pred = label_transformer.inverse_transform(pd.Series(model.predict(X_test).astype(int)))
    print(y_pred)

    config = suggest_hyperparams("classification", X_train, y_train, "xgboost", location=location)
    print(config)
    config = preprocess_and_suggest_hyperparams(
        "classification", X_train, y_train, "xgb_limitdepth", location=location
    )
    print(config)
    # Result intentionally discarded in the original; the call is kept for coverage.
    config = suggest_hyperparams("classification", X_train, y_train, "xgb_limitdepth", location=location)
    learner = suggest_learner(
        "classification",
        X_train,
        y_train,
        estimator_list=["xgboost", "xgb_limitdepth"],
        location=location,
    )
    print(learner)
|
|
|
|
|
|
def test_suggest_regression():
    """Exercise the hyperparameter-suggestion APIs on California housing regression data."""
    location = "test/default"
    X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
    # Same (call, print) sequence as before, expressed as a dispatch table.
    for estimator, suggest_fn in (
        ("lgbm", suggest_hyperparams),
        ("xgboost", preprocess_and_suggest_hyperparams),
        ("xgb_limitdepth", suggest_hyperparams),
    ):
        print(suggest_fn("regression", X_train, y_train, estimator, location=location))
    print(suggest_learner("regression", X_train, y_train, location=location))
|
|
|
|
|
|
def test_rf():
    """Smoke-test the flaml.default RandomForest classifier and regressor wrappers."""
    from flaml.default import RandomForestClassifier, RandomForestRegressor

    # Classifier on breast cancer, trained on a small slice for speed.
    features, labels = load_breast_cancer(return_X_y=True, as_frame=True)
    clf = RandomForestClassifier()
    clf.fit(features[:100], labels[:100])
    clf.predict(features)
    clf.predict_proba(features)
    print(clf)

    # Regressor on California housing with an explicit portfolio location.
    features, labels = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
    reg = RandomForestRegressor(default_location="test/default")
    reg.fit(features[:100], labels[:100])
    reg.predict(features)
    print(reg)
|
|
|
|
|
|
def test_extratrees():
    """Smoke-test the flaml.default ExtraTrees classifier and regressor wrappers."""
    from flaml.default import ExtraTreesClassifier, ExtraTreesRegressor

    # Classifier on iris, trained on a small slice for speed.
    features, labels = load_iris(return_X_y=True, as_frame=True)
    clf = ExtraTreesClassifier()
    clf.fit(features[:100], labels[:100])
    clf.predict(features)
    clf.predict_proba(features)
    print(clf)

    # Regressor on California housing with an explicit portfolio location.
    features, labels = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
    reg = ExtraTreesRegressor(default_location="test/default")
    reg.fit(features[:100], labels[:100])
    reg.predict(features)
    print(reg)
|
|
|
|
|
|
def test_lgbm():
    """Smoke-test the flaml.default LGBM classifier and regressor wrappers."""
    from flaml.default import LGBMClassifier, LGBMRegressor

    # Classifier on breast cancer; also exercises pred_contrib and introspection.
    features, labels = load_breast_cancer(return_X_y=True, as_frame=True)
    clf = LGBMClassifier(n_jobs=1)
    clf.fit(features, labels)
    clf.predict(features, pred_contrib=True)
    clf.predict_proba(features)
    print(clf.get_params())
    print(clf)
    print(clf.classes_)

    # Regressor on California housing with an explicit portfolio location.
    features, labels = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
    reg = LGBMRegressor(default_location="test/default")
    reg.fit(features, labels)
    reg.predict(features)
    print(reg)
|
|
|
|
|
|
def test_xgboost():
    """Smoke-test the flaml.default XGBoost wrappers, including eval_set with
    pandas 'category' features (regression check for the eval_set preprocessing bug)."""
    import numpy as np

    from flaml.default import XGBClassifier, XGBRegressor

    # Classifier on breast cancer, trained on a small slice for speed.
    features, labels = load_breast_cancer(return_X_y=True, as_frame=True)
    clf = XGBClassifier(max_depth=0)
    clf.fit(features[:100], labels[:100])
    clf.predict(features)
    clf.predict_proba(features)
    print(clf)
    print(clf.classes_)

    # Regressor on California housing with an explicit portfolio location.
    features, labels = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
    reg = XGBRegressor(default_location="test/default")
    reg.fit(features[:100], labels[:100])
    reg.predict(features)
    print(reg)

    # Synthetic frame mixing numeric and categorical columns; validation splits
    # passed via eval_set must receive the same preprocessing as training data.
    np.random.seed(42)
    n = 500
    df = pd.DataFrame(
        {
            "num1": np.random.randn(n),
            "num2": np.random.rand(n) * 10,
            "cat1": np.random.choice(["A", "B", "C"], size=n),
            "cat2": np.random.choice(["X", "Y"], size=n),
            "target": np.random.choice([0, 1], size=n),
        }
    )

    X = df.drop(columns="target")
    y = df["target"]
    X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

    # Convert object columns to pandas 'category' dtype in both splits.
    for col in X_tr.select_dtypes(include="object").columns:
        X_tr[col] = X_tr[col].astype("category")
        X_val[col] = X_val[col].astype("category")

    # Classifier with an eval_set containing categorical features.
    clf_eval = XGBClassifier(
        tree_method="hist",
        enable_categorical=True,
        eval_metric="logloss",
        use_label_encoder=False,
        early_stopping_rounds=10,
        random_state=0,
        n_estimators=10,
    )
    clf_eval.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)
    assert len(clf_eval.predict(X_val)) == len(y_val)

    # Same check for the regressor, predicting num1 from the remaining columns.
    y_reg = df["num1"]  # use num1 as the regression target
    X_reg = df.drop(columns=["num1", "target"])
    X_tr, X_val, y_tr, y_val = train_test_split(X_reg, y_reg, test_size=0.2, random_state=0)

    for col in X_tr.select_dtypes(include="object").columns:
        X_tr[col] = X_tr[col].astype("category")
        X_val[col] = X_val[col].astype("category")

    reg_eval = XGBRegressor(
        tree_method="hist",
        enable_categorical=True,
        eval_metric="rmse",
        early_stopping_rounds=10,
        random_state=0,
        n_estimators=10,
    )
    reg_eval.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)
    assert len(reg_eval.predict(X_val)) == len(y_val)
|
|
|
|
|
|
def test_nobudget():
    """Run AutoML without a time budget and verify zero-shot starting configs survive the search."""
    features, labels = load_breast_cancer(return_X_y=True, as_frame=True)
    automl = AutoML()
    automl.fit(
        features[:20],
        labels[:20],
        estimator_list=["lgbm", "extra_tree", "rf"],
        max_iter=12,
        starting_points="data",
        log_file_name="test/default/no_budget.txt",
        log_type="all",
    )
    automl.fit(features[:20], labels[:20], estimator_list=["lgbm", "extra_tree", "rf"])
    # A zero-shot config outside the search space must not degenerate to the low-cost init config.
    assert automl.best_config_per_estimator["extra_tree"]["n_estimators"] > 4
    # The zero-shot config {} must remain unmodified.
    assert "criterion" not in automl.best_config_per_estimator["rf"]
|
|
|
|
|
|
if __name__ == "__main__":
    # Build the default portfolio directly into the flaml/default package directory.
    test_build_portfolio("flaml/default")
|