Files
FLAML/test/default/test_defaults.py
Copilot 1687ca9a94 Fix eval_set preprocessing for XGBoost estimators with categorical features (#1470)
* Initial plan

* Initial analysis - reproduced eval_set preprocessing bug

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>

* Fix eval_set preprocessing for XGBoost estimators with categorical features

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>

* Add eval_set tests to test_xgboost function

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>

* Fix linting issues with ruff and black

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>
Co-authored-by: Li Jiang <bnujli@gmail.com>
2026-01-20 20:41:21 +08:00

286 lines
11 KiB
Python

import pickle
import sys
import pandas as pd
from sklearn.datasets import fetch_california_housing, load_breast_cancer, load_iris
from sklearn.model_selection import train_test_split
from flaml import AutoML
from flaml.default import (
portfolio,
preprocess_and_suggest_hyperparams,
regret,
suggest_hyperparams,
suggest_learner,
)
def test_greedy_feedback(path="test/default", strategy="greedy-feedback"):
    """Build a regression portfolio for lgbm with the greedy-feedback strategy.

    Only the regression task is exercised; binary/multiclass runs are skipped
    here to keep the test fast.
    """
    cli = (
        f"portfolio.py --output {path} --input {path} "
        f"--metafeatures {path}/all/metafeatures.csv "
        f"--task regression --estimator lgbm --strategy {strategy}"
    )
    sys.argv = cli.split()
    portfolio.main()
def test_build_portfolio(path="test/default", strategy="greedy"):
    """Build portfolios for the binary, multiclass and regression tasks in turn."""
    for task in ("binary", "multiclass", "regression"):
        cli = (
            f"portfolio.py --output {path} --input {path} "
            f"--metafeatures {path}/all/metafeatures.csv "
            f"--task {task} --estimator lgbm xgboost xgb_limitdepth rf extra_tree "
            f"--strategy {strategy}"
        )
        sys.argv = cli.split()
        portfolio.main()
def test_iris(as_frame=True):
    """Fit AutoML on iris twice: with 'data' starting points, then with a
    path-qualified 'data:test/default' source."""
    features, labels = load_iris(return_X_y=True, as_frame=as_frame)
    settings = {
        "time_budget": 2,
        "metric": "accuracy",
        "task": "classification",
        "log_file_name": "test/iris.log",
        "n_jobs": 1,
        "starting_points": "data",
    }
    learner = AutoML()
    learner.fit(features, labels, **settings)
    # Second fit reads the zero-shot configs from an explicit directory.
    settings["starting_points"] = "data:test/default"
    learner.fit(features, labels, **settings)
def test_housing(as_frame=True):
    """Run AutoML on California housing with max_iter=0 (zero-shot only)."""
    features, target = fetch_california_housing(return_X_y=True, as_frame=as_frame, data_home="test")
    learner = AutoML()
    learner.fit(
        features,
        target,
        time_budget=2,
        task="regression",
        estimator_list=["xgboost", "lgbm"],
        log_file_name="test/housing.log",
        n_jobs=1,
        starting_points="data",
        max_iter=0,
    )
def test_regret():
    """Compute binary-task regret from the lgbm results CSV via the CLI entry point."""
    cli = (
        "regret.py --result_csv test/default/lgbm/results.csv "
        "--task_type binary --output test/default/lgbm/binary_regret.csv"
    )
    sys.argv = cli.split()
    regret.main()
def test_suggest_classification():
    """Exercise the zero-shot suggestion APIs on classification datasets."""
    location = "test/default"
    features, labels = load_breast_cancer(return_X_y=True, as_frame=True)
    print(suggest_hyperparams("classification", features, labels, "lgbm", location=location))
    print(preprocess_and_suggest_hyperparams("classification", features, labels, "xgboost", location=location))
    print(suggest_hyperparams("classification", features, labels, "xgb_limitdepth", location=location))

    # End-to-end: preprocess, fit the suggested estimator, and decode predictions.
    X, y = load_iris(return_X_y=True, as_frame=True)
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.33, random_state=42)
    (
        hyperparams,
        estimator_class,
        X_proc,
        y_proc,
        feature_transformer,
        label_transformer,
    ) = preprocess_and_suggest_hyperparams("classification", X_tr, y_tr, "lgbm", location=location)
    with open("test/default/feature_transformer", "wb") as f:
        pickle.dump(feature_transformer, f, pickle.HIGHEST_PROTOCOL)
    model = estimator_class(**hyperparams)  # estimator_class is LGBMClassifier
    model.fit(X_proc, y_proc)
    X_te = feature_transformer.transform(X_te)
    decoded = label_transformer.inverse_transform(pd.Series(model.predict(X_te).astype(int)))
    print(decoded)

    print(suggest_hyperparams("classification", X_tr, y_tr, "xgboost", location=location))
    print(preprocess_and_suggest_hyperparams("classification", X_tr, y_tr, "xgb_limitdepth", location=location))
    # Result intentionally unused: exercises the call path only.
    suggest_hyperparams("classification", X_tr, y_tr, "xgb_limitdepth", location=location)
    print(
        suggest_learner(
            "classification",
            X_tr,
            y_tr,
            estimator_list=["xgboost", "xgb_limitdepth"],
            location=location,
        )
    )
def test_suggest_regression():
    """Exercise the zero-shot suggestion APIs on a regression dataset."""
    location = "test/default"
    features, target = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
    print(suggest_hyperparams("regression", features, target, "lgbm", location=location))
    print(preprocess_and_suggest_hyperparams("regression", features, target, "xgboost", location=location))
    print(suggest_hyperparams("regression", features, target, "xgb_limitdepth", location=location))
    print(suggest_learner("regression", features, target, location=location))
def test_rf():
    """Smoke-test the zero-shot random forest classifier and regressor."""
    from flaml.default import RandomForestClassifier, RandomForestRegressor

    features, labels = load_breast_cancer(return_X_y=True, as_frame=True)
    clf = RandomForestClassifier()
    clf.fit(features[:100], labels[:100])
    clf.predict(features)
    clf.predict_proba(features)
    print(clf)

    features, target = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
    reg = RandomForestRegressor(default_location="test/default")
    reg.fit(features[:100], target[:100])
    reg.predict(features)
    print(reg)
def test_extratrees():
    """Smoke-test the zero-shot extra-trees classifier and regressor."""
    from flaml.default import ExtraTreesClassifier, ExtraTreesRegressor

    features, labels = load_iris(return_X_y=True, as_frame=True)
    clf = ExtraTreesClassifier()
    clf.fit(features[:100], labels[:100])
    clf.predict(features)
    clf.predict_proba(features)
    print(clf)

    features, target = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
    reg = ExtraTreesRegressor(default_location="test/default")
    reg.fit(features[:100], target[:100])
    reg.predict(features)
    print(reg)
def test_lgbm():
    """Smoke-test the zero-shot LightGBM classifier and regressor."""
    from flaml.default import LGBMClassifier, LGBMRegressor

    features, labels = load_breast_cancer(return_X_y=True, as_frame=True)
    clf = LGBMClassifier(n_jobs=1)
    clf.fit(features, labels)
    clf.predict(features, pred_contrib=True)
    clf.predict_proba(features)
    print(clf.get_params())
    print(clf)
    print(clf.classes_)

    features, target = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
    reg = LGBMRegressor(default_location="test/default")
    reg.fit(features, target)
    reg.predict(features)
    print(reg)
def test_xgboost():
    """Smoke-test zero-shot XGBoost estimators, including eval_set handling
    when the data contains pandas 'category' columns."""
    import numpy as np

    from flaml.default import XGBClassifier, XGBRegressor

    features, labels = load_breast_cancer(return_X_y=True, as_frame=True)
    clf = XGBClassifier(max_depth=0)
    clf.fit(features[:100], labels[:100])
    clf.predict(features)
    clf.predict_proba(features)
    print(clf)
    print(clf.classes_)

    location = "test/default"
    features, target = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
    reg = XGBRegressor(default_location=location)
    reg.fit(features[:100], target[:100])
    reg.predict(features)
    print(reg)

    # Regression test: eval_set must be preprocessed the same way as the
    # training data when categorical features are present.
    np.random.seed(42)
    n_rows = 500
    frame = pd.DataFrame(
        {
            "num1": np.random.randn(n_rows),
            "num2": np.random.rand(n_rows) * 10,
            "cat1": np.random.choice(["A", "B", "C"], size=n_rows),
            "cat2": np.random.choice(["X", "Y"], size=n_rows),
            "target": np.random.choice([0, 1], size=n_rows),
        }
    )
    inputs = frame.drop(columns="target")
    outcome = frame["target"]
    X_fit, X_val, y_fit, y_val = train_test_split(inputs, outcome, test_size=0.2, random_state=0)
    # Mark the string columns as pandas 'category' dtype on both splits.
    for name in X_fit.select_dtypes(include="object").columns:
        X_fit[name] = X_fit[name].astype("category")
        X_val[name] = X_val[name].astype("category")

    # Classifier with eval_set + early stopping.
    clf_eval = XGBClassifier(
        tree_method="hist",
        enable_categorical=True,
        eval_metric="logloss",
        use_label_encoder=False,
        early_stopping_rounds=10,
        random_state=0,
        n_estimators=10,
    )
    clf_eval.fit(X_fit, y_fit, eval_set=[(X_val, y_val)], verbose=False)
    preds = clf_eval.predict(X_val)
    assert len(preds) == len(y_val)

    # Regressor with eval_set + early stopping; reuse num1 as the target.
    reg_target = frame["num1"]
    reg_inputs = frame.drop(columns=["num1", "target"])
    Xr_fit, Xr_val, yr_fit, yr_val = train_test_split(reg_inputs, reg_target, test_size=0.2, random_state=0)
    for name in Xr_fit.select_dtypes(include="object").columns:
        Xr_fit[name] = Xr_fit[name].astype("category")
        Xr_val[name] = Xr_val[name].astype("category")
    reg_eval = XGBRegressor(
        tree_method="hist",
        enable_categorical=True,
        eval_metric="rmse",
        early_stopping_rounds=10,
        random_state=0,
        n_estimators=10,
    )
    reg_eval.fit(Xr_fit, yr_fit, eval_set=[(Xr_val, yr_val)], verbose=False)
    preds = reg_eval.predict(Xr_val)
    assert len(preds) == len(yr_val)
def test_nobudget():
    """Run AutoML without a time budget and check zero-shot configs survive."""
    features, labels = load_breast_cancer(return_X_y=True, as_frame=True)
    learner = AutoML()
    learner.fit(
        features[:20],
        labels[:20],
        estimator_list=["lgbm", "extra_tree", "rf"],
        max_iter=12,
        starting_points="data",
        log_file_name="test/default/no_budget.txt",
        log_type="all",
    )
    learner.fit(features[:20], labels[:20], estimator_list=["lgbm", "extra_tree", "rf"])
    # A zero-shot config outside the search space must not degenerate to the
    # low-cost initial config.
    assert learner.best_config_per_estimator["extra_tree"]["n_estimators"] > 4
    # The zero-shot config {} must be left unmodified.
    assert "criterion" not in learner.best_config_per_estimator["rf"]
# Manual entry point: build the default portfolio into the packaged flaml/default directory.
if __name__ == "__main__":
    test_build_portfolio("flaml/default")