Files
FLAML/test/default/test_defaults.py
Copilot 1687ca9a94 Fix eval_set preprocessing for XGBoost estimators with categorical features (#1470)
* Initial plan

* Initial analysis - reproduced eval_set preprocessing bug

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>

* Fix eval_set preprocessing for XGBoost estimators with categorical features

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>

* Add eval_set tests to test_xgboost function

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>

* Fix linting issues with ruff and black

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>
Co-authored-by: Li Jiang <bnujli@gmail.com>
2026-01-20 20:41:21 +08:00

286 lines
11 KiB
Python

import pickle
import sys
import pandas as pd
from sklearn.datasets import fetch_california_housing, load_breast_cancer, load_iris
from sklearn.model_selection import train_test_split
from flaml import AutoML
from flaml.default import (
portfolio,
preprocess_and_suggest_hyperparams,
regret,
suggest_hyperparams,
suggest_learner,
)
def test_greedy_feedback(path="test/default", strategy="greedy-feedback"):
    """Build a regression portfolio for lgbm with the greedy-feedback strategy.

    Only the regression task is exercised; binary/multiclass runs are skipped
    here to keep the test fast.
    """
    cli = (
        f"portfolio.py --output {path} --input {path} "
        f"--metafeatures {path}/all/metafeatures.csv "
        f"--task regression --estimator lgbm --strategy {strategy}"
    )
    sys.argv = cli.split()
    portfolio.main()
def test_build_portfolio(path="test/default", strategy="greedy"):
    """Build portfolios for the binary, multiclass and regression tasks in turn."""
    for task in ("binary", "multiclass", "regression"):
        cli = (
            f"portfolio.py --output {path} --input {path} "
            f"--metafeatures {path}/all/metafeatures.csv "
            f"--task {task} --estimator lgbm xgboost xgb_limitdepth rf extra_tree "
            f"--strategy {strategy}"
        )
        sys.argv = cli.split()
        portfolio.main()
def test_iris(as_frame=True):
    """Fit AutoML on iris twice: with 'data' starting points, then with a
    path-qualified 'data:test/default' source."""
    features, labels = load_iris(return_X_y=True, as_frame=as_frame)
    settings = {
        "time_budget": 2,
        "metric": "accuracy",
        "task": "classification",
        "log_file_name": "test/iris.log",
        "n_jobs": 1,
        "starting_points": "data",
    }
    learner = AutoML()
    learner.fit(features, labels, **settings)
    # Second fit reads the zero-shot configs from an explicit directory.
    settings["starting_points"] = "data:test/default"
    learner.fit(features, labels, **settings)
def test_housing(as_frame=True):
    """Run AutoML on California housing with max_iter=0 (zero-shot only)."""
    features, target = fetch_california_housing(return_X_y=True, as_frame=as_frame, data_home="test")
    learner = AutoML()
    learner.fit(
        features,
        target,
        time_budget=2,
        task="regression",
        estimator_list=["xgboost", "lgbm"],
        log_file_name="test/housing.log",
        n_jobs=1,
        starting_points="data",
        max_iter=0,
    )
def test_regret():
    """Compute binary-task regret from the lgbm results CSV via the CLI entry point."""
    cli = (
        "regret.py --result_csv test/default/lgbm/results.csv "
        "--task_type binary --output test/default/lgbm/binary_regret.csv"
    )
    sys.argv = cli.split()
    regret.main()
def test_suggest_classification():
    """Exercise the zero-shot suggestion APIs on classification datasets."""
    location = "test/default"
    features, labels = load_breast_cancer(return_X_y=True, as_frame=True)
    print(suggest_hyperparams("classification", features, labels, "lgbm", location=location))
    print(preprocess_and_suggest_hyperparams("classification", features, labels, "xgboost", location=location))
    print(suggest_hyperparams("classification", features, labels, "xgb_limitdepth", location=location))

    # End-to-end: preprocess, fit the suggested estimator, and decode predictions.
    X, y = load_iris(return_X_y=True, as_frame=True)
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.33, random_state=42)
    (
        hyperparams,
        estimator_class,
        X_proc,
        y_proc,
        feature_transformer,
        label_transformer,
    ) = preprocess_and_suggest_hyperparams("classification", X_tr, y_tr, "lgbm", location=location)
    with open("test/default/feature_transformer", "wb") as f:
        pickle.dump(feature_transformer, f, pickle.HIGHEST_PROTOCOL)
    model = estimator_class(**hyperparams)  # estimator_class is LGBMClassifier
    model.fit(X_proc, y_proc)
    X_te = feature_transformer.transform(X_te)
    decoded = label_transformer.inverse_transform(pd.Series(model.predict(X_te).astype(int)))
    print(decoded)

    print(suggest_hyperparams("classification", X_tr, y_tr, "xgboost", location=location))
    print(preprocess_and_suggest_hyperparams("classification", X_tr, y_tr, "xgb_limitdepth", location=location))
    # Result intentionally unused: exercises the call path only.
    suggest_hyperparams("classification", X_tr, y_tr, "xgb_limitdepth", location=location)
    print(
        suggest_learner(
            "classification",
            X_tr,
            y_tr,
            estimator_list=["xgboost", "xgb_limitdepth"],
            location=location,
        )
    )
def test_suggest_regression():
    """Exercise the zero-shot suggestion APIs on a regression dataset."""
    location = "test/default"
    features, target = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
    print(suggest_hyperparams("regression", features, target, "lgbm", location=location))
    print(preprocess_and_suggest_hyperparams("regression", features, target, "xgboost", location=location))
    print(suggest_hyperparams("regression", features, target, "xgb_limitdepth", location=location))
    print(suggest_learner("regression", features, target, location=location))
def test_rf():
    """Smoke-test the zero-shot random forest classifier and regressor."""
    from flaml.default import RandomForestClassifier, RandomForestRegressor

    features, labels = load_breast_cancer(return_X_y=True, as_frame=True)
    clf = RandomForestClassifier()
    clf.fit(features[:100], labels[:100])
    clf.predict(features)
    clf.predict_proba(features)
    print(clf)

    features, target = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
    reg = RandomForestRegressor(default_location="test/default")
    reg.fit(features[:100], target[:100])
    reg.predict(features)
    print(reg)
def test_extratrees():
    """Smoke-test the zero-shot extra-trees classifier and regressor."""
    from flaml.default import ExtraTreesClassifier, ExtraTreesRegressor

    features, labels = load_iris(return_X_y=True, as_frame=True)
    clf = ExtraTreesClassifier()
    clf.fit(features[:100], labels[:100])
    clf.predict(features)
    clf.predict_proba(features)
    print(clf)

    features, target = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
    reg = ExtraTreesRegressor(default_location="test/default")
    reg.fit(features[:100], target[:100])
    reg.predict(features)
    print(reg)
def test_lgbm():
    """Smoke-test the zero-shot LightGBM classifier and regressor."""
    from flaml.default import LGBMClassifier, LGBMRegressor

    features, labels = load_breast_cancer(return_X_y=True, as_frame=True)
    clf = LGBMClassifier(n_jobs=1)
    clf.fit(features, labels)
    clf.predict(features, pred_contrib=True)
    clf.predict_proba(features)
    print(clf.get_params())
    print(clf)
    print(clf.classes_)

    features, target = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
    reg = LGBMRegressor(default_location="test/default")
    reg.fit(features, target)
    reg.predict(features)
    print(reg)
def test_xgboost():
    """Smoke-test zero-shot XGBoost estimators, including eval_set handling
    when the data contains pandas 'category' columns."""
    import numpy as np

    from flaml.default import XGBClassifier, XGBRegressor

    features, labels = load_breast_cancer(return_X_y=True, as_frame=True)
    clf = XGBClassifier(max_depth=0)
    clf.fit(features[:100], labels[:100])
    clf.predict(features)
    clf.predict_proba(features)
    print(clf)
    print(clf.classes_)

    location = "test/default"
    features, target = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
    reg = XGBRegressor(default_location=location)
    reg.fit(features[:100], target[:100])
    reg.predict(features)
    print(reg)

    # Regression test: eval_set must be preprocessed the same way as the
    # training data when categorical features are present.
    np.random.seed(42)
    n_rows = 500
    frame = pd.DataFrame(
        {
            "num1": np.random.randn(n_rows),
            "num2": np.random.rand(n_rows) * 10,
            "cat1": np.random.choice(["A", "B", "C"], size=n_rows),
            "cat2": np.random.choice(["X", "Y"], size=n_rows),
            "target": np.random.choice([0, 1], size=n_rows),
        }
    )
    inputs = frame.drop(columns="target")
    outcome = frame["target"]
    X_fit, X_val, y_fit, y_val = train_test_split(inputs, outcome, test_size=0.2, random_state=0)
    # Mark the string columns as pandas 'category' dtype on both splits.
    for name in X_fit.select_dtypes(include="object").columns:
        X_fit[name] = X_fit[name].astype("category")
        X_val[name] = X_val[name].astype("category")

    # Classifier with eval_set + early stopping.
    clf_eval = XGBClassifier(
        tree_method="hist",
        enable_categorical=True,
        eval_metric="logloss",
        use_label_encoder=False,
        early_stopping_rounds=10,
        random_state=0,
        n_estimators=10,
    )
    clf_eval.fit(X_fit, y_fit, eval_set=[(X_val, y_val)], verbose=False)
    preds = clf_eval.predict(X_val)
    assert len(preds) == len(y_val)

    # Regressor with eval_set + early stopping; reuse num1 as the target.
    reg_target = frame["num1"]
    reg_inputs = frame.drop(columns=["num1", "target"])
    Xr_fit, Xr_val, yr_fit, yr_val = train_test_split(reg_inputs, reg_target, test_size=0.2, random_state=0)
    for name in Xr_fit.select_dtypes(include="object").columns:
        Xr_fit[name] = Xr_fit[name].astype("category")
        Xr_val[name] = Xr_val[name].astype("category")
    reg_eval = XGBRegressor(
        tree_method="hist",
        enable_categorical=True,
        eval_metric="rmse",
        early_stopping_rounds=10,
        random_state=0,
        n_estimators=10,
    )
    reg_eval.fit(Xr_fit, yr_fit, eval_set=[(Xr_val, yr_val)], verbose=False)
    preds = reg_eval.predict(Xr_val)
    assert len(preds) == len(yr_val)
def test_nobudget():
    """Run AutoML without a time budget and check zero-shot configs survive."""
    features, labels = load_breast_cancer(return_X_y=True, as_frame=True)
    learner = AutoML()
    learner.fit(
        features[:20],
        labels[:20],
        estimator_list=["lgbm", "extra_tree", "rf"],
        max_iter=12,
        starting_points="data",
        log_file_name="test/default/no_budget.txt",
        log_type="all",
    )
    learner.fit(features[:20], labels[:20], estimator_list=["lgbm", "extra_tree", "rf"])
    # A zero-shot config outside the search space must not degenerate to the
    # low-cost initial config.
    assert learner.best_config_per_estimator["extra_tree"]["n_estimators"] > 4
    # The zero-shot config {} must be left unmodified.
    assert "criterion" not in learner.best_config_per_estimator["rf"]
# Manual entry point: build the default portfolio into the packaged flaml/default directory.
if __name__ == "__main__":
    test_build_portfolio("flaml/default")