"""Tests for FLAML zero-shot AutoML defaults.

Covers portfolio building, regret computation, hyperparameter suggestion,
and the zero-shot estimator wrappers (``flaml.default.*``). The suggestion
tests assume a prebuilt portfolio under ``test/default``.
"""

import pickle
import sys

import pandas as pd
from sklearn.datasets import fetch_california_housing, load_breast_cancer, load_iris
from sklearn.model_selection import train_test_split

from flaml import AutoML
from flaml.default import (
    portfolio,
    preprocess_and_suggest_hyperparams,
    regret,
    suggest_hyperparams,
    suggest_learner,
)


def test_greedy_feedback(path="test/default", strategy="greedy-feedback"):
    """Build a regression portfolio with the greedy-feedback strategy via the portfolio CLI."""
    # The binary/multiclass variants are kept for reference but disabled to save CI time.
    # sys.argv = f"portfolio.py --output {path} --input {path} --metafeatures {path}/all/metafeatures.csv --task binary --estimator lgbm xgboost xgb_limitdepth rf extra_tree --strategy {strategy}".split()
    # portfolio.main()
    # sys.argv = f"portfolio.py --output {path} --input {path} --metafeatures {path}/all/metafeatures.csv --task multiclass --estimator lgbm xgboost xgb_limitdepth rf extra_tree --strategy {strategy}".split()
    # portfolio.main()
    sys.argv = f"portfolio.py --output {path} --input {path} --metafeatures {path}/all/metafeatures.csv --task regression --estimator lgbm --strategy {strategy}".split()
    portfolio.main()


def test_build_portfolio(path="test/default", strategy="greedy"):
    """Build binary, multiclass, and regression portfolios via the portfolio CLI."""
    sys.argv = f"portfolio.py --output {path} --input {path} --metafeatures {path}/all/metafeatures.csv --task binary --estimator lgbm xgboost xgb_limitdepth rf extra_tree --strategy {strategy}".split()
    portfolio.main()
    sys.argv = f"portfolio.py --output {path} --input {path} --metafeatures {path}/all/metafeatures.csv --task multiclass --estimator lgbm xgboost xgb_limitdepth rf extra_tree --strategy {strategy}".split()
    portfolio.main()
    sys.argv = f"portfolio.py --output {path} --input {path} --metafeatures {path}/all/metafeatures.csv --task regression --estimator lgbm xgboost xgb_limitdepth rf extra_tree --strategy {strategy}".split()
    portfolio.main()


def test_iris(as_frame=True):
    """Run AutoML on iris with data-driven starting points, then with an explicit portfolio location."""
    automl = AutoML()
    automl_settings = {
        "time_budget": 2,
        "metric": "accuracy",
        "task": "classification",
        "log_file_name": "test/iris.log",
        "n_jobs": 1,
        "starting_points": "data",
    }
    X_train, y_train = load_iris(return_X_y=True, as_frame=as_frame)
    automl.fit(X_train, y_train, **automl_settings)
    # "data:<path>" points starting_points at a specific portfolio directory.
    automl_settings["starting_points"] = "data:test/default"
    automl.fit(X_train, y_train, **automl_settings)


def test_housing(as_frame=True):
    """Run zero-iteration AutoML on California housing (zero-shot config only, max_iter=0)."""
    automl = AutoML()
    automl_settings = {
        "time_budget": 2,
        "task": "regression",
        "estimator_list": ["xgboost", "lgbm"],
        "log_file_name": "test/housing.log",
        "n_jobs": 1,
        "starting_points": "data",
        "max_iter": 0,
    }
    X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=as_frame, data_home="test")
    automl.fit(X_train, y_train, **automl_settings)


def test_regret():
    """Compute the binary-task regret CSV from lgbm results via the regret CLI."""
    sys.argv = "regret.py --result_csv test/default/lgbm/results.csv --task_type binary --output test/default/lgbm/binary_regret.csv".split()
    regret.main()


def test_suggest_classification():
    """Exercise hyperparameter suggestion APIs for classification on breast cancer and iris."""
    location = "test/default"
    X_train, y_train = load_breast_cancer(return_X_y=True, as_frame=True)
    suggested = suggest_hyperparams("classification", X_train, y_train, "lgbm", location=location)
    print(suggested)
    suggested = preprocess_and_suggest_hyperparams("classification", X_train, y_train, "xgboost", location=location)
    print(suggested)
    suggested = suggest_hyperparams("classification", X_train, y_train, "xgb_limitdepth", location=location)
    print(suggested)

    # End-to-end: suggest, fit the suggested estimator, and round-trip the transformers.
    X, y = load_iris(return_X_y=True, as_frame=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    (
        hyperparams,
        estimator_class,
        X,
        y,
        feature_transformer,
        label_transformer,
    ) = preprocess_and_suggest_hyperparams("classification", X_train, y_train, "lgbm", location=location)
    # Persist the feature transformer to verify it is picklable.
    with open("test/default/feature_transformer", "wb") as f:
        pickle.dump(feature_transformer, f, pickle.HIGHEST_PROTOCOL)
    model = estimator_class(**hyperparams)  # estimator_class is LGBMClassifier
    model.fit(X, y)
    X_test = feature_transformer.transform(X_test)
    y_pred = label_transformer.inverse_transform(pd.Series(model.predict(X_test).astype(int)))
    print(y_pred)

    suggested = suggest_hyperparams("classification", X_train, y_train, "xgboost", location=location)
    print(suggested)
    suggested = preprocess_and_suggest_hyperparams(
        "classification", X_train, y_train, "xgb_limitdepth", location=location
    )
    print(suggested)
    # Called for side effects only (must not raise); result is overwritten below.
    suggested = suggest_hyperparams("classification", X_train, y_train, "xgb_limitdepth", location=location)
    suggested = suggest_learner(
        "classification",
        X_train,
        y_train,
        estimator_list=["xgboost", "xgb_limitdepth"],
        location=location,
    )
    print(suggested)


def test_suggest_regression():
    """Exercise hyperparameter suggestion APIs for regression on California housing."""
    location = "test/default"
    X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
    suggested = suggest_hyperparams("regression", X_train, y_train, "lgbm", location=location)
    print(suggested)
    suggested = preprocess_and_suggest_hyperparams("regression", X_train, y_train, "xgboost", location=location)
    print(suggested)
    suggested = suggest_hyperparams("regression", X_train, y_train, "xgb_limitdepth", location=location)
    print(suggested)
    suggested = suggest_learner("regression", X_train, y_train, location=location)
    print(suggested)


def test_rf():
    """Smoke-test the zero-shot RandomForest classifier and regressor wrappers."""
    from flaml.default import RandomForestClassifier, RandomForestRegressor

    X_train, y_train = load_breast_cancer(return_X_y=True, as_frame=True)
    rf = RandomForestClassifier()
    rf.fit(X_train[:100], y_train[:100])
    rf.predict(X_train)
    rf.predict_proba(X_train)
    print(rf)

    location = "test/default"
    X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
    rf = RandomForestRegressor(default_location=location)
    rf.fit(X_train[:100], y_train[:100])
    rf.predict(X_train)
    print(rf)


def test_extratrees():
    """Smoke-test the zero-shot ExtraTrees classifier and regressor wrappers."""
    from flaml.default import ExtraTreesClassifier, ExtraTreesRegressor

    X_train, y_train = load_iris(return_X_y=True, as_frame=True)
    classifier = ExtraTreesClassifier()
    classifier.fit(X_train[:100], y_train[:100])
    classifier.predict(X_train)
    classifier.predict_proba(X_train)
    print(classifier)

    location = "test/default"
    X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
    regressor = ExtraTreesRegressor(default_location=location)
    regressor.fit(X_train[:100], y_train[:100])
    regressor.predict(X_train)
    print(regressor)


def test_lgbm():
    """Smoke-test the zero-shot LGBM classifier and regressor wrappers."""
    from flaml.default import LGBMClassifier, LGBMRegressor

    X_train, y_train = load_breast_cancer(return_X_y=True, as_frame=True)
    classifier = LGBMClassifier(n_jobs=1)
    classifier.fit(X_train, y_train)
    classifier.predict(X_train, pred_contrib=True)
    classifier.predict_proba(X_train)
    print(classifier.get_params())
    print(classifier)
    print(classifier.classes_)

    location = "test/default"
    X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
    regressor = LGBMRegressor(default_location=location)
    regressor.fit(X_train, y_train)
    regressor.predict(X_train)
    print(regressor)


def test_xgboost():
    """Smoke-test the zero-shot XGBoost wrappers, including eval_set with categorical features."""
    import numpy as np

    from flaml.default import XGBClassifier, XGBRegressor

    X_train, y_train = load_breast_cancer(return_X_y=True, as_frame=True)
    classifier = XGBClassifier(max_depth=0)
    classifier.fit(X_train[:100], y_train[:100])
    classifier.predict(X_train)
    classifier.predict_proba(X_train)
    print(classifier)
    print(classifier.classes_)

    location = "test/default"
    X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
    regressor = XGBRegressor(default_location=location)
    regressor.fit(X_train[:100], y_train[:100])
    regressor.predict(X_train)
    print(regressor)

    # Test eval_set with categorical features (Issue: eval_set not preprocessed)
    np.random.seed(42)
    n = 500
    df = pd.DataFrame(
        {
            "num1": np.random.randn(n),
            "num2": np.random.rand(n) * 10,
            "cat1": np.random.choice(["A", "B", "C"], size=n),
            "cat2": np.random.choice(["X", "Y"], size=n),
            "target": np.random.choice([0, 1], size=n),
        }
    )
    X = df.drop(columns="target")
    y = df["target"]
    X_train_cat, X_valid_cat, y_train_cat, y_valid_cat = train_test_split(X, y, test_size=0.2, random_state=0)
    # Convert categorical columns to pandas 'category' dtype
    for col in X_train_cat.select_dtypes(include="object").columns:
        X_train_cat[col] = X_train_cat[col].astype("category")
        X_valid_cat[col] = X_valid_cat[col].astype("category")

    # Test XGBClassifier with eval_set
    classifier_eval = XGBClassifier(
        tree_method="hist",
        enable_categorical=True,
        eval_metric="logloss",
        use_label_encoder=False,
        early_stopping_rounds=10,
        random_state=0,
        n_estimators=10,
    )
    classifier_eval.fit(X_train_cat, y_train_cat, eval_set=[(X_valid_cat, y_valid_cat)], verbose=False)
    y_pred = classifier_eval.predict(X_valid_cat)
    assert len(y_pred) == len(y_valid_cat)

    # Test XGBRegressor with eval_set
    y_reg = df["num1"]  # Use num1 as target for regression
    X_reg = df.drop(columns=["num1", "target"])
    X_train_reg, X_valid_reg, y_train_reg, y_valid_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=0)
    for col in X_train_reg.select_dtypes(include="object").columns:
        X_train_reg[col] = X_train_reg[col].astype("category")
        X_valid_reg[col] = X_valid_reg[col].astype("category")
    regressor_eval = XGBRegressor(
        tree_method="hist",
        enable_categorical=True,
        eval_metric="rmse",
        early_stopping_rounds=10,
        random_state=0,
        n_estimators=10,
    )
    regressor_eval.fit(X_train_reg, y_train_reg, eval_set=[(X_valid_reg, y_valid_reg)], verbose=False)
    y_pred = regressor_eval.predict(X_valid_reg)
    assert len(y_pred) == len(y_valid_reg)


def test_nobudget():
    """Verify zero-shot starting points survive a short, budget-less AutoML run."""
    X_train, y_train = load_breast_cancer(return_X_y=True, as_frame=True)
    automl = AutoML()
    automl.fit(
        X_train[:20],
        y_train[:20],
        estimator_list=["lgbm", "extra_tree", "rf"],
        max_iter=12,
        starting_points="data",
        log_file_name="test/default/no_budget.txt",
        log_type="all",
    )
    automl.fit(X_train[:20], y_train[:20], estimator_list=["lgbm", "extra_tree", "rf"])
    # make sure that zero-shot config out of the search space does not degenerate to low cost init config
    assert automl.best_config_per_estimator["extra_tree"]["n_estimators"] > 4
    # make sure that the zero-shot config {} is not modified
    assert "criterion" not in automl.best_config_per_estimator["rf"]


if __name__ == "__main__":
    test_build_portfolio("flaml/default")