# FLAML/test/spark/test_automl.py

import os
import shutil

import numpy as np
import pytest
import scipy.sparse

from flaml import AutoML
from flaml.tune.spark.utils import check_spark

# For Spark, the customized learner needs to be defined in a separate file.
if os.path.exists(os.path.join(os.getcwd(), "test", "spark", "mylearner.py")):
    try:
        from test.spark.mylearner import MyLargeLGBM

        skip_my_learner = False
    except ImportError:
        skip_my_learner = True
        MyLargeLGBM = None
else:
    MyLargeLGBM = None
    skip_my_learner = True
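
# FLAML_MAX_CONCURRENT limits how many Spark trials run concurrently (kept small for tests).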
os.environ["FLAML_MAX_CONCURRENT"] = "2"

spark_available, _ = check_spark()
skip_spark = not spark_available

pytestmark = [
    pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests."),
    pytest.mark.spark,
]
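

# End-to-end check: run XGBoost tuning in parallel on Spark over sparse input,
# then verify that the fitted AutoML object survives pickle / load_pickle.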
def test_parallel_xgboost_and_pickle(hpo_method=None, data_size=1000):
    automl_experiment = AutoML()
    automl_settings = {
        "time_budget": 30,  # seconds
        "metric": "ap",  # average precision
        "task": "classification",
        "log_file_name": "test/sparse_classification.log",
        "estimator_list": ["xgboost"],
        "log_type": "all",
        "n_jobs": 1,
        "n_concurrent_trials": 2,
        "hpo_method": hpo_method,
        "use_spark": True,  # parallelize trials via Spark
    }
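    # Sparse identity features with random binary labels: a cheap way to
    # exercise the sparse-input path.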
    X_train = scipy.sparse.eye(data_size)
    y_train = np.random.randint(2, size=data_size)
    automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
    print(automl_experiment.predict(X_train))
    print(automl_experiment.model)
    print(automl_experiment.config_history)
    print(automl_experiment.best_model_for_estimator("xgboost"))
    print(automl_experiment.best_iteration)
    print(automl_experiment.best_estimator)
    # Test pickle and load_pickle: the reloaded AutoML object should still predict.
    automl_experiment.pickle("automl_xgboost_spark.pkl")
    automl_loaded = AutoML().load_pickle("automl_xgboost_spark.pkl")
    assert automl_loaded.best_estimator == automl_experiment.best_estimator
    assert automl_loaded.best_loss == automl_experiment.best_loss
    automl_loaded.predict(X_train)

    # Clean up the pickle file and its artifacts directory.
    if os.path.isfile("automl_xgboost_spark.pkl"):
        os.remove("automl_xgboost_spark.pkl")
    shutil.rmtree("automl_xgboost_spark.pkl.flaml_artifacts", ignore_errors=True)


def test_parallel_xgboost_others():
    # Use random search as the hpo_method.
    test_parallel_xgboost_and_pickle(hpo_method="random")


@pytest.mark.skip(reason="data this large is not supported yet; Spark dataframe input will be supported in the future")
def test_large_dataset():
    test_parallel_xgboost_and_pickle(data_size=90000000)
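

# MyLargeLGBM comes from test/spark/mylearner.py; the test is skipped when that
# import failed above (e.g., pytest was not started from the repository root).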
@pytest.mark.skipif(
    skip_my_learner,
    reason="please run pytest in the root directory of FLAML, i.e., the directory that contains the setup.py file",
)
def test_custom_learner(data_size=1000):
    automl_experiment = AutoML()
    automl_experiment.add_learner(learner_name="large_lgbm", learner_class=MyLargeLGBM)
    automl_settings = {
        "time_budget": 2,
        "task": "classification",
        "log_file_name": "test/sparse_classification_oom.log",
        "estimator_list": ["large_lgbm"],
        "log_type": "all",
        "n_jobs": 1,
        "hpo_method": "random",
        "n_concurrent_trials": 2,
        "use_spark": True,
    }
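    # Same sparse toy dataset as in test_parallel_xgboost_and_pickle.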
    X_train = scipy.sparse.eye(data_size)
    y_train = np.random.randint(2, size=data_size)
    automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
    print(automl_experiment.predict(X_train))
    print(automl_experiment.model)
    print(automl_experiment.config_history)
    print(automl_experiment.best_model_for_estimator("large_lgbm"))
    print(automl_experiment.best_iteration)
    print(automl_experiment.best_estimator)


if __name__ == "__main__":
    test_parallel_xgboost_and_pickle()
    # test_parallel_xgboost_others()
    # test_large_dataset()
    # if skip_my_learner:
    #     print("please run pytest in the root directory of FLAML, i.e., the directory that contains the setup.py file")
    # else:
    #     test_custom_learner()