# FLAML/test/spark/test_automl.py

import os
import shutil

import numpy as np
import pytest
import scipy.sparse

from flaml import AutoML
from flaml.tune.spark.utils import check_spark

# For Spark, the customized learner needs to be defined in a separate file.
if os.path.exists(os.path.join(os.getcwd(), "test", "spark", "mylearner.py")):
    try:
        from test.spark.mylearner import MyLargeLGBM

        skip_my_learner = False
    except ImportError:
        skip_my_learner = True
        MyLargeLGBM = None
else:
    MyLargeLGBM = None
    skip_my_learner = True
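
# FLAML_MAX_CONCURRENT limits how many Spark trials run concurrently (kept small for tests).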
os.environ["FLAML_MAX_CONCURRENT"] = "2"

spark_available, _ = check_spark()
skip_spark = not spark_available

pytestmark = [
    pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests."),
    pytest.mark.spark,
]
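

# End-to-end check: run XGBoost tuning in parallel on Spark over sparse input,
# then verify that the fitted AutoML object survives pickle / load_pickle.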
def test_parallel_xgboost_and_pickle(hpo_method=None, data_size=1000):
    automl_experiment = AutoML()
    automl_settings = {
        "time_budget": 30,  # seconds
        "metric": "ap",  # average precision
        "task": "classification",
        "log_file_name": "test/sparse_classification.log",
        "estimator_list": ["xgboost"],
        "log_type": "all",
        "n_jobs": 1,
        "n_concurrent_trials": 2,
        "hpo_method": hpo_method,
        "use_spark": True,  # parallelize trials via Spark
    }
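    # Sparse identity features with random binary labels: a cheap way to
    # exercise the sparse-input path.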
    X_train = scipy.sparse.eye(data_size)
    y_train = np.random.randint(2, size=data_size)
    automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
    print(automl_experiment.predict(X_train))
    print(automl_experiment.model)
    print(automl_experiment.config_history)
    print(automl_experiment.best_model_for_estimator("xgboost"))
    print(automl_experiment.best_iteration)
    print(automl_experiment.best_estimator)
    # Test pickle and load_pickle: the reloaded AutoML object should still predict.
    automl_experiment.pickle("automl_xgboost_spark.pkl")
    automl_loaded = AutoML().load_pickle("automl_xgboost_spark.pkl")
    assert automl_loaded.best_estimator == automl_experiment.best_estimator
    assert automl_loaded.best_loss == automl_experiment.best_loss
    automl_loaded.predict(X_train)

    # Clean up the pickle file and its artifacts directory.
    if os.path.isfile("automl_xgboost_spark.pkl"):
        os.remove("automl_xgboost_spark.pkl")
    shutil.rmtree("automl_xgboost_spark.pkl.flaml_artifacts", ignore_errors=True)


def test_parallel_xgboost_others():
    # Use random search as the hpo_method.
    test_parallel_xgboost_and_pickle(hpo_method="random")


@pytest.mark.skip(reason="data this large is not supported yet; Spark dataframe input will be supported in the future")
def test_large_dataset():
    test_parallel_xgboost_and_pickle(data_size=90000000)
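

# MyLargeLGBM comes from test/spark/mylearner.py; the test is skipped when that
# import failed above (e.g., pytest was not started from the repository root).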
@pytest.mark.skipif(
    skip_my_learner,
    reason="please run pytest in the root directory of FLAML, i.e., the directory that contains the setup.py file",
)
def test_custom_learner(data_size=1000):
    automl_experiment = AutoML()
    automl_experiment.add_learner(learner_name="large_lgbm", learner_class=MyLargeLGBM)
    automl_settings = {
        "time_budget": 2,
        "task": "classification",
        "log_file_name": "test/sparse_classification_oom.log",
        "estimator_list": ["large_lgbm"],
        "log_type": "all",
        "n_jobs": 1,
        "hpo_method": "random",
        "n_concurrent_trials": 2,
        "use_spark": True,
    }
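    # Same sparse toy dataset as in test_parallel_xgboost_and_pickle.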
    X_train = scipy.sparse.eye(data_size)
    y_train = np.random.randint(2, size=data_size)
    automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
    print(automl_experiment.predict(X_train))
    print(automl_experiment.model)
    print(automl_experiment.config_history)
    print(automl_experiment.best_model_for_estimator("large_lgbm"))
    print(automl_experiment.best_iteration)
    print(automl_experiment.best_estimator)


if __name__ == "__main__":
    test_parallel_xgboost_and_pickle()
    # test_parallel_xgboost_others()
    # test_large_dataset()
    # if skip_my_learner:
    #     print("please run pytest in the root directory of FLAML, i.e., the directory that contains the setup.py file")
    # else:
    #     test_custom_learner()