"""Tests for the data-splitting options (``split_type``, ``groups``, custom splitters) of FLAML's AutoML."""

import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml, load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GroupKFold, KFold, train_test_split

from flaml.automl import AutoML

# Name of the OpenML dataset used by most tests; also used to build the log file name.
dataset = "credit-g"


def _load_dataset(encode_target=False):
    """Fetch the OpenML ``dataset``, falling back to sklearn's wine data.

    Args:
        encode_target: When True, replace the fetched target with its
            categorical codes (``y.cat.codes``). Only applied to the OpenML
            data, never to the wine fallback.

    Returns:
        A ``(X, y)`` tuple as returned by ``fetch_openml`` / ``load_wine``.
    """
    from sklearn.externals._arff import ArffException

    try:
        X, y = fetch_openml(name=dataset, return_X_y=True)
        if encode_target:
            y = y.cat.codes
    except (ArffException, ValueError):
        # The OpenML download/parse failed; use a dataset bundled with sklearn
        # so the test can still run.
        from sklearn.datasets import load_wine

        X, y = load_wine(return_X_y=True)
    return X, y


def _test(split_type):
    """Fit a classifier with the given ``split_type`` and print test accuracy.

    Args:
        split_type: Value forwarded to ``AutoML.fit`` as ``split_type``.
            For ``"time"`` the train/test split keeps the original row order
            (``shuffle=False``); otherwise it is shuffled with a fixed seed.
    """
    automl = AutoML()
    automl_settings = {
        "time_budget": 2,
        # "metric": 'accuracy',
        "task": "classification",
        "log_file_name": f"test/{dataset}.log",
        "model_history": True,
        "log_training_metric": True,
        "split_type": split_type,
    }
    X, y = _load_dataset()

    # A time-based split must not shuffle the rows before holding out the tail.
    split_kwargs = {"shuffle": False} if split_type == "time" else {"random_state": 42}
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, **split_kwargs)

    automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
    pred = automl.predict(X_test)
    acc = accuracy_score(y_test, pred)
    print(acc)


def _test_uniform():
    """Run ``_test`` with a uniform split.

    NOTE(review): the leading underscore keeps this out of pytest's default
    ``test_*`` collection -- presumably disabled on purpose; confirm.
    """
    _test(split_type="uniform")


def test_time():
    """Run ``_test`` with a time-ordered split."""
    _test(split_type="time")


def test_groups_for_classification_task():
    """Exercise ``groups`` with cv/holdout evaluation and a ``GroupKFold`` splitter.

    Raises:
        RuntimeError: If fitting with a ``GroupKFold`` splitter and
            ``eval_method="holdout"`` does not raise ``AssertionError``.
    """
    X, y = _load_dataset()
    # Seeded generator so the random group labels are reproducible across runs.
    rng = np.random.default_rng(42)

    automl = AutoML()
    automl_settings = {
        "time_budget": 2,
        "task": "classification",
        "log_file_name": f"test/{dataset}.log",
        "model_history": True,
        "eval_method": "cv",
        "groups": rng.integers(low=0, high=10, size=len(y)),
        "estimator_list": ["catboost", "lgbm", "rf", "xgboost", "kneighbor"],
        "learner_selector": "roundrobin",
    }
    automl.fit(X, y, **automl_settings)

    automl_settings["eval_method"] = "holdout"
    automl.fit(X, y, **automl_settings)

    automl_settings["split_type"] = GroupKFold(n_splits=3)
    try:
        automl.fit(X, y, **automl_settings)
    except AssertionError:
        # eval_method must be 'auto' or 'cv' for custom data splitter.
        pass
    else:
        raise RuntimeError("GroupKFold object as split_type should fail when eval_method is holdout")

    automl_settings["eval_method"] = "cv"
    automl.fit(X, y, **automl_settings)


def test_groups_for_regression_task():
    """Append nonsensical groups to iris dataset and use it to test that GroupKFold works for regression tasks"""
    # NOTE(review): split_type below is "uniform" although the docstring
    # mentions GroupKFold -- confirm which splitter is intended here.
    iris_dict_data = load_iris(as_frame=True)  # Bunch holding pandas objects
    iris_data = iris_dict_data["frame"]  # pandas dataframe data + target
    rng = np.random.default_rng(42)
    iris_data["cluster"] = rng.integers(
        low=0, high=5, size=iris_data.shape[0]
    )  # np.random.randint(0, 5, iris_data.shape[0])

    automl = AutoML()
    X = iris_data[["sepal length (cm)", "sepal width (cm)", "petal length (cm)"]].to_numpy()
    y = iris_data["petal width (cm)"]
    # Only the training portions are used; the held-out parts are discarded.
    X_train, _, y_train, _, groups_train, _ = train_test_split(X, y, iris_data["cluster"], random_state=42)
    automl_settings = {
        "max_iter": 5,
        "time_budget": -1,
        "metric": "r2",
        "task": "regression",
        "estimator_list": ["lgbm", "rf", "xgboost", "kneighbor"],
        "eval_method": "cv",
        "split_type": "uniform",
        "groups": groups_train,
    }
    automl.fit(X_train, y_train, **automl_settings)


def test_groups_with_sample_weights():
    """Verifies that sample weights can be used with group splits i.e. that https://github.com/microsoft/FLAML/issues/1396 remains fixed"""
    iris_dict_data = load_iris(as_frame=True)  # Bunch holding pandas objects
    iris_data = iris_dict_data["frame"]  # pandas dataframe data + target
    # Seeded generator so both the groups and the weights are reproducible.
    rng = np.random.default_rng(42)
    iris_data["cluster"] = rng.integers(low=0, high=5, size=iris_data.shape[0])

    automl = AutoML()
    X = iris_data[["sepal length (cm)", "sepal width (cm)", "petal length (cm)"]].to_numpy()
    y = iris_data["petal width (cm)"]
    sample_weight = pd.Series(rng.random(X.shape[0]))
    # Only the training portions are used; the held-out parts are discarded.
    X_train, _, y_train, _, groups_train, _, sample_weight_train, _ = train_test_split(
        X, y, iris_data["cluster"], sample_weight, random_state=42
    )
    automl_settings = {
        "max_iter": 5,
        "time_budget": -1,
        "metric": "r2",
        "task": "regression",
        "log_file_name": "error.log",
        "log_type": "all",
        "estimator_list": ["lgbm"],
        "eval_method": "cv",
        "split_type": "group",
        "groups": groups_train,
        "sample_weight": sample_weight_train,
    }
    automl.fit(X_train, y_train, **automl_settings)
    assert automl.model is not None


def test_stratified_groupkfold():
    """Fit with a ``StratifiedGroupKFold`` splitter grouped by the ``Airline`` column.

    Returns early (without failing) when the OpenML dataset cannot be loaded.
    """
    from minio.error import ServerError
    from sklearn.model_selection import StratifiedGroupKFold

    from flaml.automl.data import load_openml_dataset

    try:
        X_train, _, y_train, _ = load_openml_dataset(dataset_id=1169, data_dir="test/")
    except (ServerError, Exception):
        # Best-effort: any failure to obtain the data ends the test silently.
        # NOTE(review): ``Exception`` already subsumes ``ServerError``.
        return

    splitter = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=0)

    automl = AutoML()
    settings = {
        "time_budget": 6,
        "metric": "ap",
        "eval_method": "cv",
        "split_type": splitter,
        "groups": X_train["Airline"],
        "estimator_list": [
            "catboost",
            "lgbm",
            "rf",
            "xgboost",
            "extra_tree",
            "xgb_limitdepth",
            "lrl1",
        ],
    }

    automl.fit(X_train=X_train, y_train=y_train, **settings)


def test_rank():
    """Fit the ``rank`` task with groups given as per-row labels and as group counts."""
    X, y = _load_dataset(encode_target=True)

    automl = AutoML()
    automl_settings = {
        "time_budget": 2,
        "task": "rank",
        "log_file_name": f"test/{dataset}.log",
        "model_history": True,
        "eval_method": "cv",
        "groups": np.array([0] * 200 + [1] * 200 + [2] * 200 + [3] * 200 + [4] * 100 + [5] * 100),  # group labels
        "learner_selector": "roundrobin",
    }
    automl.fit(X, y, **automl_settings)

    automl = AutoML()
    automl_settings = {
        "time_budget": 2,
        "task": "rank",
        "metric": "ndcg@5",  # 5 can be replaced by any number
        "log_file_name": f"test/{dataset}.log",
        "model_history": True,
        "groups": [200] * 4 + [100] * 2,  # alternative way: group counts
        # "estimator_list": ['lgbm', 'xgboost'],  # list of ML learners
        "learner_selector": "roundrobin",
    }
    automl.fit(X, y, **automl_settings)


def test_object():
    """Fit with a user-defined ``KFold`` subclass passed as ``split_type``.

    The splitter is used twice: once without a ``shuffle`` attribute and once
    with ``shuffle`` set to True.
    """
    X, y = _load_dataset()

    class TestKFold(KFold):
        """Splitter yielding ``n_splits`` random 80/20 permutations of the rows."""

        def __init__(self, n_splits):
            # Deliberately does not call ``KFold.__init__``: the instance starts
            # out without a ``shuffle`` attribute (it is set explicitly below
            # for the second fit).
            self.n_splits = int(n_splits)

        def split(self, X, y=None, groups=None):
            # ``y`` and ``groups`` are accepted for signature compatibility
            # with sklearn splitters and are ignored.
            rng = np.random.default_rng()
            train_num = int(len(X) * 0.8)
            for _ in range(self.n_splits):
                permu_idx = rng.permutation(len(X))
                yield permu_idx[:train_num], permu_idx[train_num:]

        def get_n_splits(self, X=None, y=None, groups=None):
            return self.n_splits

    automl = AutoML()
    automl_settings = {
        "time_budget": 2,
        "task": "classification",
        "log_file_name": f"test/{dataset}.log",
        "model_history": True,
        "log_training_metric": True,
        "split_type": TestKFold(5),
    }
    automl.fit(X, y, **automl_settings)
    assert automl._state.eval_method == "cv", "eval_method must be 'cv' for custom data splitter"

    kf = TestKFold(5)
    kf.shuffle = True
    automl_settings["split_type"] = kf
    automl.fit(X, y, **automl_settings)


if __name__ == "__main__":
    test_groups_for_classification_task()