From d9e74031e0fec3354f3bfd49eea2160385232b67 Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Wed, 21 Jan 2026 14:38:25 +0800 Subject: [PATCH] Expose task-level and estimator-level preprocessors as public API (#1497) * Initial plan * Add public preprocess() API methods for AutoML and estimators Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> * Add documentation for preprocess() API methods Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> * Add example script demonstrating preprocess() API usage Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> * Address code review feedback - fix type hints and simplify test logic Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> * Fix formatting issues with pre-commit hooks Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> * Remove example.py, make tests faster --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> Co-authored-by: Li Jiang Co-authored-by: Li Jiang --- flaml/automl/automl.py | 46 +++- flaml/automl/model.py | 29 +++ test/automl/test_preprocess_api.py | 236 ++++++++++++++++++ .../docs/Use-Cases/Task-Oriented-AutoML.md | 58 +++++ 4 files changed, 368 insertions(+), 1 deletion(-) create mode 100644 test/automl/test_preprocess_api.py diff --git a/flaml/automl/automl.py b/flaml/automl/automl.py index 1bfdc2c16..cb3fe3785 100644 --- a/flaml/automl/automl.py +++ b/flaml/automl/automl.py @@ -789,7 +789,7 @@ class AutoML(BaseEstimator): def predict( self, - X: np.array | DataFrame | list[str] | list[list[str]] | psDataFrame, + X: np.ndarray | DataFrame | list[str] | list[list[str]] | psDataFrame, **pred_kwargs, ): """Predict label from features. 
def preprocess(
    self,
    X: np.ndarray | DataFrame | list[str] | list[list[str]] | psDataFrame,
):
    """Preprocess data using task-level preprocessing.

    Delegates to ``self._state.task.preprocess(X, self._transformer)``, i.e.
    it applies the task-level transformations that were set up during
    ``fit()``. This should be called before any estimator-level
    preprocessing (``estimator.preprocess()``).

    Args:
        X: A numpy array or pandas dataframe or pyspark.pandas dataframe
            of featurized instances, shape n * m,
            or for time series forecast tasks:
                a pandas dataframe with the first column containing
                timestamp values (datetime type) or an integer n for
                the predict steps (only valid when the estimator is
                arima or sarimax). Other columns in the dataframe
                are assumed to be exogenous variables (categorical
                or numeric).

    Returns:
        Whatever the task's ``preprocess`` returns for ``X``. The container
        type is decided by the task and transformer, so do not rely on it
        being identical to the input type.

    Raises:
        AttributeError: If the model has not been fitted yet (no state, no
            transformer, or no task attached to the state).

    Example:
        ```python
        automl = AutoML()
        automl.fit(X_train, y_train, task="classification")

        # Apply task-level preprocessing to new data
        X_test_preprocessed = automl.preprocess(X_test)
        ```
    """
    state = getattr(self, "_state", None)
    if state is None:
        raise AttributeError("AutoML instance has not been fitted yet. Please call fit() first.")
    # Only the *presence* of the attribute is checked: a falsy transformer is
    # still forwarded and left for the task to interpret.
    if not hasattr(self, "_transformer"):
        raise AttributeError("Transformer not initialized. Please call fit() first.")
    # A state object can exist before fit() has attached a task to it; fail
    # with the same actionable message instead of "no attribute 'task'".
    task = getattr(state, "task", None)
    if task is None:
        raise AttributeError("Task not initialized. Please call fit() first.")

    return task.preprocess(X, self._transformer)
def preprocess(self, X):
    """Preprocess data using estimator-level preprocessing.

    Public wrapper around ``self._preprocess(X)``. This is the second level
    of preprocessing and is meant to be applied after task-level
    preprocessing (``automl.preprocess()``). What is actually done depends on
    the estimator's ``_preprocess`` implementation.

    Args:
        X: A numpy array or a dataframe of featurized instances, shape n*m.

    Returns:
        The value returned by ``self._preprocess(X)``. It is intended to be
        handed to the underlying fitted model (``estimator._model``).
        NOTE(review): the estimator's own ``predict()``/``fit()`` are
        expected to apply this step themselves, so passing this output back
        into them would preprocess twice - confirm against the estimator.

    Example:
        ```python
        automl = AutoML()
        automl.fit(X_train, y_train, task="classification")

        # First apply task-level preprocessing
        X_test_task = automl.preprocess(X_test)

        # Then apply estimator-level preprocessing
        estimator = automl.model
        X_test_estimator = estimator.preprocess(X_test_task)

        # Feed the fully preprocessed data to the underlying model
        predictions = estimator._model.predict(X_test_estimator)
        ```
    """
    return self._preprocess(X)
"""Tests for the public preprocessor APIs."""
import unittest

import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer, load_diabetes

from flaml import AutoML


class TestPreprocessAPI(unittest.TestCase):
    """Test cases for the public preprocess() API methods."""

    @staticmethod
    def _fit_lgbm(X_train, y_train, task="classification", metric="accuracy"):
        """Fit a small, fast AutoML run restricted to LightGBM and return it.

        Every test needs the same tiny search (5 iterations, silent), so the
        settings live in one place instead of being copied into each test.
        """
        automl = AutoML()
        automl.fit(
            X_train,
            y_train,
            max_iter=5,
            task=task,
            metric=metric,
            estimator_list=["lgbm"],
            verbose=0,
        )
        return automl

    @staticmethod
    def _breast_cancer_split():
        """Return (X_train, y_train, X_test) slices of the breast cancer data."""
        X, y = load_breast_cancer(return_X_y=True)
        return X[:400], y[:400], X[400:450]

    def test_automl_preprocess_before_fit(self):
        """Test that calling preprocess before fit raises an error."""
        automl = AutoML()
        X_test = np.array([[1, 2, 3], [4, 5, 6]])

        with self.assertRaises(AttributeError) as context:
            automl.preprocess(X_test)
        # Check that an error is raised about not being fitted
        self.assertIn("fit()", str(context.exception))

    def test_automl_preprocess_classification(self):
        """Test task-level preprocessing for classification."""
        X_train, y_train, X_test = self._breast_cancer_split()
        automl = self._fit_lgbm(X_train, y_train)

        X_preprocessed = automl.preprocess(X_test)

        # Verify the output is not None and has the right number of rows
        self.assertIsNotNone(X_preprocessed)
        self.assertEqual(X_preprocessed.shape[0], X_test.shape[0])

    def test_automl_preprocess_regression(self):
        """Test task-level preprocessing for regression."""
        X, y = load_diabetes(return_X_y=True)
        X_train, y_train = X[:300], y[:300]
        X_test = X[300:350]
        automl = self._fit_lgbm(X_train, y_train, task="regression", metric="r2")

        X_preprocessed = automl.preprocess(X_test)

        self.assertIsNotNone(X_preprocessed)
        self.assertEqual(X_preprocessed.shape[0], X_test.shape[0])

    def test_automl_preprocess_with_dataframe(self):
        """Test task-level preprocessing with pandas DataFrame."""
        X_train = pd.DataFrame(
            {
                "feature1": [1, 2, 3, 4, 5] * 20,
                "feature2": [5, 4, 3, 2, 1] * 20,
                "category": ["a", "b", "a", "b", "a"] * 20,
            }
        )
        y_train = pd.Series([0, 1, 0, 1, 0] * 20)

        X_test = pd.DataFrame(
            {
                "feature1": [6, 7, 8],
                "feature2": [1, 2, 3],
                "category": ["a", "b", "a"],
            }
        )

        automl = self._fit_lgbm(X_train, y_train)
        X_preprocessed = automl.preprocess(X_test)

        # Compare row counts through .shape, like the other tests: probing
        # for __len__ is unreliable because some containers define it only
        # to raise.
        self.assertIsNotNone(X_preprocessed)
        self.assertEqual(X_preprocessed.shape[0], len(X_test))

    def test_estimator_preprocess(self):
        """Test estimator-level preprocessing."""
        X_train, y_train, X_test = self._breast_cancer_split()
        automl = self._fit_lgbm(X_train, y_train)

        # Get the trained estimator
        estimator = automl.model
        self.assertIsNotNone(estimator)

        # Task-level preprocessing first, estimator-level second
        X_task_preprocessed = automl.preprocess(X_test)
        X_estimator_preprocessed = estimator.preprocess(X_task_preprocessed)

        self.assertIsNotNone(X_estimator_preprocessed)
        self.assertEqual(X_estimator_preprocessed.shape[0], X_test.shape[0])

    def test_preprocess_pipeline(self):
        """Test the complete preprocessing pipeline (task-level then estimator-level)."""
        X_train, y_train, X_test = self._breast_cancer_split()
        automl = self._fit_lgbm(X_train, y_train)

        # Apply the complete preprocessing pipeline
        X_task_preprocessed = automl.preprocess(X_test)
        X_final = automl.model.preprocess(X_task_preprocessed)

        # automl.predict() preprocesses internally; feeding the manually
        # preprocessed data straight to the underlying model must agree.
        y_pred_manual = automl.model._model.predict(X_final)
        y_pred_auto = automl.predict(X_test)

        np.testing.assert_array_equal(y_pred_manual, y_pred_auto)

    def test_preprocess_with_mixed_types(self):
        """Test preprocessing with mixed data types."""
        # Seeded generator: the data is random but identical on every run,
        # so a failure is reproducible.
        rng = np.random.default_rng(0)

        def make_frame(n_rows):
            return pd.DataFrame(
                {
                    "numeric1": rng.random(n_rows),
                    "numeric2": rng.integers(0, 100, n_rows),
                    "categorical": rng.choice(["cat", "dog", "bird"], n_rows),
                    "boolean": rng.choice([True, False], n_rows),
                }
            )

        X_train = make_frame(100)
        y_train = pd.Series(rng.integers(0, 2, 100))
        X_test = make_frame(10)

        automl = self._fit_lgbm(X_train, y_train)
        X_preprocessed = automl.preprocess(X_test)

        self.assertIsNotNone(X_preprocessed)
        self.assertEqual(X_preprocessed.shape[0], len(X_test))

    def test_estimator_preprocess_without_automl(self):
        """Test that estimator.preprocess() can be used independently."""
        from flaml.automl.model import LGBMEstimator

        rng = np.random.default_rng(0)
        X_train = rng.random((100, 5))
        y_train = rng.integers(0, 2, 100)

        estimator = LGBMEstimator(task="classification")
        estimator.fit(X_train, y_train)

        X_test = rng.random((10, 5))
        X_preprocessed = estimator.preprocess(X_test)

        self.assertIsNotNone(X_preprocessed)
        self.assertEqual(X_preprocessed.shape, X_test.shape)


if __name__ == "__main__":
    unittest.main()
+ +#### Task-level preprocessing + +```python +from flaml import AutoML +import numpy as np + +# Train the model +automl = AutoML() +automl.fit(X_train, y_train, task="classification", time_budget=60) + +# Apply task-level preprocessing to new data +X_test_preprocessed = automl.preprocess(X_test) + +# Pass the task-level output to the trained estimator, whose predict() applies the estimator-level step itself +predictions = automl.model.predict(X_test_preprocessed) +``` + +#### Estimator-level preprocessing + +```python +# Get the trained estimator +estimator = automl.model + +# Apply task-level preprocessing first +X_test_task = automl.preprocess(X_test) + +# Then apply estimator-level preprocessing +X_test_estimator = estimator.preprocess(X_test_task) + +# Use the fully preprocessed data with the underlying model +predictions = estimator._model.predict(X_test_estimator) +``` + +#### Complete preprocessing pipeline + +For most use cases, the `predict()` method already handles both levels of preprocessing internally. However, if you need to apply preprocessing separately (e.g., for custom inference pipelines or debugging), you can use: + +```python +# Complete preprocessing pipeline +X_task_preprocessed = automl.preprocess(X_test) +X_final = automl.model.preprocess(X_task_preprocessed) + +# automl.predict() performs these same two preprocessing steps internally before predicting: +predictions = automl.predict(X_test) +``` + +**Note**: The `preprocess()` methods can only be called after `fit()` has been executed, as they rely on the transformations learned during training. + ### Get best configuration We can find the best estimator's name and best configuration by: