From d9e74031e0fec3354f3bfd49eea2160385232b67 Mon Sep 17 00:00:00 2001
From: Copilot <198982749+Copilot@users.noreply.github.com>
Date: Wed, 21 Jan 2026 14:38:25 +0800
Subject: [PATCH] Expose task-level and estimator-level preprocessors as public
 API (#1497)

* Initial plan

* Add public preprocess() API methods for AutoML and estimators

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>

* Add documentation for preprocess() API methods

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>

* Add example script demonstrating preprocess() API usage

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>

* Address code review feedback - fix type hints and simplify test logic

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>

* Fix formatting issues with pre-commit hooks

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>

* Remove example.py, make tests faster

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>
Co-authored-by: Li Jiang <bnujli@gmail.com>
Co-authored-by: Li Jiang <lijiang1@microsoft.com>
---
 flaml/automl/automl.py                        |  46 +++-
 flaml/automl/model.py                         |  29 +++
 test/automl/test_preprocess_api.py            | 236 ++++++++++++++++++
 .../docs/Use-Cases/Task-Oriented-AutoML.md    |  58 +++++
 4 files changed, 368 insertions(+), 1 deletion(-)
 create mode 100644 test/automl/test_preprocess_api.py

diff --git a/flaml/automl/automl.py b/flaml/automl/automl.py
index 1bfdc2c16..cb3fe3785 100644
--- a/flaml/automl/automl.py
+++ b/flaml/automl/automl.py
@@ -789,7 +789,7 @@ class AutoML(BaseEstimator):
 
     def predict(
         self,
-        X: np.array | DataFrame | list[str] | list[list[str]] | psDataFrame,
+        X: np.ndarray | DataFrame | list[str] | list[list[str]] | psDataFrame,
         **pred_kwargs,
     ):
         """Predict label from features.
@@ -855,6 +855,50 @@ class AutoML(BaseEstimator):
         proba = self._trained_estimator.predict_proba(X, **pred_kwargs)
         return proba
 
+    def preprocess(
+        self,
+        X: np.ndarray | DataFrame | list[str] | list[list[str]] | psDataFrame,
+    ):
+        """Preprocess data using task-level preprocessing.
+
+        This method applies task-level preprocessing transformations to the input data,
+        including handling of data types, sparse matrices, and feature transformations
+        that were learned during the fit phase. This should be called before any
+        estimator-level preprocessing.
+
+        Args:
+            X: A numpy array or pandas dataframe or pyspark.pandas dataframe
+                of featurized instances, shape n * m,
+                or for time series forecast tasks:
+                    a pandas dataframe with the first column containing
+                    timestamp values (datetime type) or an integer n for
+                    the predict steps (only valid when the estimator is
+                    arima or sarimax). Other columns in the dataframe
+                    are assumed to be exogenous variables (categorical
+                    or numeric).
+
+        Returns:
+            Preprocessed data in the same format as input (numpy array, DataFrame, etc.).
+
+        Raises:
+            AttributeError: If the model has not been fitted yet.
+
+        Example:
+            ```python
+            automl = AutoML()
+            automl.fit(X_train, y_train, task="classification")
+
+            # Apply task-level preprocessing to new data
+            X_test_preprocessed = automl.preprocess(X_test)
+            ```
+        """
+        if not hasattr(self, "_state") or self._state is None:
+            raise AttributeError("AutoML instance has not been fitted yet. Please call fit() first.")
+        if not hasattr(self, "_transformer"):
+            raise AttributeError("Transformer not initialized. Please call fit() first.")
+
+        return self._state.task.preprocess(X, self._transformer)
+
     def add_learner(self, learner_name, learner_class):
         """Add a customized learner.
 
diff --git a/flaml/automl/model.py b/flaml/automl/model.py
index 0c6c47cec..be99ad8b3 100644
--- a/flaml/automl/model.py
+++ b/flaml/automl/model.py
@@ -295,6 +295,35 @@ class BaseEstimator(sklearn.base.ClassifierMixin, sklearn.base.BaseEstimator):
             train_time = self._fit(X_train, y_train, **kwargs)
         return train_time
 
+    def preprocess(self, X):
+        """Preprocess data using estimator-level preprocessing.
+
+        This method applies estimator-specific preprocessing transformations to the input data.
+        This is the second level of preprocessing that should be applied after task-level
+        preprocessing (automl.preprocess()). Different estimator types may apply different
+        preprocessing steps (e.g., sparse matrix conversion, dataframe handling).
+
+        Args:
+            X: A numpy array or a dataframe of featurized instances, shape n*m.
+
+        Returns:
+            Preprocessed data ready for the estimator's predict/fit methods.
+
+        Example:
+            ```python
+            automl = AutoML()
+            automl.fit(X_train, y_train, task="classification")
+
+            # First apply task-level preprocessing
+            X_test_task = automl.preprocess(X_test)
+
+            # Then apply estimator-level preprocessing
+            estimator = automl.model
+            X_test_estimator = estimator.preprocess(X_test_task)
+            ```
+        """
+        return self._preprocess(X)
+
     def predict(self, X, **kwargs):
         """Predict label from features.
 
diff --git a/test/automl/test_preprocess_api.py b/test/automl/test_preprocess_api.py
new file mode 100644
index 000000000..45b9c6143
--- /dev/null
+++ b/test/automl/test_preprocess_api.py
@@ -0,0 +1,236 @@
+"""Tests for the public preprocessor APIs."""
+import unittest
+
+import numpy as np
+import pandas as pd
+from sklearn.datasets import load_breast_cancer, load_diabetes
+
+from flaml import AutoML
+
+
+class TestPreprocessAPI(unittest.TestCase):
+    """Test cases for the public preprocess() API methods."""
+
+    def test_automl_preprocess_before_fit(self):
+        """Test that calling preprocess before fit raises an error."""
+        automl = AutoML()
+        X_test = np.array([[1, 2, 3], [4, 5, 6]])
+
+        with self.assertRaises(AttributeError) as context:
+            automl.preprocess(X_test)
+        # Check that an error is raised about not being fitted
+        self.assertIn("fit()", str(context.exception))
+
+    def test_automl_preprocess_classification(self):
+        """Test task-level preprocessing for classification."""
+        # Load dataset
+        X, y = load_breast_cancer(return_X_y=True)
+        X_train, y_train = X[:400], y[:400]
+        X_test = X[400:450]
+
+        # Train AutoML
+        automl = AutoML()
+        automl_settings = {
+            "max_iter": 5,
+            "task": "classification",
+            "metric": "accuracy",
+            "estimator_list": ["lgbm"],
+            "verbose": 0,
+        }
+        automl.fit(X_train, y_train, **automl_settings)
+
+        # Test task-level preprocessing
+        X_preprocessed = automl.preprocess(X_test)
+
+        # Verify the output is not None and has the right shape
+        self.assertIsNotNone(X_preprocessed)
+        self.assertEqual(X_preprocessed.shape[0], X_test.shape[0])
+
+    def test_automl_preprocess_regression(self):
+        """Test task-level preprocessing for regression."""
+        # Load dataset
+        X, y = load_diabetes(return_X_y=True)
+        X_train, y_train = X[:300], y[:300]
+        X_test = X[300:350]
+
+        # Train AutoML
+        automl = AutoML()
+        automl_settings = {
+            "max_iter": 5,
+            "task": "regression",
+            "metric": "r2",
+            "estimator_list": ["lgbm"],
+            "verbose": 0,
+        }
+        automl.fit(X_train, y_train, **automl_settings)
+
+        # Test task-level preprocessing
+        X_preprocessed = automl.preprocess(X_test)
+
+        # Verify the output
+        self.assertIsNotNone(X_preprocessed)
+        self.assertEqual(X_preprocessed.shape[0], X_test.shape[0])
+
+    def test_automl_preprocess_with_dataframe(self):
+        """Test task-level preprocessing with pandas DataFrame."""
+        # Create a simple dataset
+        X_train = pd.DataFrame(
+            {
+                "feature1": [1, 2, 3, 4, 5] * 20,
+                "feature2": [5, 4, 3, 2, 1] * 20,
+                "category": ["a", "b", "a", "b", "a"] * 20,
+            }
+        )
+        y_train = pd.Series([0, 1, 0, 1, 0] * 20)
+
+        X_test = pd.DataFrame(
+            {
+                "feature1": [6, 7, 8],
+                "feature2": [1, 2, 3],
+                "category": ["a", "b", "a"],
+            }
+        )
+
+        # Train AutoML
+        automl = AutoML()
+        automl_settings = {
+            "max_iter": 5,
+            "task": "classification",
+            "metric": "accuracy",
+            "estimator_list": ["lgbm"],
+            "verbose": 0,
+        }
+        automl.fit(X_train, y_train, **automl_settings)
+
+        # Test preprocessing
+        X_preprocessed = automl.preprocess(X_test)
+
+        # Verify the output - check the number of rows matches
+        self.assertIsNotNone(X_preprocessed)
+        preprocessed_len = len(X_preprocessed) if hasattr(X_preprocessed, "__len__") else X_preprocessed.shape[0]
+        self.assertEqual(preprocessed_len, len(X_test))
+
+    def test_estimator_preprocess(self):
+        """Test estimator-level preprocessing."""
+        # Load dataset
+        X, y = load_breast_cancer(return_X_y=True)
+        X_train, y_train = X[:400], y[:400]
+        X_test = X[400:450]
+
+        # Train AutoML
+        automl = AutoML()
+        automl_settings = {
+            "max_iter": 5,
+            "task": "classification",
+            "metric": "accuracy",
+            "estimator_list": ["lgbm"],
+            "verbose": 0,
+        }
+        automl.fit(X_train, y_train, **automl_settings)
+
+        # Get the trained estimator
+        estimator = automl.model
+        self.assertIsNotNone(estimator)
+
+        # First apply task-level preprocessing
+        X_task_preprocessed = automl.preprocess(X_test)
+
+        # Then apply estimator-level preprocessing
+        X_estimator_preprocessed = estimator.preprocess(X_task_preprocessed)
+
+        # Verify the output
+        self.assertIsNotNone(X_estimator_preprocessed)
+        self.assertEqual(X_estimator_preprocessed.shape[0], X_test.shape[0])
+
+    def test_preprocess_pipeline(self):
+        """Test the complete preprocessing pipeline (task-level then estimator-level)."""
+        # Load dataset
+        X, y = load_breast_cancer(return_X_y=True)
+        X_train, y_train = X[:400], y[:400]
+        X_test = X[400:450]
+
+        # Train AutoML
+        automl = AutoML()
+        automl_settings = {
+            "max_iter": 5,
+            "task": "classification",
+            "metric": "accuracy",
+            "estimator_list": ["lgbm"],
+            "verbose": 0,
+        }
+        automl.fit(X_train, y_train, **automl_settings)
+
+        # Apply the complete preprocessing pipeline
+        X_task_preprocessed = automl.preprocess(X_test)
+        X_final = automl.model.preprocess(X_task_preprocessed)
+
+        # Verify predictions work with preprocessed data
+        # The internal predict already does this preprocessing,
+        # but we verify our manual preprocessing gives consistent results
+        y_pred_manual = automl.model._model.predict(X_final)
+        y_pred_auto = automl.predict(X_test)
+
+        # Both should give the same predictions
+        np.testing.assert_array_equal(y_pred_manual, y_pred_auto)
+
+    def test_preprocess_with_mixed_types(self):
+        """Test preprocessing with mixed data types."""
+        # Create dataset with mixed types
+        X_train = pd.DataFrame(
+            {
+                "numeric1": np.random.rand(100),
+                "numeric2": np.random.randint(0, 100, 100),
+                "categorical": np.random.choice(["cat", "dog", "bird"], 100),
+                "boolean": np.random.choice([True, False], 100),
+            }
+        )
+        y_train = pd.Series(np.random.randint(0, 2, 100))
+
+        X_test = pd.DataFrame(
+            {
+                "numeric1": np.random.rand(10),
+                "numeric2": np.random.randint(0, 100, 10),
+                "categorical": np.random.choice(["cat", "dog", "bird"], 10),
+                "boolean": np.random.choice([True, False], 10),
+            }
+        )
+
+        # Train AutoML
+        automl = AutoML()
+        automl_settings = {
+            "max_iter": 5,
+            "task": "classification",
+            "metric": "accuracy",
+            "estimator_list": ["lgbm"],
+            "verbose": 0,
+        }
+        automl.fit(X_train, y_train, **automl_settings)
+
+        # Test preprocessing
+        X_preprocessed = automl.preprocess(X_test)
+
+        # Verify the output
+        self.assertIsNotNone(X_preprocessed)
+
+    def test_estimator_preprocess_without_automl(self):
+        """Test that estimator.preprocess() can be used independently."""
+        from flaml.automl.model import LGBMEstimator
+
+        # Create a simple estimator
+        X_train = np.random.rand(100, 5)
+        y_train = np.random.randint(0, 2, 100)
+
+        estimator = LGBMEstimator(task="classification")
+        estimator.fit(X_train, y_train)
+
+        # Test preprocessing
+        X_test = np.random.rand(10, 5)
+        X_preprocessed = estimator.preprocess(X_test)
+
+        # Verify the output
+        self.assertIsNotNone(X_preprocessed)
+        self.assertEqual(X_preprocessed.shape, X_test.shape)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/website/docs/Use-Cases/Task-Oriented-AutoML.md b/website/docs/Use-Cases/Task-Oriented-AutoML.md
index f37af9c37..602602c77 100644
--- a/website/docs/Use-Cases/Task-Oriented-AutoML.md
+++ b/website/docs/Use-Cases/Task-Oriented-AutoML.md
@@ -726,6 +726,64 @@ plt.barh(
 
 ![png](images/feature_importance.png)
 
+### Preprocess data
+
+FLAML provides two levels of preprocessing that can be accessed as public APIs:
+
+1. **Task-level preprocessing** (`automl.preprocess()`): This applies transformations that are specific to the task type, such as handling data types, sparse matrices, and feature transformations learned during training.
+
+1. **Estimator-level preprocessing** (`estimator.preprocess()`): This applies transformations specific to the estimator type (e.g., LightGBM, XGBoost).
+
+The task-level preprocessing should be applied before the estimator-level preprocessing.
+
+#### Task-level preprocessing
+
+```python
+from flaml import AutoML
+import numpy as np
+
+# Train the model
+automl = AutoML()
+automl.fit(X_train, y_train, task="classification", time_budget=60)
+
+# Apply task-level preprocessing to new data
+X_test_preprocessed = automl.preprocess(X_test)
+
+# Pass the task-preprocessed data to the trained estimator's predict()
+predictions = automl.model.predict(X_test_preprocessed)
+```
+
+#### Estimator-level preprocessing
+
+```python
+# Get the trained estimator
+estimator = automl.model
+
+# Apply task-level preprocessing first
+X_test_task = automl.preprocess(X_test)
+
+# Then apply estimator-level preprocessing
+X_test_estimator = estimator.preprocess(X_test_task)
+
+# Use the fully preprocessed data with the underlying model (note: `_model` is a private attribute)
+predictions = estimator._model.predict(X_test_estimator)
+```
+
+#### Complete preprocessing pipeline
+
+For most use cases, `automl.predict()` already handles both levels of preprocessing internally. However, if you need to apply preprocessing separately (e.g., for custom inference pipelines or debugging), you can use:
+
+```python
+# Complete preprocessing pipeline
+X_task_preprocessed = automl.preprocess(X_test)
+X_final = automl.model.preprocess(X_task_preprocessed)
+
+# automl.predict() applies the same two preprocessing steps internally:
+predictions = automl.predict(X_test)
+```
+
+**Note**: The `preprocess()` methods can only be called after `fit()` has been executed, as they rely on the transformations learned during training. Calling `automl.preprocess()` before `fit()` raises an `AttributeError`.
+
 ### Get best configuration
 
 We can find the best estimator's name and best configuration by: