Expose task-level and estimator-level preprocessors as public API (#1497)

* Initial plan

* Add public preprocess() API methods for AutoML and estimators

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>

* Add documentation for preprocess() API methods

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>

* Add example script demonstrating preprocess() API usage

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>

* Address code review feedback - fix type hints and simplify test logic

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>

* Fix formatting issues with pre-commit hooks

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>

* Remove example.py, make tests faster

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>
Co-authored-by: Li Jiang <bnujli@gmail.com>
Co-authored-by: Li Jiang <lijiang1@microsoft.com>
This commit is contained in:
Copilot
2026-01-21 14:38:25 +08:00
committed by GitHub
parent 7ec1414e9b
commit d9e74031e0
4 changed files with 368 additions and 1 deletions

View File

@@ -789,7 +789,7 @@ class AutoML(BaseEstimator):
def predict(
self,
X: np.array | DataFrame | list[str] | list[list[str]] | psDataFrame,
X: np.ndarray | DataFrame | list[str] | list[list[str]] | psDataFrame,
**pred_kwargs,
):
"""Predict label from features.
@@ -855,6 +855,50 @@ class AutoML(BaseEstimator):
proba = self._trained_estimator.predict_proba(X, **pred_kwargs)
return proba
def preprocess(
    self,
    X: np.ndarray | DataFrame | list[str] | list[list[str]] | psDataFrame,
):
    """Run the task-level preprocessing learned during fit() on new data.

    This is the first of the two preprocessing stages and should be applied
    before any estimator-level preprocessing. The work is delegated to the
    fitted task's `preprocess()`, which receives `X` together with the
    transformer stored on this instance.

    Args:
        X: A numpy array or pandas dataframe or pyspark.pandas dataframe
            of featurized instances, shape n * m,
            or for time series forecast tasks:
            a pandas dataframe with the first column containing
            timestamp values (datetime type) or an integer n for
            the predict steps (only valid when the estimator is
            arima or sarimax). Other columns in the dataframe
            are assumed to be exogenous variables (categorical
            or numeric).

    Returns:
        The data returned by the task's preprocessing step.

    Raises:
        AttributeError: If the model has not been fitted yet.

    Example:
        ```python
        automl = AutoML()
        automl.fit(X_train, y_train, task="classification")

        # Apply task-level preprocessing to new data
        X_test_preprocessed = automl.preprocess(X_test)
        ```
    """
    # A missing or None state both mean fit() has never populated this instance.
    state = getattr(self, "_state", None)
    if state is None:
        raise AttributeError("AutoML instance has not been fitted yet. Please call fit() first.")

    # A private sentinel distinguishes "attribute absent" from any stored value,
    # including None.
    unset = object()
    transformer = getattr(self, "_transformer", unset)
    if transformer is unset:
        raise AttributeError("Transformer not initialized. Please call fit() first.")

    return state.task.preprocess(X, transformer)
def add_learner(self, learner_name, learner_class):
"""Add a customized learner.

View File

@@ -295,6 +295,35 @@ class BaseEstimator(sklearn.base.ClassifierMixin, sklearn.base.BaseEstimator):
train_time = self._fit(X_train, y_train, **kwargs)
return train_time
def preprocess(self, X):
    """Apply this estimator's own preprocessing to the given data.

    This is the second preprocessing stage: it is meant to run on data that
    has already gone through task-level preprocessing (automl.preprocess()).
    What happens here depends on the estimator type (e.g., sparse matrix
    conversion, dataframe handling); the work is delegated to the
    estimator's internal `_preprocess()`.

    Args:
        X: A numpy array or a dataframe of featurized instances, shape n*m.

    Returns:
        Preprocessed data ready for the estimator's predict/fit methods.

    Example:
        ```python
        automl = AutoML()
        automl.fit(X_train, y_train, task="classification")

        # Stage 1: task-level preprocessing
        X_test_task = automl.preprocess(X_test)

        # Stage 2: estimator-level preprocessing
        estimator = automl.model
        X_test_estimator = estimator.preprocess(X_test_task)
        ```
    """
    estimator_ready = self._preprocess(X)
    return estimator_ready
def predict(self, X, **kwargs):
"""Predict label from features.

View File

@@ -0,0 +1,236 @@
"""Tests for the public preprocessor APIs."""
import unittest
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer, load_diabetes
from flaml import AutoML
class TestPreprocessAPI(unittest.TestCase):
    """Test cases for the public preprocess() API methods.

    Covers the task-level `AutoML.preprocess()` and the estimator-level
    `estimator.preprocess()`, both on their own and chained together.
    """

    @staticmethod
    def _fit_automl(X_train, y_train, task="classification", metric="accuracy"):
        """Return an AutoML instance fitted with a short LightGBM-only search.

        Every test uses the same tiny search budget so the suite stays fast;
        only the task and metric vary between tests.
        """
        automl = AutoML()
        automl.fit(
            X_train,
            y_train,
            max_iter=5,
            task=task,
            metric=metric,
            estimator_list=["lgbm"],
            verbose=0,
        )
        return automl

    @staticmethod
    def _breast_cancer_split():
        """Return (X_train, y_train, X_test) slices of the breast cancer dataset."""
        X, y = load_breast_cancer(return_X_y=True)
        return X[:400], y[:400], X[400:450]

    def test_automl_preprocess_before_fit(self):
        """Test that calling preprocess before fit raises an error."""
        automl = AutoML()
        X_test = np.array([[1, 2, 3], [4, 5, 6]])

        with self.assertRaises(AttributeError) as context:
            automl.preprocess(X_test)

        # Check that an error is raised about not being fitted
        self.assertIn("fit()", str(context.exception))

    def test_automl_preprocess_classification(self):
        """Test task-level preprocessing for classification."""
        X_train, y_train, X_test = self._breast_cancer_split()
        automl = self._fit_automl(X_train, y_train)

        # Test task-level preprocessing
        X_preprocessed = automl.preprocess(X_test)

        # Verify the output is not None and has the right shape
        self.assertIsNotNone(X_preprocessed)
        self.assertEqual(X_preprocessed.shape[0], X_test.shape[0])

    def test_automl_preprocess_regression(self):
        """Test task-level preprocessing for regression."""
        X, y = load_diabetes(return_X_y=True)
        X_train, y_train = X[:300], y[:300]
        X_test = X[300:350]
        automl = self._fit_automl(X_train, y_train, task="regression", metric="r2")

        # Test task-level preprocessing
        X_preprocessed = automl.preprocess(X_test)

        # Verify the output
        self.assertIsNotNone(X_preprocessed)
        self.assertEqual(X_preprocessed.shape[0], X_test.shape[0])

    def test_automl_preprocess_with_dataframe(self):
        """Test task-level preprocessing with pandas DataFrame."""
        # Create a simple dataset
        X_train = pd.DataFrame(
            {
                "feature1": [1, 2, 3, 4, 5] * 20,
                "feature2": [5, 4, 3, 2, 1] * 20,
                "category": ["a", "b", "a", "b", "a"] * 20,
            }
        )
        y_train = pd.Series([0, 1, 0, 1, 0] * 20)
        X_test = pd.DataFrame(
            {
                "feature1": [6, 7, 8],
                "feature2": [1, 2, 3],
                "category": ["a", "b", "a"],
            }
        )
        automl = self._fit_automl(X_train, y_train)

        # Test preprocessing
        X_preprocessed = automl.preprocess(X_test)

        # Verify the output - check the number of rows matches.
        # Use .shape[0] rather than len(): arrays, DataFrames and sparse
        # matrices all expose .shape, whereas sparse matrices define __len__
        # only to raise TypeError, so a hasattr("__len__") probe is unreliable.
        self.assertIsNotNone(X_preprocessed)
        self.assertEqual(X_preprocessed.shape[0], len(X_test))

    def test_estimator_preprocess(self):
        """Test estimator-level preprocessing."""
        X_train, y_train, X_test = self._breast_cancer_split()
        automl = self._fit_automl(X_train, y_train)

        # Get the trained estimator
        estimator = automl.model
        self.assertIsNotNone(estimator)

        # First apply task-level preprocessing
        X_task_preprocessed = automl.preprocess(X_test)

        # Then apply estimator-level preprocessing
        X_estimator_preprocessed = estimator.preprocess(X_task_preprocessed)

        # Verify the output
        self.assertIsNotNone(X_estimator_preprocessed)
        self.assertEqual(X_estimator_preprocessed.shape[0], X_test.shape[0])

    def test_preprocess_pipeline(self):
        """Test the complete preprocessing pipeline (task-level then estimator-level)."""
        X_train, y_train, X_test = self._breast_cancer_split()
        automl = self._fit_automl(X_train, y_train)

        # Apply the complete preprocessing pipeline
        X_task_preprocessed = automl.preprocess(X_test)
        X_final = automl.model.preprocess(X_task_preprocessed)

        # Verify predictions work with preprocessed data
        # The internal predict already does this preprocessing,
        # but we verify our manual preprocessing gives consistent results
        y_pred_manual = automl.model._model.predict(X_final)
        y_pred_auto = automl.predict(X_test)

        # Both should give the same predictions
        np.testing.assert_array_equal(y_pred_manual, y_pred_auto)

    def test_preprocess_with_mixed_types(self):
        """Test preprocessing with mixed data types."""
        # Seeded generator so the synthetic data is identical on every run.
        rng = np.random.RandomState(0)

        # Create dataset with mixed types
        X_train = pd.DataFrame(
            {
                "numeric1": rng.rand(100),
                "numeric2": rng.randint(0, 100, 100),
                "categorical": rng.choice(["cat", "dog", "bird"], 100),
                "boolean": rng.choice([True, False], 100),
            }
        )
        y_train = pd.Series(rng.randint(0, 2, 100))
        X_test = pd.DataFrame(
            {
                "numeric1": rng.rand(10),
                "numeric2": rng.randint(0, 100, 10),
                "categorical": rng.choice(["cat", "dog", "bird"], 10),
                "boolean": rng.choice([True, False], 10),
            }
        )
        automl = self._fit_automl(X_train, y_train)

        # Test preprocessing
        X_preprocessed = automl.preprocess(X_test)

        # Verify the output
        self.assertIsNotNone(X_preprocessed)

    def test_estimator_preprocess_without_automl(self):
        """Test that estimator.preprocess() can be used independently."""
        from flaml.automl.model import LGBMEstimator

        # Seeded generator so the synthetic data is identical on every run.
        rng = np.random.RandomState(0)

        # Create a simple estimator
        X_train = rng.rand(100, 5)
        y_train = rng.randint(0, 2, 100)
        estimator = LGBMEstimator(task="classification")
        estimator.fit(X_train, y_train)

        # Test preprocessing
        X_test = rng.rand(10, 5)
        X_preprocessed = estimator.preprocess(X_test)

        # Verify the output
        self.assertIsNotNone(X_preprocessed)
        self.assertEqual(X_preprocessed.shape, X_test.shape)
# Allow running this test module directly as a script, in addition to
# discovery through a test runner.
if __name__ == "__main__":
    unittest.main()

View File

@@ -726,6 +726,64 @@ plt.barh(
![png](images/feature_importance.png)
### Preprocess data
FLAML provides two levels of preprocessing that can be accessed as public APIs:
1. **Task-level preprocessing** (`automl.preprocess()`): This applies transformations that are specific to the task type, such as handling data types, sparse matrices, and feature transformations learned during training.
1. **Estimator-level preprocessing** (`estimator.preprocess()`): This applies transformations specific to the estimator type (e.g., LightGBM, XGBoost).
The task-level preprocessing should be applied before the estimator-level preprocessing.
#### Task-level preprocessing
```python
from flaml import AutoML
import numpy as np
# Train the model
automl = AutoML()
automl.fit(X_train, y_train, task="classification", time_budget=60)
# Apply task-level preprocessing to new data
X_test_preprocessed = automl.preprocess(X_test)
# The trained estimator can consume this directly: its predict() applies the
# estimator-level preprocessing itself
predictions = automl.model.predict(X_test_preprocessed)
```
#### Estimator-level preprocessing
```python
# Get the trained estimator
estimator = automl.model
# Apply task-level preprocessing first
X_test_task = automl.preprocess(X_test)
# Then apply estimator-level preprocessing
X_test_estimator = estimator.preprocess(X_test_task)
# Use the fully preprocessed data with the underlying model
predictions = estimator._model.predict(X_test_estimator)
```
#### Complete preprocessing pipeline
For most use cases, the `predict()` method already handles both levels of preprocessing internally. However, if you need to apply preprocessing separately (e.g., for custom inference pipelines or debugging), you can use:
```python
# Complete preprocessing pipeline
X_task_preprocessed = automl.preprocess(X_test)
X_final = automl.model.preprocess(X_task_preprocessed)
# automl.predict() performs these same two preprocessing steps internally
# before handing the data to the underlying model:
predictions = automl.predict(X_test)
```
**Note**: The `preprocess()` methods can only be called after `fit()` has been executed, as they rely on the transformations learned during training.
### Get best configuration
We can find the best estimator's name and best configuration by: