mirror of
https://github.com/microsoft/FLAML.git
synced 2026-02-09 02:09:16 +08:00
Expose task-level and estimator-level preprocessors as public API (#1497)
* Initial plan * Add public preprocess() API methods for AutoML and estimators Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> * Add documentation for preprocess() API methods Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> * Add example script demonstrating preprocess() API usage Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> * Address code review feedback - fix type hints and simplify test logic Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> * Fix formatting issues with pre-commit hooks Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> * Remove example.py, make tests faster --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> Co-authored-by: Li Jiang <bnujli@gmail.com> Co-authored-by: Li Jiang <lijiang1@microsoft.com>
This commit is contained in:
@@ -789,7 +789,7 @@ class AutoML(BaseEstimator):
|
||||
|
||||
def predict(
|
||||
self,
|
||||
X: np.array | DataFrame | list[str] | list[list[str]] | psDataFrame,
|
||||
X: np.ndarray | DataFrame | list[str] | list[list[str]] | psDataFrame,
|
||||
**pred_kwargs,
|
||||
):
|
||||
"""Predict label from features.
|
||||
@@ -855,6 +855,50 @@ class AutoML(BaseEstimator):
|
||||
proba = self._trained_estimator.predict_proba(X, **pred_kwargs)
|
||||
return proba
|
||||
|
||||
def preprocess(
    self,
    X: "np.ndarray | DataFrame | list[str] | list[list[str]] | psDataFrame",
):
    """Apply the task-level preprocessing learned during ``fit`` to new data.

    This is the first of the two preprocessing stages: it handles data
    types, sparse matrices, and the feature transformations fitted on the
    training data. Call it before any estimator-level preprocessing
    (``estimator.preprocess()``).

    Args:
        X: A numpy array or pandas dataframe or pyspark.pandas dataframe
            of featurized instances, shape n * m,
            or for time series forecast tasks:
            a pandas dataframe with the first column containing
            timestamp values (datetime type) or an integer n for
            the predict steps (only valid when the estimator is
            arima or sarimax). Other columns in the dataframe
            are assumed to be exogenous variables (categorical
            or numeric).

    Returns:
        Preprocessed data in the same format as input (numpy array, DataFrame, etc.).

    Raises:
        AttributeError: If the model has not been fitted yet.

    Example:

    ```python
    automl = AutoML()
    automl.fit(X_train, y_train, task="classification")

    # Apply task-level preprocessing to new data
    X_test_preprocessed = automl.preprocess(X_test)
    ```
    """
    # getattr with a default covers both "attribute missing" and "set to None"
    # in a single check.
    state = getattr(self, "_state", None)
    if state is None:
        raise AttributeError("AutoML instance has not been fitted yet. Please call fit() first.")
    if not hasattr(self, "_transformer"):
        raise AttributeError("Transformer not initialized. Please call fit() first.")

    # Delegate to the task object, which owns the task-specific transformation.
    return state.task.preprocess(X, self._transformer)
|
||||
|
||||
def add_learner(self, learner_name, learner_class):
|
||||
"""Add a customized learner.
|
||||
|
||||
|
||||
@@ -295,6 +295,35 @@ class BaseEstimator(sklearn.base.ClassifierMixin, sklearn.base.BaseEstimator):
|
||||
train_time = self._fit(X_train, y_train, **kwargs)
|
||||
return train_time
|
||||
|
||||
def preprocess(self, X):
    """Apply estimator-specific preprocessing to task-level-preprocessed data.

    This is the second preprocessing stage, applied after the task-level
    stage (``automl.preprocess()``). What it does depends on the estimator
    type — for example sparse-matrix conversion or dataframe handling.

    Args:
        X: A numpy array or a dataframe of featurized instances, shape n*m.

    Returns:
        Preprocessed data ready for the estimator's predict/fit methods.

    Example:

    ```python
    automl = AutoML()
    automl.fit(X_train, y_train, task="classification")

    # First apply task-level preprocessing
    X_test_task = automl.preprocess(X_test)

    # Then apply estimator-level preprocessing
    estimator = automl.model
    X_test_estimator = estimator.preprocess(X_test_task)
    ```
    """
    # Delegate to the private per-estimator hook.
    prepared = self._preprocess(X)
    return prepared
|
||||
|
||||
def predict(self, X, **kwargs):
|
||||
"""Predict label from features.
|
||||
|
||||
|
||||
236
test/automl/test_preprocess_api.py
Normal file
236
test/automl/test_preprocess_api.py
Normal file
@@ -0,0 +1,236 @@
|
||||
"""Tests for the public preprocessor APIs."""
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.datasets import load_breast_cancer, load_diabetes
|
||||
|
||||
from flaml import AutoML
|
||||
|
||||
|
||||
class TestPreprocessAPI(unittest.TestCase):
    """Test cases for the public preprocess() API methods.

    Covers the task-level API (``AutoML.preprocess``), the estimator-level
    API (``BaseEstimator.preprocess``), and their composition against the
    behavior of ``AutoML.predict``.
    """

    def setUp(self):
        # Seed the global NumPy RNG so tests that draw random data
        # (mixed-types and standalone-estimator tests) are reproducible.
        np.random.seed(42)

    @staticmethod
    def _automl_settings(task, metric):
        """Return the shared AutoML fit settings (tiny budget keeps tests fast)."""
        return {
            "max_iter": 5,
            "task": task,
            "metric": metric,
            "estimator_list": ["lgbm"],
            "verbose": 0,
        }

    def test_automl_preprocess_before_fit(self):
        """Test that calling preprocess before fit raises an error."""
        automl = AutoML()
        X_test = np.array([[1, 2, 3], [4, 5, 6]])

        with self.assertRaises(AttributeError) as context:
            automl.preprocess(X_test)
        # The error message should point the user at fit().
        self.assertIn("fit()", str(context.exception))

    def test_automl_preprocess_classification(self):
        """Test task-level preprocessing for classification."""
        X, y = load_breast_cancer(return_X_y=True)
        X_train, y_train = X[:400], y[:400]
        X_test = X[400:450]

        automl = AutoML()
        automl.fit(X_train, y_train, **self._automl_settings("classification", "accuracy"))

        X_preprocessed = automl.preprocess(X_test)

        # Preprocessing must keep the number of rows intact.
        self.assertIsNotNone(X_preprocessed)
        self.assertEqual(X_preprocessed.shape[0], X_test.shape[0])

    def test_automl_preprocess_regression(self):
        """Test task-level preprocessing for regression."""
        X, y = load_diabetes(return_X_y=True)
        X_train, y_train = X[:300], y[:300]
        X_test = X[300:350]

        automl = AutoML()
        automl.fit(X_train, y_train, **self._automl_settings("regression", "r2"))

        X_preprocessed = automl.preprocess(X_test)

        self.assertIsNotNone(X_preprocessed)
        self.assertEqual(X_preprocessed.shape[0], X_test.shape[0])

    def test_automl_preprocess_with_dataframe(self):
        """Test task-level preprocessing with pandas DataFrame."""
        X_train = pd.DataFrame(
            {
                "feature1": [1, 2, 3, 4, 5] * 20,
                "feature2": [5, 4, 3, 2, 1] * 20,
                "category": ["a", "b", "a", "b", "a"] * 20,
            }
        )
        y_train = pd.Series([0, 1, 0, 1, 0] * 20)

        X_test = pd.DataFrame(
            {
                "feature1": [6, 7, 8],
                "feature2": [1, 2, 3],
                "category": ["a", "b", "a"],
            }
        )

        automl = AutoML()
        automl.fit(X_train, y_train, **self._automl_settings("classification", "accuracy"))

        X_preprocessed = automl.preprocess(X_test)

        # Row count must match; the output container type may differ from the input.
        self.assertIsNotNone(X_preprocessed)
        preprocessed_len = len(X_preprocessed) if hasattr(X_preprocessed, "__len__") else X_preprocessed.shape[0]
        self.assertEqual(preprocessed_len, len(X_test))

    def test_estimator_preprocess(self):
        """Test estimator-level preprocessing."""
        X, y = load_breast_cancer(return_X_y=True)
        X_train, y_train = X[:400], y[:400]
        X_test = X[400:450]

        automl = AutoML()
        automl.fit(X_train, y_train, **self._automl_settings("classification", "accuracy"))

        estimator = automl.model
        self.assertIsNotNone(estimator)

        # Task-level first, then estimator-level: the documented order.
        X_task_preprocessed = automl.preprocess(X_test)
        X_estimator_preprocessed = estimator.preprocess(X_task_preprocessed)

        self.assertIsNotNone(X_estimator_preprocessed)
        self.assertEqual(X_estimator_preprocessed.shape[0], X_test.shape[0])

    def test_preprocess_pipeline(self):
        """Test the complete preprocessing pipeline (task-level then estimator-level)."""
        X, y = load_breast_cancer(return_X_y=True)
        X_train, y_train = X[:400], y[:400]
        X_test = X[400:450]

        automl = AutoML()
        automl.fit(X_train, y_train, **self._automl_settings("classification", "accuracy"))

        X_task_preprocessed = automl.preprocess(X_test)
        X_final = automl.model.preprocess(X_task_preprocessed)

        # predict() performs both preprocessing steps internally; the manual
        # pipeline above must therefore yield identical predictions.
        y_pred_manual = automl.model._model.predict(X_final)
        y_pred_auto = automl.predict(X_test)
        np.testing.assert_array_equal(y_pred_manual, y_pred_auto)

    def test_preprocess_with_mixed_types(self):
        """Test preprocessing with mixed data types."""
        X_train = pd.DataFrame(
            {
                "numeric1": np.random.rand(100),
                "numeric2": np.random.randint(0, 100, 100),
                "categorical": np.random.choice(["cat", "dog", "bird"], 100),
                "boolean": np.random.choice([True, False], 100),
            }
        )
        y_train = pd.Series(np.random.randint(0, 2, 100))

        X_test = pd.DataFrame(
            {
                "numeric1": np.random.rand(10),
                "numeric2": np.random.randint(0, 100, 10),
                "categorical": np.random.choice(["cat", "dog", "bird"], 10),
                "boolean": np.random.choice([True, False], 10),
            }
        )

        automl = AutoML()
        automl.fit(X_train, y_train, **self._automl_settings("classification", "accuracy"))

        X_preprocessed = automl.preprocess(X_test)
        self.assertIsNotNone(X_preprocessed)

    def test_estimator_preprocess_without_automl(self):
        """Test that estimator.preprocess() can be used independently."""
        from flaml.automl.model import LGBMEstimator

        X_train = np.random.rand(100, 5)
        y_train = np.random.randint(0, 2, 100)

        estimator = LGBMEstimator(task="classification")
        estimator.fit(X_train, y_train)

        X_test = np.random.rand(10, 5)
        X_preprocessed = estimator.preprocess(X_test)

        self.assertIsNotNone(X_preprocessed)
        self.assertEqual(X_preprocessed.shape, X_test.shape)


if __name__ == "__main__":
    unittest.main()
|
||||
@@ -726,6 +726,64 @@ plt.barh(
|
||||
|
||||

|
||||
|
||||
### Preprocess data
|
||||
|
||||
FLAML provides two levels of preprocessing that can be accessed as public APIs:
|
||||
|
||||
1. **Task-level preprocessing** (`automl.preprocess()`): This applies transformations that are specific to the task type, such as handling data types, sparse matrices, and feature transformations learned during training.
|
||||
|
||||
2. **Estimator-level preprocessing** (`estimator.preprocess()`): This applies transformations specific to the estimator type (e.g., LightGBM, XGBoost).
|
||||
|
||||
The task-level preprocessing should be applied before the estimator-level preprocessing.
|
||||
|
||||
#### Task-level preprocessing
|
||||
|
||||
```python
|
||||
from flaml import AutoML
|
||||
import numpy as np
|
||||
|
||||
# Train the model
|
||||
automl = AutoML()
|
||||
automl.fit(X_train, y_train, task="classification", time_budget=60)
|
||||
|
||||
# Apply task-level preprocessing to new data
|
||||
X_test_preprocessed = automl.preprocess(X_test)
|
||||
|
||||
# Now you can use this with the estimator
|
||||
predictions = automl.model.predict(X_test_preprocessed)
|
||||
```
|
||||
|
||||
#### Estimator-level preprocessing
|
||||
|
||||
```python
|
||||
# Get the trained estimator
|
||||
estimator = automl.model
|
||||
|
||||
# Apply task-level preprocessing first
|
||||
X_test_task = automl.preprocess(X_test)
|
||||
|
||||
# Then apply estimator-level preprocessing
|
||||
X_test_estimator = estimator.preprocess(X_test_task)
|
||||
|
||||
# Use the fully preprocessed data with the underlying model
|
||||
predictions = estimator._model.predict(X_test_estimator)
|
||||
```
|
||||
|
||||
#### Complete preprocessing pipeline
|
||||
|
||||
For most use cases, the `predict()` method already handles both levels of preprocessing internally. However, if you need to apply preprocessing separately (e.g., for custom inference pipelines or debugging), you can use:
|
||||
|
||||
```python
|
||||
# Complete preprocessing pipeline
|
||||
X_task_preprocessed = automl.preprocess(X_test)
|
||||
X_final = automl.model.preprocess(X_task_preprocessed)
|
||||
|
||||
# This is equivalent to what happens internally in:
|
||||
predictions = automl.predict(X_test)
|
||||
```
|
||||
|
||||
**Note**: The `preprocess()` methods can only be called after `fit()` has been executed, as they rely on the transformations learned during training.
|
||||
|
||||
### Get best configuration
|
||||
|
||||
We can find the best estimator's name and best configuration by:
|
||||
|
||||
Reference in New Issue
Block a user