From 7ec1414e9b109094daff4dbebe005cddc4929d17 Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Wed, 21 Jan 2026 14:19:23 +0800 Subject: [PATCH] Clarify period parameter and automatic label lagging in time series forecasting (#1495) * Initial plan * Add comprehensive documentation for period parameter and automatic label lagging Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> * Address code review feedback on docstring clarity Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> * Clarify period vs prediction output length per @thinkall's feedback Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> * Refine terminology per code review feedback Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> * Run pre-commit formatting fixes Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com> Co-authored-by: Li Jiang --- flaml/automl/automl.py | 16 ++++++++++ flaml/automl/time_series/sklearn.py | 31 ++++++++++++++----- .../Examples/AutoML-Time series forecast.md | 19 ++++++++++++ 3 files changed, 59 insertions(+), 7 deletions(-) diff --git a/flaml/automl/automl.py b/flaml/automl/automl.py index 493045022..1bfdc2c16 100644 --- a/flaml/automl/automl.py +++ b/flaml/automl/automl.py @@ -1013,6 +1013,14 @@ class AutoML(BaseEstimator): the searched learners, such as sample_weight. Below are a few examples of estimator-specific parameters: period: int | forecast horizon for all time series forecast tasks. + This is the number of time steps ahead to forecast (e.g., period=12 means + forecasting 12 steps into the future). This represents the forecast horizon + used during model training. Note: during prediction, the output length + equals the length of X_test. 
FLAML automatically handles feature + engineering for you - sklearn-based models (lgbm, rf, xgboost, etc.) will have + lagged features created automatically, while time series native models (prophet, + arima, sarimax) use their built-in forecasting capabilities. You do NOT need + to manually create lagged features of the target variable. gpu_per_trial: float, default = 0 | A float of the number of gpus per trial, only used by TransformersEstimator, XGBoostSklearnEstimator, and TemporalFusionTransformerEstimator. @@ -2107,6 +2115,14 @@ class AutoML(BaseEstimator): the searched learners, such as sample_weight. Below are a few examples of estimator-specific parameters: period: int | forecast horizon for all time series forecast tasks. + This is the number of time steps ahead to forecast (e.g., period=12 means + forecasting 12 steps into the future). This represents the forecast horizon + used during model training. Note: during prediction, the output length + equals the length of X_test. FLAML automatically handles feature + engineering for you - sklearn-based models (lgbm, rf, xgboost, etc.) will have + lagged features created automatically, while time series native models (prophet, + arima, sarimax) use their built-in forecasting capabilities. You do NOT need + to manually create lagged features of the target variable. gpu_per_trial: float, default = 0 | A float of the number of gpus per trial, only used by TransformersEstimator, XGBoostSklearnEstimator, and TemporalFusionTransformerEstimator. diff --git a/flaml/automl/time_series/sklearn.py b/flaml/automl/time_series/sklearn.py index ebe18ed74..eb7172b90 100644 --- a/flaml/automl/time_series/sklearn.py +++ b/flaml/automl/time_series/sklearn.py @@ -17,24 +17,30 @@ from sklearn.preprocessing import StandardScaler def make_lag_features(X: pd.DataFrame, y: pd.Series, lags: int): - """Transform input data X, y into autoregressive form - shift - them appropriately based on horizon and create `lags` columns. 
+ """Transform input data X, y into autoregressive form by creating `lags` columns. + + This function is called automatically by FLAML during the training process + to convert time series data into a format suitable for sklearn-based regression + models (e.g., lgbm, rf, xgboost). Users do NOT need to manually call this function + or create lagged features themselves. Parameters ---------- X : pandas.DataFrame - Input features. + Input feature DataFrame, which may contain temporal features and/or exogenous variables. y : array_like, (1d) - Target vector. + Target vector (time series values to forecast). - horizon : int - length of X for `predict` method + lags : int + Number of lagged time steps to use as features. Returns ------- pandas.DataFrame - shifted dataframe with `lags` columns + Shifted dataframe with `lags` columns for each original feature. + The target variable y is also lagged to prevent data leakage + (i.e., we use y(t-1), y(t-2), ..., y(t-lags) to predict y(t)). """ lag_features = [] @@ -55,6 +61,17 @@ def make_lag_features(X: pd.DataFrame, y: pd.Series, lags: int): class SklearnWrapper: + """Wrapper class for using sklearn-based models for time series forecasting. + + This wrapper automatically handles the transformation of time series data into + a supervised learning format by creating lagged features. It trains separate + models for each step in the forecast horizon. + + Users typically don't interact with this class directly - it's used internally + by FLAML when sklearn-based estimators (lgbm, rf, xgboost, etc.) are selected + for time series forecasting tasks. + """ + def __init__( self, model_class: type, diff --git a/website/docs/Examples/AutoML-Time series forecast.md b/website/docs/Examples/AutoML-Time series forecast.md index 2214d9f0b..9e28f1004 100644 --- a/website/docs/Examples/AutoML-Time series forecast.md +++ b/website/docs/Examples/AutoML-Time series forecast.md @@ -8,6 +8,25 @@ Install the [automl,ts_forecast] option. 
pip install "flaml[automl,ts_forecast]" ``` +### Understanding the `period` Parameter + +The `period` parameter (also called **horizon** in the code) specifies the **forecast horizon** - the number of future time steps the model is trained to predict. For example: + +- `period=12` means you want to forecast 12 time steps ahead (e.g., 12 months, 12 days) +- `period=7` means you want to forecast 7 time steps ahead + +**Important Note on Prediction**: During the prediction stage, the output length equals the length of `X_test`. This means you can generate predictions for any number of time steps by providing the corresponding timestamps in `X_test`, regardless of the `period` value used during training. + +#### Automatic Feature Engineering + +**Important**: You do NOT need to manually lag the target variable before training. FLAML handles this automatically: + +- **For sklearn-based models** (lgbm, rf, xgboost, extra_tree, catboost): FLAML automatically creates lagged features of both the target variable and any exogenous variables. This transforms the time series forecasting problem into a standard supervised learning problem. + +- **For native time series models** (prophet, arima, sarimax, holt-winters): These models have built-in time series forecasting capabilities and handle temporal dependencies natively. + +The automatic lagging is implemented internally when you call `automl.fit()` with `task="ts_forecast"` or `task="ts_forecast_classification"`, so you can focus on providing clean input data without worrying about feature engineering. + ### Simple NumPy Example ```python