mirror of
https://github.com/microsoft/FLAML.git
synced 2026-02-09 02:09:16 +08:00
time series forecasting with panel datasets (#541)
* time series forecasting with panel datasets: integrate Temporal Fusion Transformer as a learner based on pytorch-forecasting
* update setup.py
* update test_forecast.py: remove blank lines, add a performance test, use 'fit_kwargs_by_estimator'
* update model.py to prevent errors
* update automl.py and data.py: change the forecast task name, update the documentation for the fit() method
* add a time index function
* update data.py to prevent a type error
* update for pytorch-forecasting tft on panel datasets
* rename the estimator and add 'gpu_per_trial' for the tft estimator
* include time series panel forecasting as an example
* update documentation and automl_time_series_forecast.ipynb
* remove the deprecated "weights_summary" argument for pl.Trainer()
* update the model.py tft estimator prediction method
* update the `fit_kwargs` documentation

Signed-off-by: Kevin Chen <chenkevin.8787@gmail.com>
Co-authored-by: Chi Wang <wang.chi@microsoft.com>
flaml/automl.py (141 changed lines)
@@ -44,6 +44,8 @@ from .data import (
     TOKENCLASSIFICATION,
     TS_FORECAST,
+    TS_FORECASTREGRESSION,
+    TS_FORECASTPANEL,
     TS_TIMESTAMP_COL,
     REGRESSION,
     _is_nlp_task,
     NLG_TASKS,
@@ -582,7 +584,7 @@ class AutoML(BaseEstimator):
                 ["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
                 For regression tasks, valid choices are ["auto", 'uniform', 'time'].
                 "auto" -> uniform.
-                For ts_forecast tasks, must be "auto" or 'time'.
+                For time series forecast tasks, must be "auto" or 'time'.
                 For ranking task, must be "auto" or 'group'.
             hpo_method: str, default="auto" | The hyperparameter
                 optimization method. By default, CFO is used for sequential
@@ -897,7 +899,7 @@ class AutoML(BaseEstimator):

         Args:
             X: A numpy array of featurized instances, shape n * m,
-                or for ts_forecast tasks:
+                or for time series forecast tasks:
                 a pandas dataframe with the first column containing
                 timestamp values (datetime type) or an integer n for
                 the predict steps (only valid when the estimator is
@@ -1275,18 +1277,38 @@ class AutoML(BaseEstimator):
         # if eval_method = holdout, make holdout data
         if self._split_type == "time":
             if self._state.task in TS_FORECAST:
-                num_samples = X_train_all.shape[0]
                 period = self._state.fit_kwargs[
                     "period"
                 ]  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
-                assert (
-                    period < num_samples
-                ), f"period={period}>#examples={num_samples}"
-                split_idx = num_samples - period
-                X_train = X_train_all[:split_idx]
-                y_train = y_train_all[:split_idx]
-                X_val = X_train_all[split_idx:]
-                y_val = y_train_all[split_idx:]
+                if self._state.task == TS_FORECASTPANEL:
+                    X_train_all["time_idx"] -= X_train_all["time_idx"].min()
+                    X_train_all["time_idx"] = X_train_all["time_idx"].astype("int")
+                    ids = self._state.fit_kwargs["group_ids"].copy()
+                    ids.append(TS_TIMESTAMP_COL)
+                    ids.append("time_idx")
+                    y_train_all = pd.DataFrame(y_train_all)
+                    y_train_all[ids] = X_train_all[ids]
+                    X_train_all = X_train_all.sort_values(ids)
+                    y_train_all = y_train_all.sort_values(ids)
+                    training_cutoff = X_train_all["time_idx"].max() - period
+                    X_train = X_train_all[lambda x: x.time_idx <= training_cutoff]
+                    y_train = y_train_all[
+                        lambda x: x.time_idx <= training_cutoff
+                    ].drop(columns=ids)
+                    X_val = X_train_all[lambda x: x.time_idx > training_cutoff]
+                    y_val = y_train_all[
+                        lambda x: x.time_idx > training_cutoff
+                    ].drop(columns=ids)
+                else:
+                    num_samples = X_train_all.shape[0]
+                    assert (
+                        period < num_samples
+                    ), f"period={period}>#examples={num_samples}"
+                    split_idx = num_samples - period
+                    X_train = X_train_all[:split_idx]
+                    y_train = y_train_all[:split_idx]
+                    X_val = X_train_all[split_idx:]
+                    y_val = y_train_all[split_idx:]
             else:
                 if (
                     "sample_weight" in self._state.fit_kwargs
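For orientation, the panel holdout rule above keeps the last `period` time steps of every series for validation. A minimal sketch with hypothetical data (not FLAML code):

```python
# Minimal sketch (hypothetical data) of the panel holdout rule above:
# the last `period` time steps of each series form the validation set.
import pandas as pd

period = 2
df = pd.DataFrame(
    {
        "store": ["a"] * 5 + ["b"] * 5,  # plays the role of group_ids
        "time_idx": list(range(5)) * 2,
        "y": range(10),
    }
)
training_cutoff = df["time_idx"].max() - period  # 4 - 2 = 2
train = df[df.time_idx <= training_cutoff]  # 3 steps per store
val = df[df.time_idx > training_cutoff]  # last 2 steps per store
print(len(train), len(val))  # 6 4
```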
@@ -1456,7 +1478,10 @@ class AutoML(BaseEstimator):
                 )
         elif self._split_type == "time":
             # logger.info("Using TimeSeriesSplit")
-            if self._state.task in TS_FORECAST:
+            if (
+                self._state.task in TS_FORECAST
+                and self._state.task is not TS_FORECASTPANEL
+            ):
                 period = self._state.fit_kwargs[
                     "period"
                 ]  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
@@ -1468,6 +1493,14 @@ class AutoML(BaseEstimator):
                 )
                 logger.info(f"Using nsplits={n_splits} due to data size limit.")
                 self._state.kf = TimeSeriesSplit(n_splits=n_splits, test_size=period)
+            elif self._state.task is TS_FORECASTPANEL:
+                n_groups = X_train.groupby(
+                    self._state.fit_kwargs.get("group_ids")
+                ).ngroups
+                period = self._state.fit_kwargs.get("period")
+                self._state.kf = TimeSeriesSplit(
+                    n_splits=n_splits, test_size=period * n_groups
+                )
             else:
                 self._state.kf = TimeSeriesSplit(n_splits=n_splits)
         elif isinstance(self._split_type, str):
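The `test_size=period * n_groups` choice reflects that one forecast horizon spans `period` rows per group. A small sketch of sklearn's `TimeSeriesSplit` with an explicit `test_size` (hypothetical sizes):

```python
# Sketch: sklearn's TimeSeriesSplit with an explicit test_size, as used above.
# With n_groups series, a horizon of `period` steps spans period * n_groups rows.
import numpy as np
from sklearn.model_selection import TimeSeriesSplit

n_groups, period = 3, 2  # hypothetical panel: 3 series, 2-step horizon
X = np.arange(30).reshape(-1, 1)  # 10 time steps x 3 groups, flattened
tscv = TimeSeriesSplit(n_splits=2, test_size=period * n_groups)
for train_idx, test_idx in tscv.split(X):
    print(len(train_idx), len(test_idx))  # each test fold has 6 rows
```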
@@ -1555,13 +1588,13 @@ class AutoML(BaseEstimator):
         Args:
             log_file_name: A string of the log file name.
             X_train: A numpy array or dataframe of training data in shape n*m.
-                For ts_forecast tasks, the first column of X_train
+                For time series forecast tasks, the first column of X_train
                 must be the timestamp column (datetime type). Other
                 columns in the dataframe are assumed to be exogenous
                 variables (categorical or numeric).
             y_train: A numpy array or series of labels in shape n*1.
             dataframe: A dataframe of training data including label column.
-                For ts_forecast tasks, dataframe must be specified and should
+                For time series forecast tasks, dataframe must be specified and should
                 have at least two columns: timestamp and label, where the first
                 column is the timestamp column (datetime type). Other columns
                 in the dataframe are assumed to be exogenous variables
@@ -1588,7 +1621,7 @@ class AutoML(BaseEstimator):
                 ["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
                 For regression tasks, valid choices are ["auto", 'uniform', 'time'].
                 "auto" -> uniform.
-                For ts_forecast tasks, must be "auto" or 'time'.
+                For time series forecast tasks, must be "auto" or 'time'.
                 For ranking task, must be "auto" or 'group'.
             groups: None or array-like | Group labels (with matching length to
                 y_train) or groups counts (with sum equal to length of y_train)
@@ -1634,10 +1667,29 @@ class AutoML(BaseEstimator):
             ```

         **fit_kwargs: Other key word arguments to pass to fit() function of
-            the searched learners, such as sample_weight. Include:
-                period: int | forecast horizon for ts_forecast tasks.
+            the searched learners, such as sample_weight. Below are a few examples of
+            estimator-specific parameters:
+                period: int | forecast horizon for all time series forecast tasks.
                 gpu_per_trial: float, default = 0 | A float of the number of gpus per trial,
-                    only used by TransformersEstimator and XGBoostSklearnEstimator.
+                    only used by TransformersEstimator, XGBoostSklearnEstimator, and
+                    TemporalFusionTransformerEstimator.
+                group_ids: list of strings of column names identifying a time series, only
+                    used by TemporalFusionTransformerEstimator, required for
+                    'ts_forecast_panel' task. `group_ids` is a parameter for TimeSeriesDataSet object
+                    from PyTorchForecasting.
+                    For other parameters to describe your dataset, refer to
+                    [TimeSeriesDataSet PyTorchForecasting](https://pytorch-forecasting.readthedocs.io/en/stable/api/pytorch_forecasting.data.timeseries.TimeSeriesDataSet.html).
+                    To specify your variables, use `static_categoricals`, `static_reals`,
+                    `time_varying_known_categoricals`, `time_varying_known_reals`,
+                    `time_varying_unknown_categoricals`, `time_varying_unknown_reals`,
+                    `variable_groups`. To provide more information on your data, use
+                    `max_encoder_length`, `min_encoder_length`, `lags`.
+                log_dir: str, default = "lightning_logs" | Folder into which to log results
+                    for tensorboard, only used by TemporalFusionTransformerEstimator.
+                max_epochs: int, default = 20 | Maximum number of epochs to run training,
+                    only used by TemporalFusionTransformerEstimator.
+                batch_size: int, default = 64 | Batch size for training model, only
+                    used by TemporalFusionTransformerEstimator.
         """
         task = task or self._settings.get("task")
         eval_method = eval_method or self._settings.get("eval_method")
@@ -1771,11 +1823,15 @@ class AutoML(BaseEstimator):
         elif self._state.task in TS_FORECAST:
             assert split_type in ["auto", "time"]
             self._split_type = "time"

             assert isinstance(
                 self._state.fit_kwargs.get("period"),
                 int,  # NOTE: _decide_split_type is before kwargs is updated to fit_kwargs_by_estimator
             ), f"missing a required integer 'period' for '{TS_FORECAST}' task."
+            if self._state.fit_kwargs.get("group_ids"):
+                self._state.task = TS_FORECASTPANEL
+                assert isinstance(
+                    self._state.fit_kwargs.get("group_ids"), list
+                ), f"missing a required List[str] 'group_ids' for '{TS_FORECASTPANEL}' task."
         elif self._state.task == "rank":
             assert (
                 self._state.groups is not None
@@ -2082,13 +2138,13 @@ class AutoML(BaseEstimator):

         Args:
             X_train: A numpy array or a pandas dataframe of training data in
-                shape (n, m). For ts_forecast tasks, the first column of X_train
+                shape (n, m). For time series forecast tasks, the first column of X_train
                 must be the timestamp column (datetime type). Other columns in
                 the dataframe are assumed to be exogenous variables (categorical or numeric).
                 When using ray, X_train can be a ray.ObjectRef.
             y_train: A numpy array or a pandas series of labels in shape (n, ).
             dataframe: A dataframe of training data including label column.
-                For ts_forecast tasks, dataframe must be specified and must have
+                For time series forecast tasks, dataframe must be specified and must have
                 at least two columns, timestamp and label, where the first
                 column is the timestamp column (datetime type). Other columns in
                 the dataframe are assumed to be exogenous variables (categorical or numeric).
@@ -2139,7 +2195,7 @@ class AutoML(BaseEstimator):
             ```
             task: A string of the task type, e.g.,
                 'classification', 'regression', 'ts_forecast_regression',
-                'ts_forecast_classification', 'rank', 'seq-classification',
+                'ts_forecast_classification', 'ts_forecast_panel', 'rank', 'seq-classification',
                 'seq-regression', 'summarization'.
             n_jobs: An integer of the number of threads for training | default=-1.
                 Use all available resources when n_jobs == -1.
@@ -2204,7 +2260,7 @@ class AutoML(BaseEstimator):
                 ["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
                 For regression tasks, valid choices are ["auto", 'uniform', 'time'].
                 "auto" -> uniform.
-                For ts_forecast tasks, must be "auto" or 'time'.
+                For time series forecast tasks, must be "auto" or 'time'.
                 For ranking task, must be "auto" or 'group'.
             hpo_method: str, default="auto" | The hyperparameter
                 optimization method. By default, CFO is used for sequential
@@ -2305,15 +2361,46 @@ class AutoML(BaseEstimator):
                 "transformer": {
                     "output_dir": "test/data/output/",
                     "fp16": False,
                 },
+                "tft": {
+                    "max_encoder_length": 1,
+                    "min_encoder_length": 1,
+                    "static_categoricals": [],
+                    "static_reals": [],
+                    "time_varying_known_categoricals": [],
+                    "time_varying_known_reals": [],
+                    "time_varying_unknown_categoricals": [],
+                    "time_varying_unknown_reals": [],
+                    "variable_groups": {},
+                    "lags": {},
+                }
             }
             ```

         **fit_kwargs: Other key word arguments to pass to fit() function of
-            the searched learners, such as sample_weight. Include:
-                period: int | forecast horizon for ts_forecast tasks.
+            the searched learners, such as sample_weight. Below are a few examples of
+            estimator-specific parameters:
+                period: int | forecast horizon for all time series forecast tasks.
                 gpu_per_trial: float, default = 0 | A float of the number of gpus per trial,
-                    only used by TransformersEstimator and XGBoostSklearnEstimator.
+                    only used by TransformersEstimator, XGBoostSklearnEstimator, and
+                    TemporalFusionTransformerEstimator.
+                group_ids: list of strings of column names identifying a time series, only
+                    used by TemporalFusionTransformerEstimator, required for
+                    'ts_forecast_panel' task. `group_ids` is a parameter for TimeSeriesDataSet object
+                    from PyTorchForecasting.
+                    For other parameters to describe your dataset, refer to
+                    [TimeSeriesDataSet PyTorchForecasting](https://pytorch-forecasting.readthedocs.io/en/stable/api/pytorch_forecasting.data.timeseries.TimeSeriesDataSet.html).
+                    To specify your variables, use `static_categoricals`, `static_reals`,
+                    `time_varying_known_categoricals`, `time_varying_known_reals`,
+                    `time_varying_unknown_categoricals`, `time_varying_unknown_reals`,
+                    `variable_groups`. To provide more information on your data, use
+                    `max_encoder_length`, `min_encoder_length`, `lags`.
+                log_dir: str, default = "lightning_logs" | Folder into which to log results
+                    for tensorboard, only used by TemporalFusionTransformerEstimator.
+                max_epochs: int, default = 20 | Maximum number of epochs to run training,
+                    only used by TemporalFusionTransformerEstimator.
+                batch_size: int, default = 64 | Batch size for training model, only
+                    used by TemporalFusionTransformerEstimator.
         """

         self._state._start_time_flag = self._start_time_flag = time.time()
@@ -2581,6 +2668,8 @@ class AutoML(BaseEstimator):
                 estimator_list = ["lgbm", "xgboost", "xgb_limitdepth"]
             elif _is_nlp_task(self._state.task):
                 estimator_list = ["transformer"]
+            elif self._state.task == TS_FORECASTPANEL:
+                estimator_list = ["tft"]
             else:
                 try:
                     import catboost

flaml/data.py

@@ -32,9 +32,11 @@ TS_FORECASTREGRESSION = (
     "ts_forecast_regression",
 )
 TS_FORECASTCLASSIFICATION = "ts_forecast_classification"
+TS_FORECASTPANEL = "ts_forecast_panel"
 TS_FORECAST = (
     *TS_FORECASTREGRESSION,
     TS_FORECASTCLASSIFICATION,
+    TS_FORECASTPANEL,
 )
 TS_TIMESTAMP_COL = "ds"
 TS_VALUE_COL = "y"
@@ -248,6 +250,26 @@ def concat(X1, X2):
     return np.concatenate([X1, X2])


+def add_time_idx_col(X):
+    unique_dates = X[TS_TIMESTAMP_COL].drop_duplicates().sort_values(ascending=True)
+    # assume no missing timestamps
+    freq = pd.infer_freq(unique_dates)
+    if freq == "MS":
+        X["time_idx"] = X[TS_TIMESTAMP_COL].dt.year * 12 + X[TS_TIMESTAMP_COL].dt.month
+    elif freq == "Y":
+        X["time_idx"] = X[TS_TIMESTAMP_COL].dt.year
+    else:
+        # using the time frequency to generate all timestamps and then indexing for time_idx
+        # full_range = pd.date_range(X[TS_TIMESTAMP_COL].min(), X[TS_TIMESTAMP_COL].max(), freq=freq).to_list()
+        # X["time_idx"] = [full_range.index(time) for time in X[TS_TIMESTAMP_COL]]
+        # taking the minimum difference between timestamps as the step size
+        timestamps = unique_dates.view("int64")
+        freq = int(timestamps.diff().mode())
+        X["time_idx"] = (timestamps - timestamps.min()) / freq
+        X["time_idx"] = X["time_idx"].astype("int")
+    return X
+
+
 class DataTransformer:
     """Transform input training data."""

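As a quick check of the monthly ("MS") branch of `add_time_idx_col` above, `year * 12 + month` yields consecutive integers for gap-free monthly data. A minimal sketch, not FLAML code:

```python
# Sketch of the "MS" branch above: year*12 + month gives a consecutive
# integer index when monthly data has no gaps.
import pandas as pd

dates = pd.Series(pd.to_datetime(["2021-11-01", "2021-12-01", "2022-01-01"]))
time_idx = dates.dt.year * 12 + dates.dt.month
print((time_idx - time_idx.min()).tolist())  # [0, 1, 2]
```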
@@ -281,6 +303,9 @@ class DataTransformer:
             drop = False
             if task in TS_FORECAST:
                 X = X.rename(columns={X.columns[0]: TS_TIMESTAMP_COL})
+                if task is TS_FORECASTPANEL:
+                    if "time_idx" not in X:
+                        X = add_time_idx_col(X)
                 ds_col = X.pop(TS_TIMESTAMP_COL)
                 if isinstance(y, Series):
                     y = y.rename(TS_VALUE_COL)

flaml/ml.py

@@ -37,6 +37,7 @@ from .model import (
     ARIMA,
     SARIMAX,
     TransformersEstimator,
+    TemporalFusionTransformerEstimator,
     TransformersEstimatorModelSelection,
 )
 from .data import CLASSIFICATION, group_counts, TS_FORECAST

@@ -122,6 +123,8 @@ def get_estimator_class(task, estimator_name):
         estimator_class = SARIMAX
     elif estimator_name == "transformer":
         estimator_class = TransformersEstimator
+    elif estimator_name == "tft":
+        estimator_class = TemporalFusionTransformerEstimator
     elif estimator_name == "transformer_ms":
         estimator_class = TransformersEstimatorModelSelection
     else:

flaml/model.py (188 changed lines)
@@ -23,6 +23,7 @@ from . import tune
 from .data import (
     group_counts,
     CLASSIFICATION,
+    add_time_idx_col,
     TS_FORECASTREGRESSION,
     TS_TIMESTAMP_COL,
     TS_VALUE_COL,
@@ -2152,6 +2153,193 @@ class XGBoostLimitDepth_TS(TS_SKLearn):
    base_class = XGBoostLimitDepthEstimator


class TemporalFusionTransformerEstimator(SKLearnEstimator):
    """The class for tuning Temporal Fusion Transformer"""

    @classmethod
    def search_space(cls, data_size, pred_horizon, **params):
        space = {
            "gradient_clip_val": {
                "domain": tune.loguniform(lower=0.01, upper=100.0),
                "init_value": 0.01,
            },
            "hidden_size": {
                "domain": tune.lograndint(lower=8, upper=512),
                "init_value": 16,
            },
            "hidden_continuous_size": {
                "domain": tune.randint(lower=1, upper=65),
                "init_value": 8,
            },
            "attention_head_size": {
                "domain": tune.randint(lower=1, upper=5),
                "init_value": 4,
            },
            "dropout": {
                "domain": tune.uniform(lower=0.1, upper=0.3),
                "init_value": 0.1,
            },
            "learning_rate": {
                "domain": tune.loguniform(lower=0.00001, upper=1.0),
                "init_value": 0.001,
            },
        }
        return space

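Each entry in the search space pairs a `tune` sampling domain with a low-cost `init_value` where the search starts. A sketch of extracting that starting configuration (assuming the class is importable from `flaml.model` as defined above; `data_size` and `pred_horizon` are unused by this `search_space`):

```python
# Sketch: the low-cost starting configuration implied by the search space above.
from flaml.model import TemporalFusionTransformerEstimator

init_config = {
    name: spec["init_value"]
    for name, spec in TemporalFusionTransformerEstimator.search_space(
        data_size=None, pred_horizon=None
    ).items()
}
print(init_config)
# {'gradient_clip_val': 0.01, 'hidden_size': 16, 'hidden_continuous_size': 8,
#  'attention_head_size': 4, 'dropout': 0.1, 'learning_rate': 0.001}
```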
    def transform_ds(self, X_train, y_train, **kwargs):
        y_train = DataFrame(y_train, columns=[TS_VALUE_COL])
        self.data = X_train.join(y_train)

        max_prediction_length = kwargs["period"]
        self.max_encoder_length = kwargs["max_encoder_length"]
        training_cutoff = self.data["time_idx"].max() - max_prediction_length

        from pytorch_forecasting import TimeSeriesDataSet
        from pytorch_forecasting.data import GroupNormalizer

        self.group_ids = kwargs["group_ids"].copy()
        training = TimeSeriesDataSet(
            self.data[lambda x: x.time_idx <= training_cutoff],
            time_idx="time_idx",
            target=TS_VALUE_COL,
            group_ids=self.group_ids,
            min_encoder_length=kwargs.get(
                "min_encoder_length", self.max_encoder_length // 2
            ),  # keep encoder length long (as it is in the validation set)
            max_encoder_length=self.max_encoder_length,
            min_prediction_length=1,
            max_prediction_length=max_prediction_length,
            static_categoricals=kwargs.get("static_categoricals", []),
            static_reals=kwargs.get("static_reals", []),
            time_varying_known_categoricals=kwargs.get(
                "time_varying_known_categoricals", []
            ),
            time_varying_known_reals=kwargs.get("time_varying_known_reals", []),
            time_varying_unknown_categoricals=kwargs.get(
                "time_varying_unknown_categoricals", []
            ),
            time_varying_unknown_reals=kwargs.get("time_varying_unknown_reals", []),
            variable_groups=kwargs.get(
                "variable_groups", {}
            ),  # a group of categorical variables can be treated as one variable
            lags=kwargs.get("lags", {}),
            target_normalizer=GroupNormalizer(
                groups=kwargs["group_ids"], transformation="softplus"
            ),  # use softplus and normalize by group
            add_relative_time_idx=True,
            add_target_scales=True,
            add_encoder_length=True,
        )

        # create the validation set (predict=True), i.e. predict the last
        # max_prediction_length points in time for each series
        validation = TimeSeriesDataSet.from_dataset(
            training, self.data, predict=True, stop_randomization=True
        )

        # create dataloaders for the model
        batch_size = kwargs.get("batch_size", 64)
        train_dataloader = training.to_dataloader(
            train=True, batch_size=batch_size, num_workers=0
        )
        val_dataloader = validation.to_dataloader(
            train=False, batch_size=batch_size * 10, num_workers=0
        )

        return training, train_dataloader, val_dataloader

    def fit(self, X_train, y_train, budget=None, **kwargs):
        import copy
        from pathlib import Path
        import warnings
        import numpy as np
        import pandas as pd
        import pytorch_lightning as pl
        from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
        from pytorch_lightning.loggers import TensorBoardLogger
        import torch
        from pytorch_forecasting import TemporalFusionTransformer
        from pytorch_forecasting.metrics import QuantileLoss
        import tensorboard as tb

        warnings.filterwarnings("ignore")
        current_time = time.time()
        training, train_dataloader, val_dataloader = self.transform_ds(
            X_train, y_train, **kwargs
        )
        params = self.params.copy()
        gradient_clip_val = params.pop("gradient_clip_val")
        params.pop("n_jobs")
        max_epochs = kwargs.get("max_epochs", 20)
        early_stop_callback = EarlyStopping(
            monitor="val_loss", min_delta=1e-4, patience=10, verbose=False, mode="min"
        )
        lr_logger = LearningRateMonitor()  # log the learning rate
        logger = TensorBoardLogger(
            kwargs.get("log_dir", "lightning_logs")
        )  # log results to tensorboard
        default_trainer_kwargs = dict(
            gpus=self._kwargs.get("gpu_per_trial", [0])
            if torch.cuda.is_available()
            else None,
            max_epochs=max_epochs,
            gradient_clip_val=gradient_clip_val,
            callbacks=[lr_logger, early_stop_callback],
            logger=logger,
        )
        trainer = pl.Trainer(
            **default_trainer_kwargs,
        )
        tft = TemporalFusionTransformer.from_dataset(
            training,
            **params,
            lstm_layers=2,  # 2 is mostly optimal according to the documentation
            output_size=7,  # 7 quantiles by default
            loss=QuantileLoss(),
            log_interval=10,  # log predictions every 10 batches
            reduce_on_plateau_patience=4,
        )
        # fit the network
        trainer.fit(
            tft,
            train_dataloaders=train_dataloader,
            val_dataloaders=val_dataloader,
        )
        best_model_path = trainer.checkpoint_callback.best_model_path
        best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path)
        train_time = time.time() - current_time
        self._model = best_tft
        return train_time

    def predict(self, X):
        import pandas as pd

        ids = self.group_ids.copy()
        ids.append(TS_TIMESTAMP_COL)
        encoder_data = self.data[
            lambda x: x.time_idx > x.time_idx.max() - self.max_encoder_length
        ]
        # following the pytorch-forecasting example, make all target values
        # equal to the last available data
        last_data_cols = self.group_ids.copy()
        last_data_cols.append(TS_VALUE_COL)
        last_data = self.data[lambda x: x.time_idx == x.time_idx.max()][last_data_cols]
        decoder_data = X
        if "time_idx" not in decoder_data:
            decoder_data = add_time_idx_col(decoder_data)
        decoder_data["time_idx"] += (
            encoder_data["time_idx"].max() + 1 - decoder_data["time_idx"].min()
        )
        # decoder_data[TS_VALUE_COL] = 0
        decoder_data = decoder_data.merge(last_data, how="inner", on=self.group_ids)
        decoder_data = decoder_data.sort_values(ids)
        new_prediction_data = pd.concat([encoder_data, decoder_data], ignore_index=True)
        new_prediction_data["time_idx"] = new_prediction_data["time_idx"].astype("int")
        new_raw_predictions = self._model.predict(new_prediction_data)
        index = [decoder_data[idx].to_numpy() for idx in ids]
        predictions = pd.Series(new_raw_predictions.numpy().ravel(), index=index)
        return predictions


class suppress_stdout_stderr(object):
    def __init__(self):
        # Open a pair of null files

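The `time_idx` shift in `predict` above renumbers decoder rows so the forecast window begins one step after the last observed step. A minimal sketch with hypothetical values:

```python
# Sketch of the time_idx shift in predict above: decoder rows are renumbered
# so the forecast window begins one step after the last observed step.
import pandas as pd

encoder_max = 59  # hypothetical last observed time_idx
decoder_data = pd.DataFrame({"time_idx": [0, 1, 2]})  # raw indices of new rows
decoder_data["time_idx"] += encoder_max + 1 - decoder_data["time_idx"].min()
print(decoder_data["time_idx"].tolist())  # [60, 61, 62]
```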
File diff suppressed because one or more lines are too long
setup.py (2 changed lines)
@@ -65,6 +65,7 @@ setuptools.setup(
         "rouge_score",
         "hcrystalball==0.1.10",
         "seqeval",
+        "pytorch-forecasting>=0.9.0",
     ],
     "catboost": ["catboost>=0.26"],
     "blendsearch": ["optuna==2.8.0"],

@@ -98,6 +99,7 @@ setuptools.setup(
         "prophet>=1.0.1",
         "statsmodels>=0.12.2",
         "hcrystalball==0.1.10",
+        "pytorch-forecasting>=0.9.0",
     ],
     "benchmark": ["catboost>=0.26", "psutil==5.8.0", "xgboost==1.3.3"],
 },

test/test_forecast.py

@@ -60,7 +60,9 @@ def test_forecast_automl(budget=5):
     """compute different metric values on the testing dataset"""
     from flaml.ml import sklearn_metric_loss_score

-    print("mape", "=", sklearn_metric_loss_score("mape", y_pred, y_test))
+    mape = sklearn_metric_loss_score("mape", y_pred, y_test)
+    print("mape", "=", mape)
+    assert mape <= 0.005, "the mape of flaml should be less than 0.005"
     from flaml.data import get_output_from_log

     (
@@ -415,7 +417,7 @@ def test_forecast_classification(budget=5):

     print(y_test)
     print(y_pred)
-    print("accuracy", "=", 1 - sklearn_metric_loss_score("accuracy", y_test, y_pred))
+    print("accuracy", "=", 1 - sklearn_metric_loss_score("accuracy", y_pred, y_test))
     from flaml.data import get_output_from_log

     (
@@ -440,9 +442,159 @@ def test_forecast_classification(budget=5):
    # plt.show()


def get_stalliion_data():
    from pytorch_forecasting.data.examples import get_stallion_data

    data = get_stallion_data()
    # add a time index - for datasets with no missing values, FLAML automates this step
    data["time_idx"] = data["date"].dt.year * 12 + data["date"].dt.month
    data["time_idx"] -= data["time_idx"].min()
    # add additional features
    data["month"] = data.date.dt.month.astype(str).astype(
        "category"
    )  # categories have to be strings
    data["log_volume"] = np.log(data.volume + 1e-8)
    data["avg_volume_by_sku"] = data.groupby(
        ["time_idx", "sku"], observed=True
    ).volume.transform("mean")
    data["avg_volume_by_agency"] = data.groupby(
        ["time_idx", "agency"], observed=True
    ).volume.transform("mean")
    # we want to encode special days as one variable, so we first reverse the one-hot encoding
    special_days = [
        "easter_day",
        "good_friday",
        "new_year",
        "christmas",
        "labor_day",
        "independence_day",
        "revolution_day_memorial",
        "regional_games",
        "beer_capital",
        "music_fest",
    ]
    data[special_days] = (
        data[special_days]
        .apply(lambda x: x.map({0: "-", 1: x.name}))
        .astype("category")
    )
    return data, special_days


def test_forecast_panel(budget=5):
    data, special_days = get_stalliion_data()
    time_horizon = 6  # predict six months
    training_cutoff = data["time_idx"].max() - time_horizon
    data["time_idx"] = data["time_idx"].astype("int")
    ts_col = data.pop("date")
    data.insert(0, "date", ts_col)
    # FLAML assumes the input is not sorted, but we sort here to compare with y_test
    data = data.sort_values(["agency", "sku", "date"])
    X_train = data[lambda x: x.time_idx <= training_cutoff]
    X_test = data[lambda x: x.time_idx > training_cutoff]
    y_train = X_train.pop("volume")
    y_test = X_test.pop("volume")
    automl = AutoML()
    settings = {
        "time_budget": budget,  # total running time in seconds
        "metric": "mape",  # primary metric
        "task": "ts_forecast_panel",  # task type
        "log_file_name": "test/stallion_forecast.log",  # flaml log file
        "eval_method": "holdout",
    }
    fit_kwargs_by_estimator = {
        "tft": {
            "max_encoder_length": 24,
            "static_categoricals": ["agency", "sku"],
            "static_reals": ["avg_population_2017", "avg_yearly_household_income_2017"],
            "time_varying_known_categoricals": ["special_days", "month"],
            "variable_groups": {
                "special_days": special_days
            },  # a group of categorical variables can be treated as one variable
            "time_varying_known_reals": [
                "time_idx",
                "price_regular",
                "discount_in_percent",
            ],
            "time_varying_unknown_categoricals": [],
            "time_varying_unknown_reals": [
                "y",  # the target column is always named 'y'
                "log_volume",
                "industry_volume",
                "soda_volume",
                "avg_max_temp",
                "avg_volume_by_agency",
                "avg_volume_by_sku",
            ],
            "batch_size": 256,
            "max_epochs": 1,
            "gpu_per_trial": -1,
        }
    }
    """The main flaml automl API"""
    automl.fit(
        X_train=X_train,
        y_train=y_train,
        **settings,
        period=time_horizon,
        group_ids=["agency", "sku"],
        fit_kwargs_by_estimator=fit_kwargs_by_estimator,
    )
    """ retrieve the best config and best learner"""
    print("Best ML learner:", automl.best_estimator)
    print("Best hyperparameter config:", automl.best_config)
    print(f"Best mape on validation data: {automl.best_loss}")
    print(f"Training duration of best run: {automl.best_config_train_time}s")
    print(automl.model.estimator)
    """ pickle and save the automl object """
    import pickle

    with open("automl.pkl", "wb") as f:
        pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
    """ compute predictions of testing dataset """
    y_pred = automl.predict(X_test)
    """ compute different metric values on the testing dataset"""
    from flaml.ml import sklearn_metric_loss_score

    print(y_test)
    print(y_pred)
    print("mape", "=", sklearn_metric_loss_score("mape", y_pred, y_test))

    def smape(y_pred, y_test):
        import numpy as np

        y_test, y_pred = np.array(y_test), np.array(y_pred)
        return round(
            np.mean(np.abs(y_pred - y_test) / ((np.abs(y_pred) + np.abs(y_test)) / 2))
            * 100,
            2,
        )

    print("smape", "=", smape(y_pred, y_test))
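A quick arithmetic check of the `smape` helper above: for `y_pred=110` and `y_test=100`, the symmetric error is 10 / ((110 + 100) / 2) ≈ 0.0952, i.e. 9.52%. A standalone sketch (not part of the test):

```python
# Worked check of the sMAPE formula above: 10 / ((110 + 100) / 2) ~= 0.0952.
import numpy as np

y_pred, y_test = np.array([110.0]), np.array([100.0])
smape = np.mean(np.abs(y_pred - y_test) / ((np.abs(y_pred) + np.abs(y_test)) / 2)) * 100
print(round(smape, 2))  # 9.52
```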
    # TODO: compute prediction for a specific time series
    # """compute prediction for a specific time series"""
    # a01_sku01_preds = automl.predict(X_test[(X_test["agency"] == "Agency_01") & (X_test["sku"] == "SKU_01")])
    # print("Agency01 SKU_01 predictions: ", a01_sku01_preds)
    from flaml.data import get_output_from_log

    (
        time_history,
        best_valid_loss_history,
        valid_loss_history,
        config_history,
        metric_history,
    ) = get_output_from_log(filename=settings["log_file_name"], time_budget=budget)
    for config in config_history:
        print(config)
    print(automl.resource_attr)
    print(automl.max_resource)
    print(automl.min_resource)


if __name__ == "__main__":
    test_forecast_automl(60)
    test_multivariate_forecast_num(60)
    test_multivariate_forecast_cat(60)
    test_multivariate_forecast_num(5)
    test_multivariate_forecast_cat(5)
    test_numpy()
    test_forecast_classification(60)
    test_forecast_classification(5)
    test_forecast_panel(5)

website/docs/Examples/AutoML-Time series forecast.md

@@ -28,7 +28,7 @@ print(automl.predict(X_train[84:]))

 #### Sample output

-```python
+```
 [flaml.automl: 01-21 08:01:20] {2018} INFO - task = ts_forecast
 [flaml.automl: 01-21 08:01:20] {2020} INFO - Data split method: time
 [flaml.automl: 01-21 08:01:20] {2024} INFO - Evaluation method: holdout

@@ -502,7 +502,7 @@ print(automl.predict(multi_X_test))

 #### Sample Output

-```python
+```
 [flaml.automl: 02-28 21:32:26] {2458} INFO - iteration 15, current learner xgboost
 [flaml.automl: 02-28 21:32:26] {2620} INFO - at 6.2s, estimator xgboost's best error=0.0959, best estimator prophet's best error=0.0592
 [flaml.automl: 02-28 21:32:26] {2458} INFO - iteration 16, current learner extra_tree
@@ -594,7 +594,8 @@ print("True label", discrete_y_test)
 ```

 #### Sample Output
-```python
+
+```
 [flaml.automl: 02-28 21:53:03] {2060} INFO - task = ts_forecast_classification
 [flaml.automl: 02-28 21:53:03] {2062} INFO - Data split method: time
 [flaml.automl: 02-28 21:53:03] {2066} INFO - Evaluation method: holdout
@@ -679,4 +680,886 @@ print("True label", discrete_y_test)
[flaml.automl: 02-28 21:53:04] {2235} INFO - Time taken to find the best model: 0.8547139167785645
```

### Forecasting with Panel Datasets

A panel time series dataset contains multiple individual time series. An example is the Stallion demand dataset from PyTorch Forecasting, originally from Kaggle.

```python
import numpy as np
from flaml import AutoML


def get_stalliion_data():
    from pytorch_forecasting.data.examples import get_stallion_data

    data = get_stallion_data()
    # add a time index - for datasets with no missing values, FLAML automates this step
    data["time_idx"] = data["date"].dt.year * 12 + data["date"].dt.month
    data["time_idx"] -= data["time_idx"].min()
    # add additional features
    data["month"] = data.date.dt.month.astype(str).astype(
        "category"
    )  # categories have to be strings
    data["log_volume"] = np.log(data.volume + 1e-8)
    data["avg_volume_by_sku"] = data.groupby(
        ["time_idx", "sku"], observed=True
    ).volume.transform("mean")
    data["avg_volume_by_agency"] = data.groupby(
        ["time_idx", "agency"], observed=True
    ).volume.transform("mean")
    # we want to encode special days as one variable, so we first reverse the one-hot encoding
    special_days = [
        "easter_day",
        "good_friday",
        "new_year",
        "christmas",
        "labor_day",
        "independence_day",
        "revolution_day_memorial",
        "regional_games",
        "beer_capital",
        "music_fest",
    ]
    data[special_days] = (
        data[special_days]
        .apply(lambda x: x.map({0: "-", 1: x.name}))
        .astype("category")
    )
    return data, special_days


data, special_days = get_stalliion_data()
time_horizon = 6  # predict six months
training_cutoff = data["time_idx"].max() - time_horizon
data["time_idx"] = data["time_idx"].astype("int")
ts_col = data.pop("date")
data.insert(0, "date", ts_col)
# FLAML assumes the input is not sorted, but we sort here to compare with y_test
data = data.sort_values(["agency", "sku", "date"])
X_train = data[lambda x: x.time_idx <= training_cutoff]
X_test = data[lambda x: x.time_idx > training_cutoff]
y_train = X_train.pop("volume")
y_test = X_test.pop("volume")
automl = AutoML()
# Configure settings for the FLAML run
budget = 600  # example time budget in seconds
settings = {
    "time_budget": budget,  # total running time in seconds
    "metric": "mape",  # primary metric
    "task": "ts_forecast_panel",  # task type
    "log_file_name": "test/stallion_forecast.log",  # flaml log file
    "eval_method": "holdout",
}
# Specify kwargs for the TimeSeriesDataSet used by TemporalFusionTransformerEstimator
fit_kwargs_by_estimator = {
    "tft": {
        "max_encoder_length": 24,
        "static_categoricals": ["agency", "sku"],
        "static_reals": ["avg_population_2017", "avg_yearly_household_income_2017"],
        "time_varying_known_categoricals": ["special_days", "month"],
        "variable_groups": {
            "special_days": special_days
        },  # a group of categorical variables can be treated as one variable
        "time_varying_known_reals": [
            "time_idx",
            "price_regular",
            "discount_in_percent",
        ],
        "time_varying_unknown_categoricals": [],
        "time_varying_unknown_reals": [
            "y",  # the target column is always named 'y'
            "log_volume",
            "industry_volume",
            "soda_volume",
            "avg_max_temp",
            "avg_volume_by_agency",
            "avg_volume_by_sku",
        ],
        "batch_size": 256,
        "max_epochs": 1,
        "gpu_per_trial": -1,
    }
}
# Train the model
automl.fit(
    X_train=X_train,
    y_train=y_train,
    **settings,
    period=time_horizon,
    group_ids=["agency", "sku"],
    fit_kwargs_by_estimator=fit_kwargs_by_estimator,
)
# Compute predictions on the testing dataset
y_pred = automl.predict(X_test)
print(y_test)
print(y_pred)
# best model
print(automl.model.estimator)
```
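`predict` returns a pandas Series indexed by the group ids plus the timestamp (here agency, sku, date), so a single series' forecast can be selected with partial `.loc` indexing. A sketch with hypothetical keys:

```python
# Sketch (hypothetical keys): y_pred carries an (agency, sku, date) MultiIndex,
# so one series' six-month forecast can be pulled out directly.
print(y_pred.loc[("Agency_01", "SKU_01")])
```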

#### Sample Output

```
[flaml.automl: 07-28 21:26:03] {2478} INFO - task = ts_forecast_panel
[flaml.automl: 07-28 21:26:03] {2480} INFO - Data split method: time
[flaml.automl: 07-28 21:26:03] {2483} INFO - Evaluation method: holdout
[flaml.automl: 07-28 21:26:03] {2552} INFO - Minimizing error metric: mape
[flaml.automl: 07-28 21:26:03] {2694} INFO - List of ML learners in AutoML Run: ['tft']
[flaml.automl: 07-28 21:26:03] {2986} INFO - iteration 0, current learner tft
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs

   | Name                               | Type                            | Params
----------------------------------------------------------------------------------------
0  | loss                               | QuantileLoss                    | 0
1  | logging_metrics                    | ModuleList                      | 0
2  | input_embeddings                   | MultiEmbedding                  | 1.3 K
3  | prescalers                         | ModuleDict                      | 256
4  | static_variable_selection          | VariableSelectionNetwork        | 3.4 K
5  | encoder_variable_selection         | VariableSelectionNetwork        | 8.0 K
6  | decoder_variable_selection         | VariableSelectionNetwork        | 2.7 K
7  | static_context_variable_selection  | GatedResidualNetwork            | 1.1 K
8  | static_context_initial_hidden_lstm | GatedResidualNetwork            | 1.1 K
9  | static_context_initial_cell_lstm   | GatedResidualNetwork            | 1.1 K
10 | static_context_enrichment          | GatedResidualNetwork            | 1.1 K
11 | lstm_encoder                       | LSTM                            | 4.4 K
12 | lstm_decoder                       | LSTM                            | 4.4 K
13 | post_lstm_gate_encoder             | GatedLinearUnit                 | 544
14 | post_lstm_add_norm_encoder         | AddNorm                         | 32
15 | static_enrichment                  | GatedResidualNetwork            | 1.4 K
16 | multihead_attn                     | InterpretableMultiHeadAttention | 676
17 | post_attn_gate_norm                | GateAddNorm                     | 576
18 | pos_wise_ff                        | GatedResidualNetwork            | 1.1 K
19 | pre_output_gate_norm               | GateAddNorm                     | 576
20 | output_layer                       | Linear                          | 119
----------------------------------------------------------------------------------------
33.6 K    Trainable params
0         Non-trainable params
33.6 K    Total params
0.135     Total estimated model params size (MB)

Epoch 19: 100%|██████████| 129/129 [00:56<00:00, 2.27it/s, loss=45.9, v_num=2, train_loss_step=43.00, val_loss=65.20, train_loss_epoch=46.50]

[flaml.automl: 07-28 21:46:46] {3114} INFO - Estimated sufficient time budget=12424212s. Estimated necessary time budget=12424s.
[flaml.automl: 07-28 21:46:46] {3161} INFO - at 1242.6s,  estimator tft's best error=1324290483134574.7500,  best estimator tft's best error=1324290483134574.7500
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs

   | Name                               | Type                            | Params
----------------------------------------------------------------------------------------
0  | loss                               | QuantileLoss                    | 0
1  | logging_metrics                    | ModuleList                      | 0
2  | input_embeddings                   | MultiEmbedding                  | 1.3 K
3  | prescalers                         | ModuleDict                      | 256
4  | static_variable_selection          | VariableSelectionNetwork        | 3.4 K
5  | encoder_variable_selection         | VariableSelectionNetwork        | 8.0 K
6  | decoder_variable_selection         | VariableSelectionNetwork        | 2.7 K
7  | static_context_variable_selection  | GatedResidualNetwork            | 1.1 K
8  | static_context_initial_hidden_lstm | GatedResidualNetwork            | 1.1 K
9  | static_context_initial_cell_lstm   | GatedResidualNetwork            | 1.1 K
10 | static_context_enrichment          | GatedResidualNetwork            | 1.1 K
11 | lstm_encoder                       | LSTM                            | 4.4 K
12 | lstm_decoder                       | LSTM                            | 4.4 K
13 | post_lstm_gate_encoder             | GatedLinearUnit                 | 544
14 | post_lstm_add_norm_encoder         | AddNorm                         | 32
15 | static_enrichment                  | GatedResidualNetwork            | 1.4 K
16 | multihead_attn                     | InterpretableMultiHeadAttention | 676
17 | post_attn_gate_norm                | GateAddNorm                     | 576
18 | pos_wise_ff                        | GatedResidualNetwork            | 1.1 K
19 | pre_output_gate_norm               | GateAddNorm                     | 576
20 | output_layer                       | Linear                          | 119
----------------------------------------------------------------------------------------
33.6 K    Trainable params
0         Non-trainable params
33.6 K    Total params
0.135     Total estimated model params size (MB)
Epoch 19: 100%|██████████| 145/145 [01:03<00:00, 2.28it/s, loss=45.2, v_num=3, train_loss_step=46.30, val_loss=67.60, train_loss_epoch=48.10]
[flaml.automl: 07-28 22:08:05] {3425} INFO - retrain tft for 1279.6s
[flaml.automl: 07-28 22:08:05] {3432} INFO - retrained model: TemporalFusionTransformer(
  (loss): QuantileLoss()
  (logging_metrics): ModuleList(
    (0): SMAPE()
    (1): MAE()
    (2): RMSE()
    (3): MAPE()
  )
  (input_embeddings): MultiEmbedding(
    (embeddings): ModuleDict(
      (agency): Embedding(58, 16)
      (sku): Embedding(25, 10)
      (special_days): TimeDistributedEmbeddingBag(11, 6, mode=sum)
      (month): Embedding(12, 6)
    )
  )
  (prescalers): ModuleDict(
    (avg_population_2017): Linear(in_features=1, out_features=8, bias=True)
    (avg_yearly_household_income_2017): Linear(in_features=1, out_features=8, bias=True)
    (encoder_length): Linear(in_features=1, out_features=8, bias=True)
    (y_center): Linear(in_features=1, out_features=8, bias=True)
    (y_scale): Linear(in_features=1, out_features=8, bias=True)
    (time_idx): Linear(in_features=1, out_features=8, bias=True)
    (price_regular): Linear(in_features=1, out_features=8, bias=True)
    (discount_in_percent): Linear(in_features=1, out_features=8, bias=True)
    (relative_time_idx): Linear(in_features=1, out_features=8, bias=True)
    (y): Linear(in_features=1, out_features=8, bias=True)
    (log_volume): Linear(in_features=1, out_features=8, bias=True)
    (industry_volume): Linear(in_features=1, out_features=8, bias=True)
    (soda_volume): Linear(in_features=1, out_features=8, bias=True)
    (avg_max_temp): Linear(in_features=1, out_features=8, bias=True)
    (avg_volume_by_agency): Linear(in_features=1, out_features=8, bias=True)
    (avg_volume_by_sku): Linear(in_features=1, out_features=8, bias=True)
  )
  (static_variable_selection): VariableSelectionNetwork(
    (flattened_grn): GatedResidualNetwork(
      (resample_norm): ResampleNorm(
        (resample): TimeDistributedInterpolation()
        (gate): Sigmoid()
        (norm): LayerNorm((7,), eps=1e-05, elementwise_affine=True)
      )
      (fc1): Linear(in_features=66, out_features=7, bias=True)
      (elu): ELU(alpha=1.0)
      (fc2): Linear(in_features=7, out_features=7, bias=True)
      (gate_norm): GateAddNorm(
        (glu): GatedLinearUnit(
          (dropout): Dropout(p=0.1, inplace=False)
          (fc): Linear(in_features=7, out_features=14, bias=True)
        )
        (add_norm): AddNorm(
          (norm): LayerNorm((7,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (single_variable_grns): ModuleDict(
      (agency): ResampleNorm(
        (gate): Sigmoid()
        (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
      )
      (sku): ResampleNorm(
        (resample): TimeDistributedInterpolation()
        (gate): Sigmoid()
        (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
      )
      (avg_population_2017): GatedResidualNetwork(
        (resample_norm): ResampleNorm(
          (resample): TimeDistributedInterpolation()
          (gate): Sigmoid()
          (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        )
        (fc1): Linear(in_features=8, out_features=8, bias=True)
        (elu): ELU(alpha=1.0)
        (fc2): Linear(in_features=8, out_features=8, bias=True)
        (gate_norm): GateAddNorm(
          (glu): GatedLinearUnit(
            (dropout): Dropout(p=0.1, inplace=False)
            (fc): Linear(in_features=8, out_features=32, bias=True)
          )
          (add_norm): AddNorm(
            (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
          )
        )
      )
      (avg_yearly_household_income_2017): GatedResidualNetwork(
        (resample_norm): ResampleNorm(
          (resample): TimeDistributedInterpolation()
          (gate): Sigmoid()
          (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        )
        (fc1): Linear(in_features=8, out_features=8, bias=True)
        (elu): ELU(alpha=1.0)
        (fc2): Linear(in_features=8, out_features=8, bias=True)
        (gate_norm): GateAddNorm(
          (glu): GatedLinearUnit(
            (dropout): Dropout(p=0.1, inplace=False)
            (fc): Linear(in_features=8, out_features=32, bias=True)
          )
          (add_norm): AddNorm(
            (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
          )
        )
      )
      (encoder_length): GatedResidualNetwork(
        (resample_norm): ResampleNorm(
          (resample): TimeDistributedInterpolation()
          (gate): Sigmoid()
          (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        )
        (fc1): Linear(in_features=8, out_features=8, bias=True)
        (elu): ELU(alpha=1.0)
        (fc2): Linear(in_features=8, out_features=8, bias=True)
        (gate_norm): GateAddNorm(
          (glu): GatedLinearUnit(
            (dropout): Dropout(p=0.1, inplace=False)
            (fc): Linear(in_features=8, out_features=32, bias=True)
          )
          (add_norm): AddNorm(
            (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
          )
        )
      )
      (y_center): GatedResidualNetwork(
        (resample_norm): ResampleNorm(
          (resample): TimeDistributedInterpolation()
          (gate): Sigmoid()
          (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        )
        (fc1): Linear(in_features=8, out_features=8, bias=True)
        (elu): ELU(alpha=1.0)
        (fc2): Linear(in_features=8, out_features=8, bias=True)
        (gate_norm): GateAddNorm(
          (glu): GatedLinearUnit(
            (dropout): Dropout(p=0.1, inplace=False)
            (fc): Linear(in_features=8, out_features=32, bias=True)
          )
          (add_norm): AddNorm(
            (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
          )
        )
      )
      (y_scale): GatedResidualNetwork(
        (resample_norm): ResampleNorm(
          (resample): TimeDistributedInterpolation()
          (gate): Sigmoid()
          (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        )
        (fc1): Linear(in_features=8, out_features=8, bias=True)
        (elu): ELU(alpha=1.0)
        (fc2): Linear(in_features=8, out_features=8, bias=True)
        (gate_norm): GateAddNorm(
          (glu): GatedLinearUnit(
            (dropout): Dropout(p=0.1, inplace=False)
            (fc): Linear(in_features=8, out_features=32, bias=True)
          )
          (add_norm): AddNorm(
            (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
          )
        )
      )
    )
    (prescalers): ModuleDict(
      (avg_population_2017): Linear(in_features=1, out_features=8, bias=True)
      (avg_yearly_household_income_2017): Linear(in_features=1, out_features=8, bias=True)
      (encoder_length): Linear(in_features=1, out_features=8, bias=True)
      (y_center): Linear(in_features=1, out_features=8, bias=True)
      (y_scale): Linear(in_features=1, out_features=8, bias=True)
    )
    (softmax): Softmax(dim=-1)
  )
  (encoder_variable_selection): VariableSelectionNetwork(
    (flattened_grn): GatedResidualNetwork(
      (resample_norm): ResampleNorm(
        (resample): TimeDistributedInterpolation()
        (gate): Sigmoid()
        (norm): LayerNorm((13,), eps=1e-05, elementwise_affine=True)
      )
      (fc1): Linear(in_features=100, out_features=13, bias=True)
      (elu): ELU(alpha=1.0)
      (context): Linear(in_features=16, out_features=13, bias=False)
      (fc2): Linear(in_features=13, out_features=13, bias=True)
      (gate_norm): GateAddNorm(
        (glu): GatedLinearUnit(
          (dropout): Dropout(p=0.1, inplace=False)
          (fc): Linear(in_features=13, out_features=26, bias=True)
        )
        (add_norm): AddNorm(
          (norm): LayerNorm((13,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (single_variable_grns): ModuleDict(
      (special_days): ResampleNorm(
        (resample): TimeDistributedInterpolation()
        (gate): Sigmoid()
        (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
      )
      (month): ResampleNorm(
        (resample): TimeDistributedInterpolation()
        (gate): Sigmoid()
        (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
      )
      (time_idx): GatedResidualNetwork(
        (resample_norm): ResampleNorm(
          (resample): TimeDistributedInterpolation()
          (gate): Sigmoid()
          (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        )
        (fc1): Linear(in_features=8, out_features=8, bias=True)
        (elu): ELU(alpha=1.0)
        (fc2): Linear(in_features=8, out_features=8, bias=True)
        (gate_norm): GateAddNorm(
          (glu): GatedLinearUnit(
            (dropout): Dropout(p=0.1, inplace=False)
            (fc): Linear(in_features=8, out_features=32, bias=True)
          )
          (add_norm): AddNorm(
            (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
          )
        )
      )
      (price_regular): GatedResidualNetwork(
        (resample_norm): ResampleNorm(
          (resample): TimeDistributedInterpolation()
          (gate): Sigmoid()
          (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        )
        (fc1): Linear(in_features=8, out_features=8, bias=True)
        (elu): ELU(alpha=1.0)
        (fc2): Linear(in_features=8, out_features=8, bias=True)
        (gate_norm): GateAddNorm(
          (glu): GatedLinearUnit(
            (dropout): Dropout(p=0.1, inplace=False)
            (fc): Linear(in_features=8, out_features=32, bias=True)
          )
          (add_norm): AddNorm(
            (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
          )
        )
      )
      (discount_in_percent): GatedResidualNetwork(
        (resample_norm): ResampleNorm(
          (resample): TimeDistributedInterpolation()
          (gate): Sigmoid()
          (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        )
        (fc1): Linear(in_features=8, out_features=8, bias=True)
        (elu): ELU(alpha=1.0)
        (fc2): Linear(in_features=8, out_features=8, bias=True)
        (gate_norm): GateAddNorm(
          (glu): GatedLinearUnit(
            (dropout): Dropout(p=0.1, inplace=False)
            (fc): Linear(in_features=8, out_features=32, bias=True)
          )
          (add_norm): AddNorm(
            (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
          )
        )
      )
      (relative_time_idx): GatedResidualNetwork(
        (resample_norm): ResampleNorm(
          (resample): TimeDistributedInterpolation()
          (gate): Sigmoid()
          (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        )
        (fc1): Linear(in_features=8, out_features=8, bias=True)
        (elu): ELU(alpha=1.0)
        (fc2): Linear(in_features=8, out_features=8, bias=True)
        (gate_norm): GateAddNorm(
          (glu): GatedLinearUnit(
            (dropout): Dropout(p=0.1, inplace=False)
            (fc): Linear(in_features=8, out_features=32, bias=True)
          )
          (add_norm): AddNorm(
            (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
          )
        )
      )
      (y): GatedResidualNetwork(
        (resample_norm): ResampleNorm(
          (resample): TimeDistributedInterpolation()
          (gate): Sigmoid()
          (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        )
        (fc1): Linear(in_features=8, out_features=8, bias=True)
        (elu): ELU(alpha=1.0)
        (fc2): Linear(in_features=8, out_features=8, bias=True)
        (gate_norm): GateAddNorm(
          (glu): GatedLinearUnit(
            (dropout): Dropout(p=0.1, inplace=False)
            (fc): Linear(in_features=8, out_features=32, bias=True)
          )
          (add_norm): AddNorm(
            (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
          )
        )
      )
      (log_volume): GatedResidualNetwork(
        (resample_norm): ResampleNorm(
          (resample): TimeDistributedInterpolation()
          (gate): Sigmoid()
          (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        )
        (fc1): Linear(in_features=8, out_features=8, bias=True)
        (elu): ELU(alpha=1.0)
        (fc2): Linear(in_features=8, out_features=8, bias=True)
        (gate_norm): GateAddNorm(
          (glu): GatedLinearUnit(
            (dropout): Dropout(p=0.1, inplace=False)
            (fc): Linear(in_features=8, out_features=32, bias=True)
          )
          (add_norm): AddNorm(
            (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
          )
        )
      )
      (industry_volume): GatedResidualNetwork(
        (resample_norm): ResampleNorm(
          (resample): TimeDistributedInterpolation()
          (gate): Sigmoid()
          (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        )
        (fc1): Linear(in_features=8, out_features=8, bias=True)
        (elu): ELU(alpha=1.0)
        (fc2): Linear(in_features=8, out_features=8, bias=True)
        (gate_norm): GateAddNorm(
          (glu): GatedLinearUnit(
            (dropout): Dropout(p=0.1, inplace=False)
            (fc): Linear(in_features=8, out_features=32, bias=True)
          )
          (add_norm): AddNorm(
            (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
          )
        )
      )
      (soda_volume): GatedResidualNetwork(
        (resample_norm): ResampleNorm(
          (resample): TimeDistributedInterpolation()
          (gate): Sigmoid()
          (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        )
        (fc1): Linear(in_features=8, out_features=8, bias=True)
        (elu): ELU(alpha=1.0)
        (fc2): Linear(in_features=8, out_features=8, bias=True)
        (gate_norm): GateAddNorm(
          (glu): GatedLinearUnit(
            (dropout): Dropout(p=0.1, inplace=False)
            (fc): Linear(in_features=8, out_features=32, bias=True)
          )
          (add_norm): AddNorm(
            (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
          )
        )
      )
      (avg_max_temp): GatedResidualNetwork(
        (resample_norm): ResampleNorm(
          (resample): TimeDistributedInterpolation()
          (gate): Sigmoid()
          (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        )
        (fc1): Linear(in_features=8, out_features=8, bias=True)
        (elu): ELU(alpha=1.0)
        (fc2): Linear(in_features=8, out_features=8, bias=True)
        (gate_norm): GateAddNorm(
          (glu): GatedLinearUnit(
            (dropout): Dropout(p=0.1, inplace=False)
            (fc): Linear(in_features=8, out_features=32, bias=True)
          )
          (add_norm): AddNorm(
            (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
          )
        )
      )
      (avg_volume_by_agency): GatedResidualNetwork(
        (resample_norm): ResampleNorm(
          (resample): TimeDistributedInterpolation()
          (gate): Sigmoid()
          (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        )
        (fc1): Linear(in_features=8, out_features=8, bias=True)
        (elu): ELU(alpha=1.0)
        (fc2): Linear(in_features=8, out_features=8, bias=True)
        (gate_norm): GateAddNorm(
          (glu): GatedLinearUnit(
            (dropout): Dropout(p=0.1, inplace=False)
            (fc): Linear(in_features=8, out_features=32, bias=True)
          )
          (add_norm): AddNorm(
            (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
          )
        )
      )
      (avg_volume_by_sku): GatedResidualNetwork(
        (resample_norm): ResampleNorm(
          (resample): TimeDistributedInterpolation()
          (gate): Sigmoid()
          (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        )
        (fc1): Linear(in_features=8, out_features=8, bias=True)
        (elu): ELU(alpha=1.0)
        (fc2): Linear(in_features=8, out_features=8, bias=True)
        (gate_norm): GateAddNorm(
          (glu): GatedLinearUnit(
            (dropout): Dropout(p=0.1, inplace=False)
            (fc): Linear(in_features=8, out_features=32, bias=True
|
||||
)
|
||||
(add_norm): AddNorm(
|
||||
(norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
|
||||
)
|
||||
)
|
||||
)
|
||||
)
|
||||
(prescalers): ModuleDict(
|
||||
(time_idx): Linear(in_features=1, out_features=8, bias=True)
|
||||
(price_regular): Linear(in_features=1, out_features=8, bias=True)
|
||||
(discount_in_percent): Linear(in_features=1, out_features=8, bias=True)
|
||||
(relative_time_idx): Linear(in_features=1, out_features=8, bias=True)
|
||||
(y): Linear(in_features=1, out_features=8, bias=True)
|
||||
(log_volume): Linear(in_features=1, out_features=8, bias=True)
|
||||
(industry_volume): Linear(in_features=1, out_features=8, bias=True)
|
||||
(soda_volume): Linear(in_features=1, out_features=8, bias=True)
|
||||
(avg_max_temp): Linear(in_features=1, out_features=8, bias=True)
|
||||
(avg_volume_by_agency): Linear(in_features=1, out_features=8, bias=True)
|
||||
(avg_volume_by_sku): Linear(in_features=1, out_features=8, bias=True)
|
||||
)
|
||||
(softmax): Softmax(dim=-1)
|
||||
)
|
||||
(decoder_variable_selection): VariableSelectionNetwork(
|
||||
(flattened_grn): GatedResidualNetwork(
|
||||
(resample_norm): ResampleNorm(
|
||||
(resample): TimeDistributedInterpolation()
|
||||
(gate): Sigmoid()
|
||||
(norm): LayerNorm((6,), eps=1e-05, elementwise_affine=True)
|
||||
)
|
||||
(fc1): Linear(in_features=44, out_features=6, bias=True)
|
||||
(elu): ELU(alpha=1.0)
|
||||
(context): Linear(in_features=16, out_features=6, bias=False)
|
||||
(fc2): Linear(in_features=6, out_features=6, bias=True)
|
||||
(gate_norm): GateAddNorm(
|
||||
(glu): GatedLinearUnit(
|
||||
(dropout): Dropout(p=0.1, inplace=False)
|
||||
(fc): Linear(in_features=6, out_features=12, bias=True)
|
||||
)
|
||||
(add_norm): AddNorm(
|
||||
(norm): LayerNorm((6,), eps=1e-05, elementwise_affine=True)
|
||||
)
|
||||
)
|
||||
)
|
||||
(single_variable_grns): ModuleDict(
|
||||
(special_days): ResampleNorm(
|
||||
(resample): TimeDistributedInterpolation()
|
||||
(gate): Sigmoid()
|
||||
(norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
|
||||
)
|
||||
(month): ResampleNorm(
|
||||
(resample): TimeDistributedInterpolation()
|
||||
(gate): Sigmoid()
|
||||
(norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
|
||||
)
|
||||
(time_idx): GatedResidualNetwork(
|
||||
(resample_norm): ResampleNorm(
|
||||
(resample): TimeDistributedInterpolation()
|
||||
(gate): Sigmoid()
|
||||
(norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
|
||||
)
|
||||
(fc1): Linear(in_features=8, out_features=8, bias=True)
|
||||
(elu): ELU(alpha=1.0)
|
||||
(fc2): Linear(in_features=8, out_features=8, bias=True)
|
||||
(gate_norm): GateAddNorm(
|
||||
(glu): GatedLinearUnit(
|
||||
(dropout): Dropout(p=0.1, inplace=False)
|
||||
(fc): Linear(in_features=8, out_features=32, bias=True)
|
||||
)
|
||||
(add_norm): AddNorm(
|
||||
(norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
|
||||
)
|
||||
)
|
||||
)
|
||||
(price_regular): GatedResidualNetwork(
|
||||
(resample_norm): ResampleNorm(
|
||||
(resample): TimeDistributedInterpolation()
|
||||
(gate): Sigmoid()
|
||||
(norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
|
||||
)
|
||||
(fc1): Linear(in_features=8, out_features=8, bias=True)
|
||||
(elu): ELU(alpha=1.0)
|
||||
(fc2): Linear(in_features=8, out_features=8, bias=True)
|
||||
(gate_norm): GateAddNorm(
|
||||
(glu): GatedLinearUnit(
|
||||
(dropout): Dropout(p=0.1, inplace=False)
|
||||
(fc): Linear(in_features=8, out_features=32, bias=True)
|
||||
)
|
||||
(add_norm): AddNorm(
|
||||
(norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
|
||||
)
|
||||
)
|
||||
)
|
||||
(discount_in_percent): GatedResidualNetwork(
|
||||
(resample_norm): ResampleNorm(
|
||||
(resample): TimeDistributedInterpolation()
|
||||
(gate): Sigmoid()
|
||||
(norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
|
||||
)
|
||||
(fc1): Linear(in_features=8, out_features=8, bias=True)
|
||||
(elu): ELU(alpha=1.0)
|
||||
(fc2): Linear(in_features=8, out_features=8, bias=True)
|
||||
(gate_norm): GateAddNorm(
|
||||
(glu): GatedLinearUnit(
|
||||
(dropout): Dropout(p=0.1, inplace=False)
|
||||
(fc): Linear(in_features=8, out_features=32, bias=True)
|
||||
)
|
||||
(add_norm): AddNorm(
|
||||
(norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
|
||||
)
|
||||
)
|
||||
)
|
||||
(relative_time_idx): GatedResidualNetwork(
|
||||
(resample_norm): ResampleNorm(
|
||||
(resample): TimeDistributedInterpolation()
|
||||
(gate): Sigmoid()
|
||||
(norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
|
||||
)
|
||||
(fc1): Linear(in_features=8, out_features=8, bias=True)
|
||||
(elu): ELU(alpha=1.0)
|
||||
(fc2): Linear(in_features=8, out_features=8, bias=True)
|
||||
(gate_norm): GateAddNorm(
|
||||
(glu): GatedLinearUnit(
|
||||
(dropout): Dropout(p=0.1, inplace=False)
|
||||
(fc): Linear(in_features=8, out_features=32, bias=True)
|
||||
)
|
||||
(add_norm): AddNorm(
|
||||
(norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
|
||||
)
|
||||
)
|
||||
)
|
||||
)
|
||||
(prescalers): ModuleDict(
|
||||
(time_idx): Linear(in_features=1, out_features=8, bias=True)
|
||||
(price_regular): Linear(in_features=1, out_features=8, bias=True)
|
||||
(discount_in_percent): Linear(in_features=1, out_features=8, bias=True)
|
||||
(relative_time_idx): Linear(in_features=1, out_features=8, bias=True)
|
||||
)
|
||||
(softmax): Softmax(dim=-1)
|
||||
)
|
||||
(static_context_variable_selection): GatedResidualNetwork(
|
||||
(fc1): Linear(in_features=16, out_features=16, bias=True)
|
||||
(elu): ELU(alpha=1.0)
|
||||
(fc2): Linear(in_features=16, out_features=16, bias=True)
|
||||
(gate_norm): GateAddNorm(
|
||||
(glu): GatedLinearUnit(
|
||||
(dropout): Dropout(p=0.1, inplace=False)
|
||||
(fc): Linear(in_features=16, out_features=32, bias=True)
|
||||
)
|
||||
(add_norm): AddNorm(
|
||||
(norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
|
||||
)
|
||||
)
|
||||
)
|
||||
(static_context_initial_hidden_lstm): GatedResidualNetwork(
|
||||
(fc1): Linear(in_features=16, out_features=16, bias=True)
|
||||
(elu): ELU(alpha=1.0)
|
||||
(fc2): Linear(in_features=16, out_features=16, bias=True)
|
||||
(gate_norm): GateAddNorm(
|
||||
(glu): GatedLinearUnit(
|
||||
(dropout): Dropout(p=0.1, inplace=False)
|
||||
(fc): Linear(in_features=16, out_features=32, bias=True)
|
||||
)
|
||||
(add_norm): AddNorm(
|
||||
(norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
|
||||
)
|
||||
)
|
||||
)
|
||||
(static_context_initial_cell_lstm): GatedResidualNetwork(
|
||||
(fc1): Linear(in_features=16, out_features=16, bias=True)
|
||||
(elu): ELU(alpha=1.0)
|
||||
(fc2): Linear(in_features=16, out_features=16, bias=True)
|
||||
(gate_norm): GateAddNorm(
|
||||
(glu): GatedLinearUnit(
|
||||
(dropout): Dropout(p=0.1, inplace=False)
|
||||
(fc): Linear(in_features=16, out_features=32, bias=True)
|
||||
)
|
||||
(add_norm): AddNorm(
|
||||
(norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
|
||||
)
|
||||
)
|
||||
)
|
||||
(static_context_enrichment): GatedResidualNetwork(
|
||||
(fc1): Linear(in_features=16, out_features=16, bias=True)
|
||||
(elu): ELU(alpha=1.0)
|
||||
(fc2): Linear(in_features=16, out_features=16, bias=True)
|
||||
(gate_norm): GateAddNorm(
|
||||
(glu): GatedLinearUnit(
|
||||
(dropout): Dropout(p=0.1, inplace=False)
|
||||
(fc): Linear(in_features=16, out_features=32, bias=True)
|
||||
)
|
||||
(add_norm): AddNorm(
|
||||
(norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
|
||||
)
|
||||
)
|
||||
)
|
||||
(lstm_encoder): LSTM(16, 16, num_layers=2, batch_first=True, dropout=0.1)
|
||||
(lstm_decoder): LSTM(16, 16, num_layers=2, batch_first=True, dropout=0.1)
|
||||
(post_lstm_gate_encoder): GatedLinearUnit(
|
||||
(dropout): Dropout(p=0.1, inplace=False)
|
||||
(fc): Linear(in_features=16, out_features=32, bias=True)
|
||||
)
|
||||
(post_lstm_gate_decoder): GatedLinearUnit(
|
||||
(dropout): Dropout(p=0.1, inplace=False)
|
||||
(fc): Linear(in_features=16, out_features=32, bias=True)
|
||||
)
|
||||
(post_lstm_add_norm_encoder): AddNorm(
|
||||
(norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
|
||||
)
|
||||
(post_lstm_add_norm_decoder): AddNorm(
|
||||
(norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
|
||||
)
|
||||
(static_enrichment): GatedResidualNetwork(
|
||||
(fc1): Linear(in_features=16, out_features=16, bias=True)
|
||||
(elu): ELU(alpha=1.0)
|
||||
(context): Linear(in_features=16, out_features=16, bias=False)
|
||||
(fc2): Linear(in_features=16, out_features=16, bias=True)
|
||||
(gate_norm): GateAddNorm(
|
||||
(glu): GatedLinearUnit(
|
||||
(dropout): Dropout(p=0.1, inplace=False)
|
||||
(fc): Linear(in_features=16, out_features=32, bias=True)
|
||||
)
|
||||
(add_norm): AddNorm(
|
||||
(norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
|
||||
)
|
||||
)
|
||||
)
|
||||
(multihead_attn): InterpretableMultiHeadAttention(
|
||||
(dropout): Dropout(p=0.1, inplace=False)
|
||||
(v_layer): Linear(in_features=16, out_features=4, bias=True)
|
||||
(q_layers): ModuleList(
|
||||
(0): Linear(in_features=16, out_features=4, bias=True)
|
||||
(1): Linear(in_features=16, out_features=4, bias=True)
|
||||
(2): Linear(in_features=16, out_features=4, bias=True)
|
||||
(3): Linear(in_features=16, out_features=4, bias=True)
|
||||
)
|
||||
(k_layers): ModuleList(
|
||||
(0): Linear(in_features=16, out_features=4, bias=True)
|
||||
(1): Linear(in_features=16, out_features=4, bias=True)
|
||||
(2): Linear(in_features=16, out_features=4, bias=True)
|
||||
(3): Linear(in_features=16, out_features=4, bias=True)
|
||||
)
|
||||
(attention): ScaledDotProductAttention(
|
||||
(softmax): Softmax(dim=2)
|
||||
)
|
||||
(w_h): Linear(in_features=4, out_features=16, bias=False)
|
||||
)
|
||||
(post_attn_gate_norm): GateAddNorm(
|
||||
(glu): GatedLinearUnit(
|
||||
(dropout): Dropout(p=0.1, inplace=False)
|
||||
(fc): Linear(in_features=16, out_features=32, bias=True)
|
||||
)
|
||||
(add_norm): AddNorm(
|
||||
(norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
|
||||
)
|
||||
)
|
||||
(pos_wise_ff): GatedResidualNetwork(
|
||||
(fc1): Linear(in_features=16, out_features=16, bias=True)
|
||||
(elu): ELU(alpha=1.0)
|
||||
(fc2): Linear(in_features=16, out_features=16, bias=True)
|
||||
(gate_norm): GateAddNorm(
|
||||
(glu): GatedLinearUnit(
|
||||
(dropout): Dropout(p=0.1, inplace=False)
|
||||
(fc): Linear(in_features=16, out_features=32, bias=True)
|
||||
)
|
||||
(add_norm): AddNorm(
|
||||
(norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
|
||||
)
|
||||
)
|
||||
)
|
||||
(pre_output_gate_norm): GateAddNorm(
|
||||
(glu): GatedLinearUnit(
|
||||
(fc): Linear(in_features=16, out_features=32, bias=True)
|
||||
)
|
||||
(add_norm): AddNorm(
|
||||
(norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
|
||||
)
|
||||
)
|
||||
(output_layer): Linear(in_features=16, out_features=7, bias=True)
|
||||
)
|
||||
[flaml.automl: 07-28 22:08:05] {2725} INFO - fit succeeded
|
||||
[flaml.automl: 07-28 22:08:05] {2726} INFO - Time taken to find the best model: 1242.6435902118683
|
||||
[flaml.automl: 07-28 22:08:05] {2737} WARNING - Time taken to find the best model is 414% of the provided time budget and not all estimators' hyperparameter search converged. Consider increasing the time budget.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
```
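
For context, the block below is a minimal, untested sketch of the kind of call that produces the run log above. It assumes the Stallion beverage-sales panel used in the linked notebook; the `period` value and the `group_ids` argument are assumptions to verify against the notebook, not a verbatim excerpt.

```python
from flaml import AutoML

# Hypothetical setup (see the linked notebook for the real preprocessing):
# X_train holds one row per (series, time step) with a timestamp column and
# the "agency"/"sku" series identifiers; y_train holds the target volume.
automl = AutoML()
automl.fit(
    X_train=X_train,
    y_train=y_train,
    task="ts_forecast_panel",      # panel forecasting task (see docs below)
    time_budget=300,               # seconds; the run above overran its budget
    period=6,                      # forecast horizon in time steps (assumed)
    group_ids=["agency", "sku"],   # columns identifying each series (assumed)
)
print(automl.best_estimator)       # "temporal_fusion_transform" in the run above
y_pred = automl.predict(X_test)    # forecasts for a held-out panel frame
```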
[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/automl_time_series_forecast.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/automl_time_series_forecast.ipynb)
@@ -12,6 +12,7 @@
- 'regression': regression.
- 'ts_forecast': time series forecasting.
- 'ts_forecast_classification': time series forecasting for classification.
- 'ts_forecast_panel': time series forecasting for panel datasets (multiple time series).
- 'rank': learning to rank.
- 'seq-classification': sequence classification.
- 'seq-regression': sequence regression.
@@ -119,6 +120,7 @@ The estimator list can contain one or more estimator names, each corresponding t
- 'arima': ARIMA for task "ts_forecast". Hyperparameters: p, d, q.
- 'sarimax': SARIMAX for task "ts_forecast". Hyperparameters: p, d, q, P, D, Q, s.
- 'transformer': Huggingface transformer models for task "seq-classification", "seq-regression", "multichoice-classification", "token-classification" and "summarization". Hyperparameters: learning_rate, num_train_epochs, per_device_train_batch_size, warmup_ratio, weight_decay, adam_epsilon, seed.
- 'temporal_fusion_transform': TemporalFusionTransformerEstimator for task "ts_forecast_panel". Hyperparameters: gradient_clip_val, hidden_size, hidden_continuous_size, attention_head_size, dropout, learning_rate.
* Custom estimator. Use custom estimator for:
- tuning an estimator that is not built-in;
- customizing search space for a built-in estimator.
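
The custom-estimator hooks listed above pair with `AutoML.add_learner`. As a hedged sketch of the second use case, one can subclass a built-in estimator, override its `search_space` classmethod, and register the subclass; the `{"domain": ..., "init_value": ...}` schema mirrors FLAML's built-in estimators, and the exact `search_space` signature should be checked against the FLAML source.

```python
from flaml import AutoML, tune
from flaml.model import LGBMEstimator

class NarrowLGBM(LGBMEstimator):
    """LightGBM with a narrowed n_estimators range (illustrative only)."""

    @classmethod
    def search_space(cls, data_size, **params):
        # Start from the parent's space and tighten one dimension.
        space = super().search_space(data_size=data_size, **params)
        space["n_estimators"] = {
            "domain": tune.lograndint(lower=4, upper=512),
            "init_value": 4,
        }
        return space

automl = AutoML()
automl.add_learner(learner_name="narrow_lgbm", learner_class=NarrowLGBM)
automl.fit(X_train, y_train, task="regression",
           estimator_list=["narrow_lgbm"], time_budget=60)
```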