Add multioutput_train_size parameter for manual validation set specification in multi-output tasks

Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>
2026-02-09 02:09:16 +08:00 · 2026-01-21 06:40:23 +00:00
parent deecedaa0d
commit 79410bbf4d
2 changed files with 82 additions and 0 deletions
--- a/flaml/automl/automl.py
+++ b/flaml/automl/automl.py
@@ -361,6 +361,11 @@ class AutoML(BaseEstimator):
        }
        ```
            mlflow_logging: boolean, default=True | Whether to log the training results to mlflow. Not valid if mlflow is not installed.
+            multioutput_train_size: int, float or None, default=None | For multi-output regression tasks with
+                "holdout" evaluation, lets the caller choose the validation set: pass the training rows followed
+                by the validation rows as the training data and give the split position here. If int, the number of
+                leading samples used for training. If float (between 0.0 and 1.0), the proportion of samples used
+                for training. If None, no manual split is done. Only used when X_val and y_val are not provided.

        """
        if ERROR:
@@ -419,6 +424,7 @@ class AutoML(BaseEstimator):
        settings["custom_hp"] = settings.get("custom_hp", {})
        settings["skip_transform"] = settings.get("skip_transform", False)
        settings["mlflow_logging"] = settings.get("mlflow_logging", True)
+        settings["multioutput_train_size"] = settings.get("multioutput_train_size", None)

        self._estimator_type = "classifier" if settings["task"] in CLASSIFICATION else "regressor"
        self.best_run_id = None
@@ -1720,6 +1726,28 @@ class AutoML(BaseEstimator):
        """
        return self._metric_constraints

+    def _train_val_split(self, X, y, train_size):
+        """Split concatenated training and validation data at a fixed row position.
+
+        The leading rows of X and y become the training set and the remaining
+        rows become the validation set; the data is not shuffled.
+
+        Args:
+            X: Combined training and validation features, training rows first.
+            y: Combined training and validation labels, aligned row-for-row with X.
+            train_size: int or float - if int, number of leading samples for the
+                training set; if float, proportion (strictly between 0 and 1) of
+                samples for the training set, truncated to a whole row count.
+
+        Returns:
+            X_train, X_val, y_train, y_val
+
+        Raises:
+            TypeError: if train_size is not an int or a float.
+            ValueError: if a float train_size is outside (0, 1), or if the
+                training or validation part would be empty.
+        """
+        import numbers
+
+        # len() is not usable on scipy sparse inputs, so prefer shape[0] when present.
+        n_samples = X.shape[0] if hasattr(X, "shape") else len(X)
+
+        # bool is an Integral, but True/False is never a meaningful row count.
+        if isinstance(train_size, bool) or not isinstance(train_size, numbers.Real):
+            raise TypeError(
+                f"multioutput_train_size must be an int or a float, got {type(train_size).__name__}."
+            )
+
+        if isinstance(train_size, numbers.Integral):
+            n_train = int(train_size)
+        else:
+            # numbers.Real also matches numpy floating scalars such as np.float32,
+            # which are not subclasses of the builtin float.
+            if not 0.0 < train_size < 1.0:
+                raise ValueError(
+                    f"A float multioutput_train_size must be between 0.0 and 1.0 (exclusive), got {train_size}."
+                )
+            n_train = int(n_samples * train_size)
+
+        if not 0 < n_train < n_samples:
+            raise ValueError(
+                f"multioutput_train_size={train_size} gives {n_train} training samples out of {n_samples}; "
+                "both the training and the validation set must be non-empty."
+            )
+
+        def _take_rows(data, rows):
+            # .iloc makes the positional intent explicit for pandas objects.
+            return data.iloc[rows] if hasattr(data, "iloc") else data[rows]
+
+        train_rows = slice(None, n_train)
+        val_rows = slice(n_train, None)
+        return (
+            _take_rows(X, train_rows),
+            _take_rows(X, val_rows),
+            _take_rows(y, train_rows),
+            _take_rows(y, val_rows),
+        )
+
    def _prepare_data(self, eval_method, split_ratio, n_splits):
        self._state.task.prepare_data(
            self._state,
@@ -1793,6 +1821,7 @@ class AutoML(BaseEstimator):
        mlflow_logging=None,
        fit_kwargs_by_estimator=None,
        mlflow_exp_name=None,
+        multioutput_train_size=None,
        **fit_kwargs,
    ):
        """Find a model for a given task.
@@ -2110,6 +2139,11 @@ class AutoML(BaseEstimator):
            }
        }
        ```
+            multioutput_train_size: int, float or None, default=None | For multi-output regression tasks with
+                "holdout" evaluation, lets the caller choose the validation set: pass the training rows followed
+                by the validation rows as the training data and give the split position here. If int, the number of
+                leading samples used for training. If float (between 0.0 and 1.0), the proportion of samples used
+                for training. If None, no manual split is done. Only used when X_val and y_val are not provided.

            **fit_kwargs: Other key word arguments to pass to fit() function of
                the searched learners, such as sample_weight. Below are a few examples of
@@ -2312,6 +2346,19 @@ class AutoML(BaseEstimator):
                    self.mlflow_integration.only_history = True
            except KeyError:
                logger.info("Not in Fabric, Skipped")
+        
+        # Handle multioutput_train_size parameter
+        multioutput_train_size = (
+            self._settings.get("multioutput_train_size") if multioutput_train_size is None else multioutput_train_size
+        )
+        if multioutput_train_size is not None and X_val is None and y_val is None:
+            # Split the concatenated training data into train and validation sets
+            X_train, X_val, y_train, y_val = self._train_val_split(X_train, y_train, multioutput_train_size)
+            logger.info(
+                f"Split data using multioutput_train_size={multioutput_train_size}: "
+                f"train size={len(X_train)}, val size={len(X_val)}"
+            )
+        
        task.validate_data(
            self,
            self._state,
--- a/test/automl/test_regression.py
+++ b/test/automl/test_regression.py
@@ -244,6 +244,41 @@ def test_multioutput():
    print(model.predict(X_test))


+def test_multioutput_train_size():
+    """Test multioutput_train_size parameter for manual validation set specification.
+
+    AutoML is wrapped in ``MultiOutputRegressor`` and fitted on a 3-target
+    regression problem, once with an int and once with a float
+    ``multioutput_train_size``. The test only checks that fitting succeeds and
+    that predictions have one column per target.
+    """
+    from sklearn.multioutput import MultiOutputRegressor
+
+    # create multi-output regression data; X and y stand in for training rows
+    # followed by validation rows
+    X, y = make_regression(n_samples=100, n_features=10, n_targets=3, random_state=42)
+
+    # exercise both accepted forms: a row count (int) and a proportion (float)
+    for train_size in (70, 0.7):
+        model = MultiOutputRegressor(
+            AutoML(task="regression", time_budget=1, eval_method="holdout", multioutput_train_size=train_size)
+        )
+        model.fit(X, y)
+
+        # predict on a subset and verify one output column per target
+        predictions = model.predict(X[:10])
+        assert predictions.shape == (
+            10,
+            3,
+        ), f"multioutput_train_size={train_size!r}: expected shape (10, 3), got {predictions.shape}"
+        print(f"multioutput_train_size={train_size!r}: predictions shape {predictions.shape}")
+
+
@pytest.mark.parametrize(
    "estimator",
    [