Compare commits

...

26 Commits

Author SHA1 Message Date
Li Jiang
01c3c83653 Install wheel and setuptools (#1443) 2025-05-28 12:56:48 +08:00
Li Jiang
9b66103f7c Fix typo, add quotes to python-version (#1442) 2025-05-28 12:24:00 +08:00
Li Jiang
48dfd72e64 Fix CD actions (#1441)
* Fix CD actions

* Skip Build if no relevant changes
2025-05-28 10:45:27 +08:00
Li Jiang
dec92e5b02 Upgrade python 3.8 to 3.10 in github actions (#1440) 2025-05-27 21:34:21 +08:00
Li Jiang
22911ea1ef Merged PR 1685054: Add more logs and function wait_futures for easier post analysis (#1438)
- Add function wait_futures for easier post analysis
- Use logger instead of print

----
#### AI description (iteration 1)
#### PR Classification
A code enhancement for debugging asynchronous mlflow logging and improving post-run analysis.

#### PR Summary
This PR adds detailed debug logging to the mlflow integration and introduces a new `wait_futures` function to streamline the collection of asynchronous task results for improved analysis.
- `flaml/fabric/mlflow.py`: Added debug log statements around starting and ending mlflow runs to trace run IDs and execution flow.
- `flaml/automl/automl.py`: Implemented the `wait_futures` function to handle asynchronous task results and replaced a print call with `logger.info` for consistent logging (the collection pattern is sketched below this entry).

Related work items: #4029592
2025-05-27 15:32:56 +08:00
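A minimal sketch of the collection pattern that `wait_futures` introduces (the full implementation appears in the `flaml/automl/automl.py` diff further down); the executor and task labels here are illustrative stand-ins, not FLAML attributes:

```python
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("wait_futures_demo")

executor = ThreadPoolExecutor(max_workers=4)
# Map each future to a label so results and failures can be attributed later.
futures = {executor.submit(pow, 2, n): f"task_{n}" for n in range(4)}

for future in as_completed(futures):
    task = futures[future]
    try:
        result = future.result()
        logger.debug("Result for %s: %s", task, result)
    except Exception as e:
        logger.warning("Exception for %s: %s", task, e)

executor.shutdown(wait=True)
```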
murunlin
12183e5f73 Add the detailed info for parameter 'verbose' (#1435)
* explain-verbose-parameter

* concise-verbose-docstring

* explain-verbose-parameter

* explain-verbose-parameter

* test-ignore

* test-ignore

* sklearn-version-califonia

* submit-0526

---------

Co-authored-by: Runlin Mu (FESCO Adecco Human Resources) <v-runlinmu@microsoft.com>
Co-authored-by: Li Jiang <bnujli@gmail.com>
2025-05-27 10:01:01 +08:00
Li Jiang
c2b25310fc Sync Fabric till 2cd1c3da (#1433)
* Sync Fabric till 2cd1c3da

* Remove synapseml from tag names

* Fix 'NoneType' object has no attribute 'DataFrame'

* Deprecated 3.8 support

* Fix 'NoneType' object has no attribute 'DataFrame'

* Still use python 3.8 for pydoc

* Don't run tests in parallel

* Remove autofe and lowcode
2025-05-23 10:19:31 +08:00
murunlin
0f9420590d fix: best_model_for_estimator returns inconsistent feature_importances_ compared to automl.model (#1429)
* mrl-issue1422-0513

* fix version dependency

* fix datasets version

* test completion

---------

Co-authored-by: Runlin Mu (FESCO Adecco Human Resources) <v-runlinmu@microsoft.com>
Co-authored-by: Li Jiang <bnujli@gmail.com>
2025-05-15 09:37:34 +08:00
hexiang-x
5107c506b4 fix: When use_spark = True and mlflow_logging = True are set, an error is reported when logging the best model: 'NoneType' object has no attribute 'save' (#1432) 2025-05-14 19:34:06 +08:00
dependabot[bot]
9e219ef8dc Bump http-proxy-middleware from 2.0.7 to 2.0.9 in /website (#1425)
Bumps [http-proxy-middleware](https://github.com/chimurai/http-proxy-middleware) from 2.0.7 to 2.0.9.
- [Release notes](https://github.com/chimurai/http-proxy-middleware/releases)
- [Changelog](https://github.com/chimurai/http-proxy-middleware/blob/v2.0.9/CHANGELOG.md)
- [Commits](https://github.com/chimurai/http-proxy-middleware/compare/v2.0.7...v2.0.9)

---
updated-dependencies:
- dependency-name: http-proxy-middleware
  dependency-version: 2.0.9
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Li Jiang <bnujli@gmail.com>
2025-04-23 14:22:12 +08:00
Li Jiang
6e4083743b Revert "Numpy 2.x is not supported yet. (#1424)" (#1426)
This reverts commit 17e95edd9e.
2025-04-22 21:31:44 +08:00
Li Jiang
17e95edd9e Numpy 2.x is not supported yet. (#1424) 2025-04-22 12:11:27 +08:00
Stickic-cyber
468bc62d27 Fix issue with "list index out of range" when max_iter=1 (#1419) 2025-04-09 21:54:17 +08:00
dependabot[bot]
437c239c11 Bump @babel/helpers from 7.20.1 to 7.26.10 in /website (#1413)
Bumps [@babel/helpers](https://github.com/babel/babel/tree/HEAD/packages/babel-helpers) from 7.20.1 to 7.26.10.
- [Release notes](https://github.com/babel/babel/releases)
- [Changelog](https://github.com/babel/babel/blob/main/CHANGELOG.md)
- [Commits](https://github.com/babel/babel/commits/v7.26.10/packages/babel-helpers)

---
updated-dependencies:
- dependency-name: "@babel/helpers"
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Li Jiang <bnujli@gmail.com>
2025-03-14 15:51:06 +08:00
dependabot[bot]
8e753f1092 Bump @babel/runtime from 7.20.1 to 7.26.10 in /website (#1414)
Bumps [@babel/runtime](https://github.com/babel/babel/tree/HEAD/packages/babel-runtime) from 7.20.1 to 7.26.10.
- [Release notes](https://github.com/babel/babel/releases)
- [Changelog](https://github.com/babel/babel/blob/main/CHANGELOG.md)
- [Commits](https://github.com/babel/babel/commits/v7.26.10/packages/babel-runtime)

---
updated-dependencies:
- dependency-name: "@babel/runtime"
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Li Jiang <bnujli@gmail.com>
2025-03-13 21:34:02 +08:00
dependabot[bot]
a3b57e11d4 Bump prismjs from 1.29.0 to 1.30.0 in /website (#1411)
Bumps [prismjs](https://github.com/PrismJS/prism) from 1.29.0 to 1.30.0.
- [Release notes](https://github.com/PrismJS/prism/releases)
- [Changelog](https://github.com/PrismJS/prism/blob/master/CHANGELOG.md)
- [Commits](https://github.com/PrismJS/prism/compare/v1.29.0...v1.30.0)

---
updated-dependencies:
- dependency-name: prismjs
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Li Jiang <bnujli@gmail.com>
2025-03-13 14:06:41 +08:00
dependabot[bot]
a80dcf9925 Bump @babel/runtime-corejs3 from 7.20.1 to 7.26.10 in /website (#1412)
Bumps [@babel/runtime-corejs3](https://github.com/babel/babel/tree/HEAD/packages/babel-runtime-corejs3) from 7.20.1 to 7.26.10.
- [Release notes](https://github.com/babel/babel/releases)
- [Changelog](https://github.com/babel/babel/blob/main/CHANGELOG.md)
- [Commits](https://github.com/babel/babel/commits/v7.26.10/packages/babel-runtime-corejs3)

---
updated-dependencies:
- dependency-name: "@babel/runtime-corejs3"
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-03-13 10:04:03 +08:00
SkBlaz
7157af44e0 Improved error handling in case no scikit present (#1402)
* Improved error handling in case no scikit present

Currently there is no description for when this error is thrown. Being explicit seems of value.

* Update histgb.py

---------

Co-authored-by: Li Jiang <bnujli@gmail.com>
2025-03-03 15:39:43 +08:00
Li Jiang
1798c4591e Upgrade setuptools (#1410) 2025-03-01 08:05:51 +08:00
Li Jiang
dd26263330 Bump version to 2.3.5 (#1409) 2025-02-17 22:26:59 +08:00
Li Jiang
2ba5f8bed1 Fix params pop error (#1408) 2025-02-17 15:06:05 +08:00
Daniel Grindrod
d0a11958a5 fix: Fixed bug where group folds and sample weights couldn't be used in the same automl instance (#1405) 2025-02-15 10:41:27 +08:00
dependabot[bot]
0ef9b00a75 Bump serialize-javascript from 6.0.0 to 6.0.2 in /website (#1407)
Bumps [serialize-javascript](https://github.com/yahoo/serialize-javascript) from 6.0.0 to 6.0.2.
- [Release notes](https://github.com/yahoo/serialize-javascript/releases)
- [Commits](https://github.com/yahoo/serialize-javascript/compare/v6.0.0...v6.0.2)

---
updated-dependencies:
- dependency-name: serialize-javascript
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Li Jiang <bnujli@gmail.com>
2025-02-14 12:36:49 +08:00
Will Charles
840f76e5e5 Changed tune.report import for ray>=2 (#1392)
* Changed tune.report import for ray>=2

* env: Changed pydantic restriction in env

* Reverted Pydantic install conditions

* Reverted Pydantic install conditions

* test: Check if GPU is available

* tests: uncommented a line

* tests: Better fix for Ray GPU checking

* tests: Added timeout to dataset loading

* tests: Deleted _test_hf_data()

* test: Reduce lrl2 dataset size

* bug: timeout error

* bug: timeout error

* fix: Added threading check for timeout issue

* Undo old commits

* Timeout fix from #1406

---------

Co-authored-by: Daniel Grindrod <dannycg1996@gmail.com>
2025-02-14 09:38:33 +08:00
Li Jiang
d8b7d25b80 Fix test hang issue (#1406)
* Add try except to resource.setrlimit

* Set time limit only in main thread

* Check only test model

* Pytest debug

* Test separately

* Move test_model.py to automl folder
2025-02-13 19:50:35 +08:00
Li Jiang
6d53929803 Bump version to 2.3.4 (#1389) 2024-12-18 12:49:59 +08:00
43 changed files with 1268 additions and 207 deletions

View File

@@ -12,26 +12,17 @@ jobs:
deploy:
strategy:
matrix:
os: ['ubuntu-latest']
python-version: [3.8]
os: ["ubuntu-latest"]
python-version: ["3.10"]
runs-on: ${{ matrix.os }}
environment: package
steps:
- name: Checkout
uses: actions/checkout@v3
- name: Cache conda
uses: actions/cache@v3
uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
path: ~/conda_pkgs_dir
key: conda-${{ matrix.os }}-python-${{ matrix.python-version }}-${{ hashFiles('environment.yml') }}
- name: Setup Miniconda
uses: conda-incubator/setup-miniconda@v2
with:
auto-update-conda: true
auto-activate-base: false
activate-environment: hcrystalball
python-version: ${{ matrix.python-version }}
use-only-tar-bz2: true
- name: Install from source
# This is required for the pre-commit tests
shell: pwsh
@@ -42,7 +33,7 @@ jobs:
- name: Build
shell: pwsh
run: |
pip install twine
pip install twine wheel setuptools
python setup.py sdist bdist_wheel
- name: Publish to PyPI
env:

View File

@@ -37,11 +37,11 @@ jobs:
- name: setup python
uses: actions/setup-python@v4
with:
python-version: "3.8"
python-version: "3.10"
- name: pydoc-markdown install
run: |
python -m pip install --upgrade pip
pip install pydoc-markdown==4.5.0
pip install pydoc-markdown==4.7.0
- name: pydoc-markdown run
run: |
pydoc-markdown
@@ -73,11 +73,11 @@ jobs:
- name: setup python
uses: actions/setup-python@v4
with:
python-version: "3.8"
python-version: "3.10"
- name: pydoc-markdown install
run: |
python -m pip install --upgrade pip
pip install pydoc-markdown==4.5.0
pip install pydoc-markdown==4.7.0
- name: pydoc-markdown run
run: |
pydoc-markdown

View File

@@ -14,6 +14,12 @@ on:
- 'setup.py'
pull_request:
branches: ['main']
paths:
- 'flaml/**'
- 'test/**'
- 'notebook/**'
- '.github/workflows/python-package.yml'
- 'setup.py'
merge_group:
types: [checks_requested]
@@ -30,7 +36,7 @@ jobs:
fail-fast: false
matrix:
os: [ubuntu-latest, macos-latest, windows-2019]
python-version: ["3.8", "3.9", "3.10", "3.11"]
python-version: ["3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
@@ -50,7 +56,7 @@ jobs:
export LDFLAGS="$LDFLAGS -Wl,-rpath,/usr/local/opt/libomp/lib -L/usr/local/opt/libomp/lib -lomp"
- name: Install packages and dependencies
run: |
python -m pip install --upgrade pip wheel
python -m pip install --upgrade pip wheel setuptools
pip install -e .
python -c "import flaml"
pip install -e .[test]
@@ -85,7 +91,7 @@ jobs:
- name: Test with pytest
if: matrix.python-version != '3.10'
run: |
pytest test
pytest test/
- name: Coverage
if: matrix.python-version == '3.10'
run: |

View File

@@ -1,5 +1,5 @@
# basic setup
FROM mcr.microsoft.com/devcontainers/python:3.8
FROM mcr.microsoft.com/devcontainers/python:3.10
RUN apt-get update && apt-get -y update
RUN apt-get install -y sudo git npm

View File

@@ -40,7 +40,7 @@ FLAML has a .NET implementation in [ML.NET](http://dot.net/ml), an open-source,
## Installation
FLAML requires **Python version >= 3.8**. It can be installed from pip:
FLAML requires **Python version >= 3.9**. It can be installed from pip:
```bash
pip install flaml
```

View File

@@ -156,7 +156,7 @@ class MathUserProxyAgent(UserProxyAgent):
when the number of auto reply reaches the max_consecutive_auto_reply or when is_termination_msg is True.
default_auto_reply (str or dict or None): the default auto reply message when no code execution or llm based reply is generated.
max_invalid_q_per_step (int): (ADDED) the maximum number of invalid queries per step.
**kwargs (dict): other kwargs in [UserProxyAgent](user_proxy_agent#__init__).
**kwargs (dict): other kwargs in [UserProxyAgent](../user_proxy_agent#__init__).
"""
super().__init__(
name=name,

View File

@@ -123,7 +123,7 @@ class RetrieveUserProxyAgent(UserProxyAgent):
can be found at `https://www.sbert.net/docs/pretrained_models.html`. The default model is a
fast model. If you want to use a high performance model, `all-mpnet-base-v2` is recommended.
- customized_prompt (Optional, str): the customized prompt for the retrieve chat. Default is None.
**kwargs (dict): other kwargs in [UserProxyAgent](user_proxy_agent#__init__).
**kwargs (dict): other kwargs in [UserProxyAgent](../user_proxy_agent#__init__).
"""
super().__init__(
name=name,

View File

@@ -10,6 +10,7 @@ import os
import random
import sys
import time
from concurrent.futures import as_completed
from functools import partial
from typing import Callable, List, Optional, Union
@@ -187,9 +188,16 @@ class AutoML(BaseEstimator):
mem_thres: A float of the memory size constraint in bytes.
pred_time_limit: A float of the prediction latency constraint in seconds.
It refers to the average prediction time per row in validation data.
train_time_limit: A float of the training time constraint in seconds.
train_time_limit: None or a float of the training time constraint in seconds for each trial.
Only valid for sequential search.
verbose: int, default=3 | Controls the verbosity, higher means more
messages.
verbose=0: logger level = CRITICAL
verbose=1: logger level = ERROR
verbose=2: logger level = WARNING
verbose=3: logger level = INFO
verbose=4: logger level = DEBUG
verbose>5: logger level = NOTSET
retrain_full: bool or str, default=True | whether to retrain the
selected model on the full training data when using holdout.
True - retrain only after search finishes; False - no retraining;
@@ -424,6 +432,8 @@ class AutoML(BaseEstimator):
If `model_history` was set to True, then the returned model is trained.
"""
state = self._search_states.get(estimator_name)
if state and estimator_name == self._best_estimator:
return self.model
return state and getattr(state, "trained_estimator", None)
@property
@@ -1332,7 +1342,8 @@ class AutoML(BaseEstimator):
mem_thres: A float of the memory size constraint in bytes.
pred_time_limit: A float of the prediction latency constraint in seconds.
It refers to the average prediction time per row in validation data.
train_time_limit: None or a float of the training time constraint in seconds.
train_time_limit: None or a float of the training time constraint in seconds for each trial.
Only valid for sequential search.
X_val: None or a numpy array or a pandas dataframe of validation data.
y_val: None or a numpy array or a pandas series of validation labels.
sample_weight_val: None or a numpy array of the sample weight of
@@ -1345,6 +1356,12 @@ class AutoML(BaseEstimator):
for training data.
verbose: int, default=3 | Controls the verbosity, higher means more
messages.
verbose=0: logger level = CRITICAL
verbose=1: logger level = ERROR
verbose=2: logger level = WARNING
verbose=3: logger level = INFO
verbose=4: logger level = DEBUG
verbose>5: logger level = NOTSET
retrain_full: bool or str, default=True | whether to retrain the
selected model on the full training data when using holdout.
True - retrain only after search finishes; False - no retraining;
@@ -1623,6 +1640,13 @@ class AutoML(BaseEstimator):
_ch.setFormatter(logger_formatter)
logger.addHandler(_ch)
if model_history:
logger.warning(
"With `model_history` set to `True` by default, all intermediate models are retained in memory, "
"which may significantly increase memory usage and slow down training. "
"Consider setting `model_history=False` to optimize memory and accelerate the training process."
)
if not use_ray and not use_spark and n_concurrent_trials > 1:
if ray_available:
logger.warning(
@@ -1708,7 +1732,7 @@ class AutoML(BaseEstimator):
if not (mlflow.active_run() is not None or is_autolog_enabled()):
self.mlflow_integration.only_history = True
except KeyError:
print("Not in Fabric, Skipped")
logger.info("Not in Fabric, Skipped")
task.validate_data(
self,
self._state,
@@ -2529,6 +2553,21 @@ class AutoML(BaseEstimator):
self._selected = state = self._search_states[estimator]
state.best_config_sample_size = self._state.data_size[0]
state.best_config = state.init_config[0] if state.init_config else {}
self._track_iter = 0
self._config_history[self._track_iter] = (estimator, state.best_config, self._state.time_from_start)
self._best_iteration = self._track_iter
state.val_loss = getattr(state, "val_loss", float("inf"))
state.best_loss = getattr(state, "best_loss", float("inf"))
state.config = getattr(state, "config", state.best_config.copy())
state.metric_for_logging = getattr(state, "metric_for_logging", None)
state.sample_size = getattr(state, "sample_size", self._state.data_size[0])
state.learner_class = getattr(state, "learner_class", self._state.learner_classes.get(estimator))
if hasattr(self, "mlflow_integration") and self.mlflow_integration:
self.mlflow_integration.record_state(
automl=self,
search_state=state,
estimator=estimator,
)
elif self._use_ray is False and self._use_spark is False:
self._search_sequential()
else:
@@ -2700,16 +2739,47 @@ class AutoML(BaseEstimator):
):
if mlflow.active_run() is None:
mlflow.start_run(run_id=self.mlflow_integration.parent_run_id)
self.mlflow_integration.log_model(
self._trained_estimator.model,
self.best_estimator,
signature=self.estimator_signature,
)
self.mlflow_integration.pickle_and_log_automl_artifacts(
self, self.model, self.best_estimator, signature=self.pipeline_signature
)
if self.best_estimator.endswith("_spark"):
self.mlflow_integration.log_model(
self._trained_estimator.model,
self.best_estimator,
signature=self.estimator_signature,
run_id=self.mlflow_integration.parent_run_id,
)
else:
self.mlflow_integration.pickle_and_log_automl_artifacts(
self,
self.model,
self.best_estimator,
signature=self.pipeline_signature,
run_id=self.mlflow_integration.parent_run_id,
)
else:
logger.info("not retraining because the time budget is too small.")
logger.warning("not retraining because the time budget is too small.")
self.wait_futures()
def wait_futures(self):
if self.mlflow_integration is not None:
logger.debug("Collecting results from submitted record_state tasks")
t1 = time.perf_counter()
for future in as_completed(self.mlflow_integration.futures):
_task = self.mlflow_integration.futures[future]
try:
result = future.result()
logger.debug(f"Result for record_state task {_task}: {result}")
except Exception as e:
logger.warning(f"Exception for record_state task {_task}: {e}")
for future in as_completed(self.mlflow_integration.futures_log_model):
_task = self.mlflow_integration.futures_log_model[future]
try:
result = future.result()
logger.debug(f"Result for log_model task {_task}: {result}")
except Exception as e:
logger.warning(f"Exception for log_model task {_task}: {e}")
t2 = time.perf_counter()
logger.debug(f"Collecting results from tasks submitted to executors costs {t2-t1} seconds.")
else:
logger.debug("No futures to wait for.")
def __del__(self):
if (
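A minimal usage sketch of the `verbose` levels documented in the docstring above; the toy dataset and time budget are illustrative:

```python
from sklearn.datasets import load_iris
from flaml import AutoML

X, y = load_iris(return_X_y=True)
automl = AutoML()
# verbose=3 (default) -> INFO; verbose=4 -> DEBUG; verbose=0 -> CRITICAL
automl.fit(X, y, task="classification", time_budget=10, verbose=4)
```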

View File

@@ -1,7 +1,7 @@
try:
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
except ImportError:
pass
except ImportError as e:
print(f"scikit-learn is required for HistGradientBoostingEstimator. Please install it; error: {e}")
from flaml import tune
from flaml.automl.model import SKLearnEstimator

View File

@@ -2,13 +2,17 @@
# * Copyright (c) Microsoft Corporation. All rights reserved.
# * Licensed under the MIT License. See LICENSE file in the
# * project root for license information.
import json
import os
from datetime import datetime
import random
import uuid
from datetime import datetime, timedelta
from decimal import ROUND_HALF_UP, Decimal
from typing import TYPE_CHECKING, Union
import numpy as np
from flaml.automl.spark import DataFrame, Series, pd, ps, psDataFrame, psSeries
from flaml.automl.spark import DataFrame, F, Series, T, pd, ps, psDataFrame, psSeries
from flaml.automl.training_log import training_log_reader
try:
@@ -19,6 +23,7 @@ except ImportError:
if TYPE_CHECKING:
from flaml.automl.task import Task
TS_TIMESTAMP_COL = "ds"
TS_VALUE_COL = "y"
@@ -445,3 +450,331 @@ class DataTransformer:
def group_counts(groups):
_, i, c = np.unique(groups, return_counts=True, return_index=True)
return c[np.argsort(i)]
def get_random_dataframe(n_rows: int = 200, ratio_none: float = 0.1, seed: int = 42) -> DataFrame:
"""Generate a random pandas DataFrame with various data types for testing.
This function creates a DataFrame with multiple column types including:
- Timestamps
- Integers
- Floats
- Categorical values
- Booleans
- Lists (tags)
- Decimal strings
- UUIDs
- Binary data (as hex strings)
- JSON blobs
- Nullable text fields
Parameters
----------
n_rows : int, default=200
Number of rows in the generated DataFrame
ratio_none : float, default=0.1
Probability of generating None values in applicable columns
seed : int, default=42
Random seed for reproducibility
Returns
-------
pd.DataFrame
A DataFrame with 14 columns of various data types
Examples
--------
>>> df = get_random_dataframe(100, 0.05, 123)
>>> df.shape
(100, 14)
>>> df.dtypes
timestamp datetime64[ns]
id int64
score float64
status object
flag object
count object
value object
tags object
rating object
uuid object
binary object
json_blob object
category category
nullable_text object
dtype: object
"""
np.random.seed(seed)
random.seed(seed)
def random_tags():
tags = ["AI", "ML", "data", "robotics", "vision"]
return random.sample(tags, k=random.randint(1, 3)) if random.random() > ratio_none else None
def random_decimal():
return (
str(Decimal(random.uniform(1, 5)).quantize(Decimal("0.01"), rounding=ROUND_HALF_UP))
if random.random() > ratio_none
else None
)
def random_json_blob():
blob = {"a": random.randint(1, 10), "b": random.random()}
return json.dumps(blob) if random.random() > ratio_none else None
def random_binary():
return bytes(random.randint(0, 255) for _ in range(4)).hex() if random.random() > ratio_none else None
data = {
"timestamp": [
datetime(2020, 1, 1) + timedelta(days=np.random.randint(0, 1000)) if np.random.rand() > ratio_none else None
for _ in range(n_rows)
],
"id": range(1, n_rows + 1),
"score": np.random.uniform(0, 100, n_rows),
"status": np.random.choice(
["active", "inactive", "pending", None],
size=n_rows,
p=[(1 - ratio_none) / 3, (1 - ratio_none) / 3, (1 - ratio_none) / 3, ratio_none],
),
"flag": np.random.choice(
[True, False, None], size=n_rows, p=[(1 - ratio_none) / 2, (1 - ratio_none) / 2, ratio_none]
),
"count": [np.random.randint(0, 100) if np.random.rand() > ratio_none else None for _ in range(n_rows)],
"value": [round(np.random.normal(50, 15), 2) if np.random.rand() > ratio_none else None for _ in range(n_rows)],
"tags": [random_tags() for _ in range(n_rows)],
"rating": [random_decimal() for _ in range(n_rows)],
"uuid": [str(uuid.uuid4()) if np.random.rand() > ratio_none else None for _ in range(n_rows)],
"binary": [random_binary() for _ in range(n_rows)],
"json_blob": [random_json_blob() for _ in range(n_rows)],
"category": pd.Categorical(
np.random.choice(
["A", "B", "C", None],
size=n_rows,
p=[(1 - ratio_none) / 3, (1 - ratio_none) / 3, (1 - ratio_none) / 3, ratio_none],
)
),
"nullable_text": [random.choice(["Good", "Bad", "Average", None]) for _ in range(n_rows)],
}
return pd.DataFrame(data)
def auto_convert_dtypes_spark(
df: psDataFrame,
na_values: list = None,
category_threshold: float = 0.3,
convert_threshold: float = 0.6,
sample_ratio: float = 0.1,
) -> tuple[psDataFrame, dict]:
"""Automatically convert data types in a PySpark DataFrame using heuristics.
This function analyzes a sample of the DataFrame to infer appropriate data types
and applies the conversions. It handles timestamps, numeric values, booleans,
and categorical fields.
Args:
df: A PySpark DataFrame to convert.
na_values: List of strings to be considered as NA/NaN. Defaults to
['NA', 'na', 'NULL', 'null', ''].
category_threshold: Maximum ratio of unique values to total values
to consider a column categorical. Defaults to 0.3.
convert_threshold: Minimum ratio of successfully converted values required
to apply a type conversion. Defaults to 0.6.
sample_ratio: Fraction of data to sample for type inference. Defaults to 0.1.
Returns:
tuple: (The DataFrame with converted types, A dictionary mapping column names to
their inferred types as strings)
Note:
- 'category' in the schema dict is conceptual as PySpark doesn't have a true
category type like pandas
- The function uses sampling for efficiency with large datasets
"""
n_rows = df.count()
if na_values is None:
na_values = ["NA", "na", "NULL", "null", ""]
# Normalize NA-like values
for colname, coltype in df.dtypes:
if coltype == "string":
df = df.withColumn(
colname,
F.when(F.trim(F.lower(F.col(colname))).isin([v.lower() for v in na_values]), None).otherwise(
F.col(colname)
),
)
schema = {}
for colname in df.columns:
# Sample once at an appropriate ratio
sample_ratio_to_use = min(1.0, sample_ratio if n_rows * sample_ratio > 100 else 100 / n_rows)
col_sample = df.select(colname).sample(withReplacement=False, fraction=sample_ratio_to_use).dropna()
sample_count = col_sample.count()
inferred_type = "string" # Default
if col_sample.dtypes[0][1] != "string":
schema[colname] = col_sample.dtypes[0][1]
continue
if sample_count == 0:
schema[colname] = "string"
continue
# Check if timestamp
ts_col = col_sample.withColumn("parsed", F.to_timestamp(F.col(colname)))
# Check numeric
if (
col_sample.withColumn("n", F.col(colname).cast("double")).filter("n is not null").count()
>= sample_count * convert_threshold
):
# All whole numbers?
all_whole = (
col_sample.withColumn("n", F.col(colname).cast("double"))
.filter("n is not null")
.withColumn("frac", F.abs(F.col("n") % 1))
.filter("frac > 0.000001")
.count()
== 0
)
inferred_type = "int" if all_whole else "double"
# Check low-cardinality (category-like)
elif (
sample_count > 0
and col_sample.select(F.countDistinct(F.col(colname))).collect()[0][0] / sample_count <= category_threshold
):
inferred_type = "category" # Will just be string, but marked as such
# Check if timestamp
elif ts_col.filter(F.col("parsed").isNotNull()).count() >= sample_count * convert_threshold:
inferred_type = "timestamp"
schema[colname] = inferred_type
# Apply inferred schema
for colname, inferred_type in schema.items():
if inferred_type == "int":
df = df.withColumn(colname, F.col(colname).cast(T.IntegerType()))
elif inferred_type == "double":
df = df.withColumn(colname, F.col(colname).cast(T.DoubleType()))
elif inferred_type == "boolean":
df = df.withColumn(
colname,
F.when(F.lower(F.col(colname)).isin("true", "yes", "1"), True)
.when(F.lower(F.col(colname)).isin("false", "no", "0"), False)
.otherwise(None),
)
elif inferred_type == "timestamp":
df = df.withColumn(colname, F.to_timestamp(F.col(colname)))
elif inferred_type == "category":
df = df.withColumn(colname, F.col(colname).cast(T.StringType())) # Marked conceptually
# otherwise keep as string (or original type)
return df, schema
def auto_convert_dtypes_pandas(
df: DataFrame,
na_values: list = None,
category_threshold: float = 0.3,
convert_threshold: float = 0.6,
sample_ratio: float = 1.0,
) -> tuple[DataFrame, dict]:
"""Automatically convert data types in a pandas DataFrame using heuristics.
This function analyzes the DataFrame to infer appropriate data types
and applies the conversions. It handles timestamps, timedeltas, numeric values,
and categorical fields.
Args:
df: A pandas DataFrame to convert.
na_values: List of strings to be considered as NA/NaN. Defaults to
['NA', 'na', 'NULL', 'null', ''].
category_threshold: Maximum ratio of unique values to total values
to consider a column categorical. Defaults to 0.3.
convert_threshold: Minimum ratio of successfully converted values required
to apply a type conversion. Defaults to 0.6.
sample_ratio: Fraction of data to sample for type inference. Not used in pandas version
but included for API compatibility. Defaults to 1.0.
Returns:
tuple: (The DataFrame with converted types, A dictionary mapping column names to
their inferred types as strings)
"""
if na_values is None:
na_values = {"NA", "na", "NULL", "null", ""}
df_converted = df.convert_dtypes()
schema = {}
# Sample if needed (for API compatibility)
if sample_ratio < 1.0:
df = df.sample(frac=sample_ratio)
n_rows = len(df)
for col in df.columns:
series = df[col]
# Replace NA-like values if string
series_cleaned = series.map(lambda x: np.nan if isinstance(x, str) and x.strip() in na_values else x)
# Skip conversion if already non-object data type, except bool which can potentially be categorical
if (
not isinstance(series_cleaned.dtype, pd.BooleanDtype)
and not isinstance(series_cleaned.dtype, pd.StringDtype)
and series_cleaned.dtype != "object"
):
# Keep the original data type for non-object dtypes
df_converted[col] = series
schema[col] = str(series_cleaned.dtype)
continue
# print(f"type: {series_cleaned.dtype}, column: {series_cleaned.name}")
if not isinstance(series_cleaned.dtype, pd.BooleanDtype):
# Try numeric (int or float)
numeric = pd.to_numeric(series_cleaned, errors="coerce")
if numeric.notna().sum() >= n_rows * convert_threshold:
if (numeric.dropna() % 1 == 0).all():
try:
df_converted[col] = numeric.astype("int") # Nullable integer
schema[col] = "int"
continue
except Exception:
pass
df_converted[col] = numeric.astype("double")
schema[col] = "double"
continue
# Try datetime
datetime_converted = pd.to_datetime(series_cleaned, errors="coerce")
if datetime_converted.notna().sum() >= n_rows * convert_threshold:
df_converted[col] = datetime_converted
schema[col] = "timestamp"
continue
# Try timedelta
try:
timedelta_converted = pd.to_timedelta(series_cleaned, errors="coerce")
if timedelta_converted.notna().sum() >= n_rows * convert_threshold:
df_converted[col] = timedelta_converted
schema[col] = "timedelta"
continue
except TypeError:
pass
# Try category
try:
unique_ratio = series_cleaned.nunique(dropna=True) / n_rows if n_rows > 0 else 1.0
if unique_ratio <= category_threshold:
df_converted[col] = series_cleaned.astype("category")
schema[col] = "category"
continue
except Exception:
pass
df_converted[col] = series_cleaned.astype("string")
schema[col] = "string"
return df_converted, schema
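A minimal usage sketch for the two helpers added above, assuming they are exposed from `flaml.automl.data` (the file name is not shown in this view):

```python
from flaml.automl.data import auto_convert_dtypes_pandas, get_random_dataframe

df = get_random_dataframe(n_rows=100, ratio_none=0.05, seed=123)
print(df.shape)  # (100, 14), per the docstring above
converted, schema = auto_convert_dtypes_pandas(df)
print(schema)  # column -> inferred type, e.g. 'int', 'double', 'category'
```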

View File

@@ -1,7 +1,37 @@
import logging
import os
class ColoredFormatter(logging.Formatter):
# ANSI escape codes for colors
COLORS = {
# logging.DEBUG: "\033[36m", # Cyan
# logging.INFO: "\033[32m", # Green
logging.WARNING: "\033[33m", # Yellow
logging.ERROR: "\033[31m", # Red
logging.CRITICAL: "\033[1;31m", # Bright Red
}
RESET = "\033[0m" # Reset to default
def __init__(self, fmt, datefmt, use_color=True):
super().__init__(fmt, datefmt)
self.use_color = use_color
def format(self, record):
formatted = super().format(record)
if self.use_color:
color = self.COLORS.get(record.levelno, "")
if color:
return f"{color}{formatted}{self.RESET}"
return formatted
logger = logging.getLogger(__name__)
logger_formatter = logging.Formatter(
"[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s", "%m-%d %H:%M:%S"
use_color = True
if os.getenv("FLAML_LOG_NO_COLOR"):
use_color = False
logger_formatter = ColoredFormatter(
"[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s", "%m-%d %H:%M:%S", use_color
)
logger.propagate = False
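A minimal sketch of wiring the `ColoredFormatter` defined above to a console handler; the import path uses the new `flaml/tune/logger.py` module added later in this compare (this file's path is not shown). Per the diff, setting the `FLAML_LOG_NO_COLOR` environment variable before import disables the ANSI codes for the module-level formatter:

```python
import logging

from flaml.tune.logger import ColoredFormatter  # module added in this compare

handler = logging.StreamHandler()
handler.setFormatter(
    ColoredFormatter(
        "[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s",
        "%m-%d %H:%M:%S",
        use_color=True,
    )
)
log = logging.getLogger("color_demo")
log.addHandler(handler)
log.warning("rendered in yellow")  # WARNING maps to ANSI \033[33m
```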

View File

@@ -9,6 +9,7 @@ import os
import shutil
import signal
import sys
import threading
import time
import warnings
from contextlib import contextmanager
@@ -89,21 +90,25 @@ def limit_resource(memory_limit, time_limit):
except ValueError:
# According to https://bugs.python.org/issue40518, it's a mac-specific error.
pass
main_thread = False
if time_limit is not None:
alarm_set = False
if time_limit is not None and threading.current_thread() is threading.main_thread():
try:
signal.signal(signal.SIGALRM, TimeoutHandler)
signal.alarm(int(time_limit) or 1)
main_thread = True
alarm_set = True
except ValueError:
pass
try:
yield
finally:
if main_thread:
if alarm_set:
signal.alarm(0)
if memory_limit > 0:
resource.setrlimit(resource.RLIMIT_AS, (soft, hard))
try:
resource.setrlimit(resource.RLIMIT_AS, (soft, hard))
except ValueError:
pass
class BaseEstimator:
@@ -130,7 +135,7 @@ class BaseEstimator:
self._task = task if isinstance(task, Task) else task_factory(task, None, None)
self.params = self.config2params(config)
self.estimator_class = self._model = None
if "_estimator_type" in config:
if "_estimator_type" in self.params:
self._estimator_type = self.params.pop("_estimator_type")
else:
self._estimator_type = "classifier" if self._task.is_classification() else "regressor"
@@ -1691,7 +1696,7 @@ class XGBoostEstimator(SKLearnEstimator):
# use_label_encoder is deprecated in 1.7.
if xgboost_version < "1.7.0":
params["use_label_encoder"] = params.get("use_label_encoder", False)
if "n_jobs" in config:
if "n_jobs" in params:
params["nthread"] = params.pop("n_jobs")
return params
@@ -1891,7 +1896,7 @@ class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
params = super().config2params(config)
if "max_leaves" in params:
params["max_leaf_nodes"] = params.get("max_leaf_nodes", params.pop("max_leaves"))
if not self._task.is_classification() and "criterion" in config:
if not self._task.is_classification() and "criterion" in params:
params.pop("criterion")
if "random_state" not in params:
params["random_state"] = 12032022
@@ -2344,7 +2349,7 @@ class SGDEstimator(SKLearnEstimator):
params["loss"] = params.get("loss", None)
if params["loss"] is None and self._task.is_classification():
params["loss"] = "log_loss" if SKLEARN_VERSION >= "1.1" else "log"
if not self._task.is_classification():
if not self._task.is_classification() and "n_jobs" in params:
params.pop("n_jobs")
if params.get("penalty") != "elasticnet":
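The `limit_resource` change above guards `signal.alarm` with a main-thread check. A minimal sketch of that pattern (Unix-only; `try_set_alarm` and `on_timeout` are illustrative names, not FLAML's API):

```python
import signal
import threading

def on_timeout(signum, frame):
    raise TimeoutError("time limit exceeded")

def try_set_alarm(time_limit):
    # SIGALRM works only in the main thread of the main interpreter,
    # hence the guard added above; worker threads skip the time limit.
    if time_limit is not None and threading.current_thread() is threading.main_thread():
        signal.signal(signal.SIGALRM, on_timeout)
        signal.alarm(int(time_limit) or 1)
        return True
    return False

alarm_set = try_set_alarm(5)
try:
    pass  # the time-limited work would run here
finally:
    if alarm_set:
        signal.alarm(0)  # always clear the pending alarm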

View File

@@ -769,10 +769,10 @@ class GenericTask(Task):
if not is_spark_dataframe:
y_train, y_val = y_train_split[train_index], y_train_split[val_index]
if weight is not None:
fit_kwargs["sample_weight"], weight_val = (
weight[train_index],
weight[val_index],
fit_kwargs["sample_weight"] = (
weight[train_index] if isinstance(weight, np.ndarray) else weight.iloc[train_index]
)
weight_val = weight[val_index] if isinstance(weight, np.ndarray) else weight.iloc[val_index]
if groups is not None:
fit_kwargs["groups"] = (
groups[train_index] if isinstance(groups, np.ndarray) else groups.iloc[train_index]
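The `isinstance` branches added above exist because positional indexing differs between numpy arrays and pandas Series; a minimal illustration:

```python
import numpy as np
import pandas as pd

train_index = np.array([3, 1])
weight_np = np.array([0.1, 0.2, 0.3, 0.4])
weight_pd = pd.Series([0.1, 0.2, 0.3, 0.4], index=["a", "b", "c", "d"])

print(weight_np[train_index])       # [0.4 0.2] -- plain positional indexing
print(weight_pd.iloc[train_index])  # .iloc is required: weight_pd[train_index]
                                    # would raise KeyError with this label index
```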

View File

@@ -1,10 +1,14 @@
import atexit
import functools
import json
import logging
import os
import pickle
import random
import sys
import tempfile
import time
import warnings
from concurrent.futures import ThreadPoolExecutor, wait
from typing import MutableMapping
import mlflow
@@ -12,14 +16,15 @@ import pandas as pd
from mlflow.entities import Metric, Param, RunTag
from mlflow.exceptions import MlflowException
from mlflow.utils.autologging_utils import AUTOLOGGING_INTEGRATIONS, autologging_is_disabled
from packaging.requirements import Requirement
from scipy.sparse import issparse
from sklearn import tree
try:
from pyspark.ml import Pipeline as SparkPipeline
from pyspark.ml import PipelineModel as SparkPipelineModel
except ImportError:
class SparkPipeline:
class SparkPipelineModel:
pass
@@ -32,6 +37,84 @@ from flaml.version import __version__
SEARCH_MAX_RESULTS = 5000 # Each train should not have more than 5000 trials
IS_RENAME_CHILD_RUN = os.environ.get("FLAML_IS_RENAME_CHILD_RUN", "false").lower() == "true"
REMOVE_REQUIREMENT_LIST = [
"synapseml-cognitive",
"synapseml-core",
"synapseml-deep-learning",
"synapseml-internal",
"synapseml-mlflow",
"synapseml-opencv",
"synapseml-vw",
"synapseml-lightgbm",
"synapseml-utils",
"nni",
"optuna",
]
OPTIONAL_REMOVE_REQUIREMENT_LIST = ["pytorch-lightning", "transformers"]
os.environ["MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR"] = os.environ.get("MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR", "false")
MLFLOW_NUM_WORKERS = int(os.environ.get("FLAML_MLFLOW_NUM_WORKERS", os.cpu_count() * 4 if os.cpu_count() else 2))
executor = ThreadPoolExecutor(max_workers=MLFLOW_NUM_WORKERS)
atexit.register(lambda: executor.shutdown(wait=True))
IS_CLEAN_LOGS = os.environ.get("FLAML_IS_CLEAN_LOGS", "1")
if IS_CLEAN_LOGS == "1":
logging.getLogger("synapse.ml").setLevel(logging.CRITICAL)
logging.getLogger("mlflow.utils").setLevel(logging.CRITICAL)
logging.getLogger("mlflow.utils.environment").setLevel(logging.CRITICAL)
logging.getLogger("mlflow.models.model").setLevel(logging.CRITICAL)
warnings.simplefilter("ignore", category=FutureWarning)
warnings.simplefilter("ignore", category=UserWarning)
def convert_requirement(requirement_list: list[str]):
ret = (
[Requirement(s.strip().lower()) for s in requirement_list]
if mlflow.__version__ <= "2.17.0"
else requirement_list
)
return ret
def time_it(func_or_code=None):
"""
Decorator or function that measures execution time.
Can be used in three ways:
1. As a decorator with no arguments: @time_it
2. As a decorator with arguments: @time_it()
3. As a function call with a string of code to execute and time: time_it("some_code()")
Args:
func_or_code (callable or str, optional): Either a function to decorate or
a string of code to execute and time.
Returns:
callable or None: Returns a decorated function if used as a decorator,
or None if used to execute a string of code.
"""
def decorator(func):
@functools.wraps(func)
def wrapper(*args, **kwargs):
start_time = time.time()
result = func(*args, **kwargs)
end_time = time.time()
logger.debug(f"Execution of {func.__name__} took {end_time - start_time:.4f} seconds")
return result
return wrapper
if callable(func_or_code):
return decorator(func_or_code)
elif func_or_code is None:
return decorator
else:
start_time = time.time()
exec(func_or_code)
end_time = time.time()
logger.debug(f"Execution\n```\n{func_or_code}\n```\ntook {end_time - start_time:.4f} seconds")
def flatten_dict(d: MutableMapping, sep: str = ".") -> MutableMapping:
@@ -49,23 +132,28 @@ def is_autolog_enabled():
return not all(autologging_is_disabled(k) for k in AUTOLOGGING_INTEGRATIONS.keys())
def get_mlflow_log_latency(model_history=False):
def get_mlflow_log_latency(model_history=False, delete_run=True):
try:
FLAML_MLFLOW_LOG_LATENCY = float(os.getenv("FLAML_MLFLOW_LOG_LATENCY", 0))
except ValueError:
FLAML_MLFLOW_LOG_LATENCY = 0
if FLAML_MLFLOW_LOG_LATENCY >= 0.1:
return FLAML_MLFLOW_LOG_LATENCY
st = time.time()
with mlflow.start_run(nested=True, run_name="get_mlflow_log_latency") as run:
if model_history:
sk_model = tree.DecisionTreeClassifier()
mlflow.sklearn.log_model(sk_model, "sk_models")
mlflow.sklearn.log_model(Pipeline([("estimator", sk_model)]), "sk_pipeline")
mlflow.sklearn.log_model(sk_model, "model")
with tempfile.TemporaryDirectory() as tmpdir:
pickle_fpath = os.path.join(tmpdir, f"tmp_{int(time.time()*1000)}")
pickle_fpath = os.path.join(tmpdir, f"tmp_{int(time.time() * 1000)}")
with open(pickle_fpath, "wb") as f:
pickle.dump(sk_model, f)
mlflow.log_artifact(pickle_fpath, "sk_model1")
mlflow.log_artifact(pickle_fpath, "sk_model2")
mlflow.log_artifact(pickle_fpath, "sk_model")
mlflow.set_tag("synapseml.ui.visible", "false") # not shown inline in fabric
mlflow.delete_run(run.info.run_id)
if delete_run:
mlflow.delete_run(run.info.run_id)
et = time.time()
return et - st
return 3 * (et - st)
def infer_signature(X_train=None, y_train=None, dataframe=None, label=None):
@@ -98,12 +186,76 @@ def infer_signature(X_train=None, y_train=None, dataframe=None, label=None):
)
def update_and_install_requirements(
run_id=None,
model_name=None,
model_version=None,
remove_list=None,
artifact_path="model",
dst_path=None,
install_with_ipython=False,
):
if not (run_id or (model_name and model_version)):
raise ValueError(
"Please provide `run_id` or both `model_name` and `model_version`. If all three are provided, `run_id` will be used."
)
if install_with_ipython:
from IPython import get_ipython
if not remove_list:
remove_list = [
"synapseml-cognitive",
"synapseml-core",
"synapseml-deep-learning",
"synapseml-internal",
"synapseml-mlflow",
"synapseml-opencv",
"synapseml-vw",
"synapseml-lightgbm",
"synapseml-utils",
"flaml", # flaml is needed for AutoML models, should be pre-installed in the runtime
"pyspark", # fabric internal pyspark should be pre-installed in the runtime
]
# Download model artifacts
client = mlflow.MlflowClient()
if not run_id:
run_id = client.get_model_version(model_name, model_version).run_id
if not dst_path:
dst_path = os.path.join(tempfile.gettempdir(), "model_artifacts")
os.makedirs(dst_path, exist_ok=True)
client.download_artifacts(run_id, artifact_path, dst_path)
requirements_path = os.path.join(dst_path, artifact_path, "requirements.txt")
with open(requirements_path) as f:
reqs = f.read().splitlines()
old_reqs = [Requirement(req) for req in reqs if req]
old_reqs_dict = {req.name: str(req) for req in old_reqs}
for req in remove_list:
req = Requirement(req)
if req.name in old_reqs_dict:
old_reqs_dict.pop(req.name, None)
new_reqs_list = list(old_reqs_dict.values())
with open(requirements_path, "w") as f:
f.write("\n".join(new_reqs_list))
if install_with_ipython:
get_ipython().run_line_magic("pip", f"install -r {requirements_path} -q")
else:
logger.info(f"You can run `pip install -r {requirements_path}` to install dependencies.")
return requirements_path
def _mlflow_wrapper(evaluation_func, mlflow_exp_id, mlflow_config=None, extra_tags=None, autolog=False):
def wrapped(*args, **kwargs):
if mlflow_config is not None:
from synapse.ml.mlflow import set_mlflow_env_config
try:
from synapse.ml.mlflow import set_mlflow_env_config
set_mlflow_env_config(mlflow_config)
set_mlflow_env_config(mlflow_config)
except Exception:
pass
import mlflow
if mlflow_exp_id is not None:
@@ -124,7 +276,20 @@ def _mlflow_wrapper(evaluation_func, mlflow_exp_id, mlflow_config=None, extra_ta
def _get_notebook_name():
return None
try:
import re
from synapse.ml.mlflow import get_mlflow_env_config
from synapse.ml.mlflow.shared_platform_utils import get_artifact
notebook_id = get_mlflow_env_config(False).artifact_id
current_notebook = get_artifact(notebook_id)
notebook_name = re.sub("\\W+", "-", current_notebook.displayName).strip()
return notebook_name
except Exception as e:
logger.debug(f"Failed to get notebook name: {e}")
return None
def safe_json_dumps(obj):
@@ -163,6 +328,8 @@ class MLflowIntegration:
self.has_model = False
self.only_history = False
self._do_log_model = True
self.futures = {}
self.futures_log_model = {}
self.extra_tag = (
extra_tag
@@ -170,6 +337,9 @@ class MLflowIntegration:
else {"extra_tag.sid": f"flaml_{__version__}_{int(time.time())}_{random.randint(1001, 9999)}"}
)
self.start_time = time.time()
self.experiment_type = experiment_type
self.update_autolog_state()
self.mlflow_client = mlflow.tracking.MlflowClient()
parent_run_info = mlflow.active_run().info if mlflow.active_run() is not None else None
if parent_run_info:
@@ -188,8 +358,6 @@ class MLflowIntegration:
mlflow.set_experiment(experiment_name=mlflow_exp_name)
self.experiment_id = mlflow.tracking.fluent._active_experiment_id
self.experiment_name = mlflow.get_experiment(self.experiment_id).name
self.experiment_type = experiment_type
self.update_autolog_state()
if self.autolog:
# only end user created parent run in autolog scenario
@@ -197,9 +365,12 @@ class MLflowIntegration:
def set_mlflow_config(self):
if self.driver_mlflow_env_config is not None:
from synapse.ml.mlflow import set_mlflow_env_config
try:
from synapse.ml.mlflow import set_mlflow_env_config
set_mlflow_env_config(self.driver_mlflow_env_config)
set_mlflow_env_config(self.driver_mlflow_env_config)
except Exception:
pass
def wrap_evaluation_function(self, evaluation_function):
wrapped_evaluation_function = _mlflow_wrapper(
@@ -267,6 +438,7 @@ class MLflowIntegration:
else:
_tags = []
self.mlflow_client.log_batch(run_id=target_id, metrics=_metrics, params=[], tags=_tags)
return f"Successfully copy_mlflow_run run_id {src_id} to run_id {target_id}"
def record_trial(self, result, trial, metric):
if isinstance(result, dict):
@@ -334,12 +506,31 @@ class MLflowIntegration:
self.copy_mlflow_run(best_mlflow_run_id, self.parent_run_id)
self.has_summary = True
def log_model(self, model, estimator, signature=None):
def log_model(self, model, estimator, signature=None, run_id=None):
if not self._do_log_model:
return
logger.debug(f"logging model {estimator}")
ret_message = f"Successfully log_model {estimator} to run_id {run_id}"
optional_remove_list = (
[] if estimator in ["transformer", "transformer_ms", "tcn", "tft"] else OPTIONAL_REMOVE_REQUIREMENT_LIST
)
run = mlflow.active_run()
if run and run.info.run_id == self.parent_run_id:
logger.debug(
f"Current active run_id {run.info.run_id} == parent_run_id {self.parent_run_id}, Starting run_id {run_id}"
)
mlflow.start_run(run_id=run_id, nested=True)
elif run and run.info.run_id != run_id:
ret_message = (
f"Error: Should log_model {estimator} to run_id {run_id}, but logged to run_id {run.info.run_id}"
)
logger.error(ret_message)
else:
logger.debug(f"No active run, start run_id {run_id}")
mlflow.start_run(run_id=run_id)
logger.debug(f"logged model {estimator} to run_id {mlflow.active_run().info.run_id}")
if estimator.endswith("_spark"):
mlflow.spark.log_model(model, estimator, signature=signature)
# mlflow.spark.log_model(model, estimator, signature=signature)
mlflow.spark.log_model(model, "model", signature=signature)
elif estimator in ["lgbm"]:
mlflow.lightgbm.log_model(model, estimator, signature=signature)
@@ -352,42 +543,93 @@ class MLflowIntegration:
elif estimator in ["prophet"]:
mlflow.prophet.log_model(model, estimator, signature=signature)
elif estimator in ["orbit"]:
pass
logger.warning(f"Unsupported model: {estimator}. No model logged.")
else:
mlflow.sklearn.log_model(model, estimator, signature=signature)
future = executor.submit(
lambda: mlflow.models.model.update_model_requirements(
model_uri=f"runs:/{run_id}/{'model' if estimator.endswith('_spark') else estimator}",
operation="remove",
requirement_list=convert_requirement(REMOVE_REQUIREMENT_LIST + optional_remove_list),
)
)
self.futures[future] = f"run_{run_id}_requirements_updated"
if not run or run.info.run_id == self.parent_run_id:
logger.debug(f"Ending current run_id {mlflow.active_run().info.run_id}")
mlflow.end_run()
return ret_message
def _pickle_and_log_artifact(self, obj, artifact_name, pickle_fname="temp_.pkl"):
def _pickle_and_log_artifact(self, obj, artifact_name, pickle_fname="temp_.pkl", run_id=None):
if not self._do_log_model:
return
return True
with tempfile.TemporaryDirectory() as tmpdir:
pickle_fpath = os.path.join(tmpdir, pickle_fname)
try:
with open(pickle_fpath, "wb") as f:
pickle.dump(obj, f)
mlflow.log_artifact(pickle_fpath, artifact_name)
mlflow.log_artifact(pickle_fpath, artifact_name, run_id)
return True
except Exception as e:
logger.debug(f"Failed to pickle and log artifact {artifact_name}, error: {e}")
logger.debug(f"Failed to pickle and log {artifact_name}, error: {e}")
return False
def pickle_and_log_automl_artifacts(self, automl, model, estimator, signature=None):
def _log_pipeline(self, pipeline, flavor_name, pipeline_name, signature, run_id, estimator=None):
logger.debug(f"logging pipeline {flavor_name}:{pipeline_name}:{estimator}")
ret_message = f"Successfully _log_pipeline {flavor_name}:{pipeline_name}:{estimator} to run_id {run_id}"
optional_remove_list = (
[] if estimator in ["transformer", "transformer_ms", "tcn", "tft"] else OPTIONAL_REMOVE_REQUIREMENT_LIST
)
run = mlflow.active_run()
if run and run.info.run_id == self.parent_run_id:
logger.debug(
f"Current active run_id {run.info.run_id} == parent_run_id {self.parent_run_id}, Starting run_id {run_id}"
)
mlflow.start_run(run_id=run_id, nested=True)
elif run and run.info.run_id != run_id:
ret_message = f"Error: Should _log_pipeline {flavor_name}:{pipeline_name}:{estimator} model to run_id {run_id}, but logged to run_id {run.info.run_id}"
logger.error(ret_message)
else:
logger.debug(f"No active run, start run_id {run_id}")
mlflow.start_run(run_id=run_id)
logger.debug(
f"logging pipeline {flavor_name}:{pipeline_name}:{estimator} to run_id {mlflow.active_run().info.run_id}"
)
if flavor_name == "sklearn":
mlflow.sklearn.log_model(pipeline, pipeline_name, signature=signature)
elif flavor_name == "spark":
mlflow.spark.log_model(pipeline, pipeline_name, signature=signature)
else:
logger.warning(f"Unsupported pipeline flavor: {flavor_name}. No model logged.")
future = executor.submit(
lambda: mlflow.models.model.update_model_requirements(
model_uri=f"runs:/{run_id}/{pipeline_name}",
operation="remove",
requirement_list=convert_requirement(REMOVE_REQUIREMENT_LIST + optional_remove_list),
)
)
self.futures[future] = f"run_{run_id}_requirements_updated"
if not run or run.info.run_id == self.parent_run_id:
logger.debug(f"Ending current run_id {mlflow.active_run().info.run_id}")
mlflow.end_run()
return ret_message
def pickle_and_log_automl_artifacts(self, automl, model, estimator, signature=None, run_id=None):
"""log automl artifacts to mlflow
load back with `automl = mlflow.pyfunc.load_model(model_run_id_or_uri)`, then do prediction with `automl.predict(X)`
"""
logger.debug(f"logging automl artifacts {estimator}")
self._pickle_and_log_artifact(automl.feature_transformer, "feature_transformer", "feature_transformer.pkl")
self._pickle_and_log_artifact(automl.label_transformer, "label_transformer", "label_transformer.pkl")
# Test test_mlflow 1 and 4 will get error: TypeError: cannot pickle '_io.TextIOWrapper' object
# try:
# self._pickle_and_log_artifact(automl, "automl", "automl.pkl")
# except TypeError:
# pass
logger.debug(f"logging automl estimator {estimator}")
# self._pickle_and_log_artifact(
# automl.feature_transformer, "feature_transformer", "feature_transformer.pkl", run_id
# )
# self._pickle_and_log_artifact(automl.label_transformer, "label_transformer", "label_transformer.pkl", run_id)
if estimator.endswith("_spark"):
# spark pipeline is not supported yet
return
feature_transformer = automl.feature_transformer
if isinstance(feature_transformer, Pipeline):
if isinstance(feature_transformer, Pipeline) and not estimator.endswith("_spark"):
pipeline = feature_transformer
pipeline.steps.append(("estimator", model))
elif isinstance(feature_transformer, SparkPipeline):
elif isinstance(feature_transformer, SparkPipelineModel) and estimator.endswith("_spark"):
pipeline = feature_transformer
pipeline.stages.append(model)
elif not estimator.endswith("_spark"):
@@ -395,24 +637,26 @@ class MLflowIntegration:
steps.append(("estimator", model))
pipeline = Pipeline(steps)
else:
stages = [feature_transformer]
stages = []
if feature_transformer is not None:
stages.append(feature_transformer)
stages.append(model)
pipeline = SparkPipeline(stages=stages)
if isinstance(pipeline, SparkPipeline):
pipeline = SparkPipelineModel(stages=stages)
if isinstance(pipeline, SparkPipelineModel):
logger.debug(f"logging spark pipeline {estimator}")
mlflow.spark.log_model(pipeline, "automl_pipeline", signature=signature)
self._log_pipeline(pipeline, "spark", "model", signature, run_id, estimator)
else:
# Add a log named "model" to fit default settings
logger.debug(f"logging sklearn pipeline {estimator}")
mlflow.sklearn.log_model(pipeline, "automl_pipeline", signature=signature)
mlflow.sklearn.log_model(pipeline, "model", signature=signature)
self._log_pipeline(pipeline, "sklearn", "model", signature, run_id, estimator)
return f"Successfully pickle_and_log_automl_artifacts {estimator} to run_id {run_id}"
@time_it
def record_state(self, automl, search_state, estimator):
_st = time.time()
automl_metric_name = (
automl._state.metric if isinstance(automl._state.metric, str) else automl._state.error_metric
)
if automl._state.error_metric.startswith("1-"):
automl_metric_value = 1 - search_state.val_loss
elif automl._state.error_metric.startswith("-"):
@@ -425,6 +669,8 @@ class MLflowIntegration:
else:
config = search_state.config
self.automl_user_configurations = safe_json_dumps(automl._automl_user_configurations)
info = {
"metrics": {
"iter_counter": automl._track_iter,
@@ -445,7 +691,7 @@ class MLflowIntegration:
"flaml.meric": automl_metric_name,
"flaml.run_source": "flaml-automl",
"flaml.log_type": self.log_type,
"flaml.automl_user_configurations": safe_json_dumps(automl._automl_user_configurations),
"flaml.automl_user_configurations": self.automl_user_configurations,
},
"params": {
"sample_size": search_state.sample_size,
@@ -472,33 +718,70 @@ class MLflowIntegration:
run_name = f"{self.parent_run_name}_child_{self.child_counter}"
else:
run_name = None
_t1 = time.time()
wait(self.futures_log_model)
_t2 = time.time() - _t1
logger.debug(f"wait futures_log_model in record_state took {_t2} seconds")
with mlflow.start_run(nested=True, run_name=run_name) as child_run:
self._log_info_to_run(info, child_run.info.run_id, log_params=True)
future = executor.submit(lambda: self._log_info_to_run(info, child_run.info.run_id, log_params=True))
self.futures[future] = f"iter_{automl._track_iter}_log_info_to_run"
future = executor.submit(lambda: self._log_automl_configurations(child_run.info.run_id))
self.futures[future] = f"iter_{automl._track_iter}_log_automl_configurations"
if automl._state.model_history:
self.log_model(
search_state.trained_estimator._model, estimator, signature=automl.estimator_signature
)
self.pickle_and_log_automl_artifacts(
automl, search_state.trained_estimator, estimator, signature=automl.pipeline_signature
)
if estimator.endswith("_spark"):
future = executor.submit(
lambda: self.log_model(
search_state.trained_estimator._model,
estimator,
automl.estimator_signature,
child_run.info.run_id,
)
)
self.futures_log_model[future] = f"record_state-log_model_{estimator}"
else:
future = executor.submit(
lambda: self.pickle_and_log_automl_artifacts(
automl,
search_state.trained_estimator,
estimator,
automl.pipeline_signature,
child_run.info.run_id,
)
)
self.futures_log_model[future] = f"record_state-pickle_and_log_automl_artifacts_{estimator}"
self.manual_run_ids.append(child_run.info.run_id)
self.child_counter += 1
return f"Successfully record_state iteration {automl._track_iter}"
@time_it
def log_automl(self, automl):
self.set_best_iter(automl)
if self.autolog:
if self.parent_run_id is not None:
mlflow.start_run(run_id=self.parent_run_id, experiment_id=self.experiment_id)
mlflow.log_metric("best_validation_loss", automl._state.best_loss)
mlflow.log_metric("best_iteration", automl._best_iteration)
mlflow.log_metric("num_child_runs", len(self.infos))
if automl._trained_estimator is not None and not self.has_model:
self.log_model(
automl._trained_estimator._model, automl.best_estimator, signature=automl.estimator_signature
)
self.pickle_and_log_automl_artifacts(
automl, automl.model, automl.best_estimator, signature=automl.pipeline_signature
)
mlflow.log_metrics(
{
"best_validation_loss": automl._state.best_loss,
"best_iteration": automl._best_iteration,
"num_child_runs": len(self.infos),
}
)
if (
automl._trained_estimator is not None
and not self.has_model
and automl._trained_estimator._model is not None
):
if automl.best_estimator.endswith("_spark"):
self.log_model(
automl._trained_estimator._model,
automl.best_estimator,
automl.estimator_signature,
self.parent_run_id,
)
else:
self.pickle_and_log_automl_artifacts(
automl, automl.model, automl.best_estimator, automl.pipeline_signature, self.parent_run_id
)
self.has_model = True
self.adopt_children(automl)
@@ -515,30 +798,65 @@ class MLflowIntegration:
if "ml" in conf.keys():
conf = conf["ml"]
mlflow.log_params(conf)
mlflow.log_param("best_learner", automl._best_estimator)
mlflow.log_params({**conf, "best_learner": automl._best_estimator}, run_id=self.parent_run_id)
if not self.has_summary:
logger.info(f"logging best model {automl.best_estimator}")
self.copy_mlflow_run(best_mlflow_run_id, self.parent_run_id)
future = executor.submit(lambda: self.copy_mlflow_run(best_mlflow_run_id, self.parent_run_id))
self.futures[future] = "log_automl_copy_mlflow_run"
future = executor.submit(lambda: self._log_automl_configurations(self.parent_run_id))
self.futures[future] = "log_automl_log_automl_configurations"
self.has_summary = True
if automl._trained_estimator is not None and not self.has_model:
self.log_model(
automl._trained_estimator._model,
automl.best_estimator,
signature=automl.estimator_signature,
)
self.pickle_and_log_automl_artifacts(
automl, automl.model, automl.best_estimator, signature=automl.pipeline_signature
)
_t1 = time.time()
wait(self.futures_log_model)
_t2 = time.time() - _t1
logger.debug(f"wait futures_log_model in log_automl took {_t2} seconds")
if (
automl._trained_estimator is not None
and not self.has_model
and automl._trained_estimator._model is not None
):
if automl.best_estimator.endswith("_spark"):
future = executor.submit(
lambda: self.log_model(
automl._trained_estimator._model,
automl.best_estimator,
signature=automl.estimator_signature,
run_id=self.parent_run_id,
)
)
self.futures_log_model[future] = f"log_automl-log_model_{automl.best_estimator}"
else:
future = executor.submit(
lambda: self.pickle_and_log_automl_artifacts(
automl,
automl.model,
automl.best_estimator,
signature=automl.pipeline_signature,
run_id=self.parent_run_id,
)
)
self.futures_log_model[
future
] = f"log_automl-pickle_and_log_automl_artifacts_{automl.best_estimator}"
self.has_model = True
def resume_mlflow(self):
if len(self.resume_params) > 0:
mlflow.autolog(**self.resume_params)
def _log_automl_configurations(self, run_id):
self.mlflow_client.log_text(
run_id=run_id,
text=self.automl_user_configurations,
artifact_file="automl_configurations/automl_user_configurations.json",
)
return f"Successfully _log_automl_configurations to run_id {run_id}"
def _log_info_to_run(self, info, run_id, log_params=False):
_metrics = [Metric(key, value, int(time.time() * 1000), 0) for key, value in info["metrics"].items()]
_tags = [RunTag(key, str(value)) for key, value in info["tags"].items()]
_tags = [
RunTag(key, str(value)[:5000]) for key, value in info["tags"].items()
] # AML raises an error if a tag value exceeds 5000 characters
_params = [
Param(key, str(value))
for key, value in info["params"].items()
@@ -554,6 +872,7 @@ class MLflowIntegration:
_tags = [RunTag("mlflow.parentRunId", run_id)]
self.mlflow_client.log_batch(run_id=run.info.run_id, metrics=_metrics, params=[], tags=_tags)
del info["submetrics"]["values"]
return f"Successfully _log_info_to_run to run_id {run_id}"
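`_log_info_to_run` builds `Metric`, `Param`, and `RunTag` entities and ships them in a single `MlflowClient.log_batch` call, truncating tag values to stay under AML's 5000-character cap. A hedged, standalone sketch of that batched call (the metric, param, and tag values are illustrative):

```python
import time

import mlflow
from mlflow.entities import Metric, Param, RunTag
from mlflow.tracking import MlflowClient

client = MlflowClient()
with mlflow.start_run() as run:
    ts = int(time.time() * 1000)
    client.log_batch(
        run_id=run.info.run_id,
        metrics=[Metric("val_loss", 0.12, ts, 0)],  # (key, value, timestamp_ms, step)
        params=[Param("learner", "lgbm")],
        tags=[RunTag("config", str({"n_estimators": 100})[:5000])],  # truncated for AML
    )
```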
def adopt_children(self, result=None):
"""

flaml/tune/logger.py (new file)

@@ -0,0 +1,37 @@
import logging
import os
class ColoredFormatter(logging.Formatter):
# ANSI escape codes for colors
COLORS = {
# logging.DEBUG: "\033[36m", # Cyan
# logging.INFO: "\033[32m", # Green
logging.WARNING: "\033[33m", # Yellow
logging.ERROR: "\033[31m", # Red
logging.CRITICAL: "\033[1;31m", # Bright Red
}
RESET = "\033[0m" # Reset to default
def __init__(self, fmt, datefmt, use_color=True):
super().__init__(fmt, datefmt)
self.use_color = use_color
def format(self, record):
formatted = super().format(record)
if self.use_color:
color = self.COLORS.get(record.levelno, "")
if color:
return f"{color}{formatted}{self.RESET}"
return formatted
logger = logging.getLogger(__name__)
use_color = True
if os.getenv("FLAML_LOG_NO_COLOR"):
use_color = False
logger_formatter = ColoredFormatter(
"[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s", "%m-%d %H:%M:%S", use_color
)
logger.propagate = False
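A short usage sketch for the new module: wiring `logger_formatter` into a stream handler colors WARNING and above unless `FLAML_LOG_NO_COLOR` is set. The handler setup below is illustrative; the module itself only defines the logger and formatter:

```python
import logging
import sys

from flaml.tune.logger import logger, logger_formatter

_ch = logging.StreamHandler(stream=sys.stdout)
_ch.setFormatter(logger_formatter)  # the ColoredFormatter configured above
logger.addHandler(_ch)
logger.setLevel(logging.INFO)
logger.warning("rendered in yellow on a color terminal")
logger.error("rendered in red")
```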


@@ -162,6 +162,10 @@ def broadcast_code(custom_code="", file_name="mylearner"):
assert isinstance(MyLargeLGBM(), LGBMEstimator)
```
"""
# Check if Spark is available
spark_available, _ = check_spark()
# Write to local driver file system
flaml_path = os.path.dirname(os.path.abspath(__file__))
custom_code = textwrap.dedent(custom_code)
custom_path = os.path.join(flaml_path, file_name + ".py")
@@ -169,6 +173,24 @@ def broadcast_code(custom_code="", file_name="mylearner"):
with open(custom_path, "w") as f:
f.write(custom_code)
# If using Spark, broadcast the code content to executors
if spark_available:
spark = SparkSession.builder.getOrCreate()
bc_code = spark.sparkContext.broadcast(custom_code)
# Execute a job to ensure the code is distributed to all executors
def _write_code(bc):
code = bc.value
import os
module_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), file_name + ".py")
os.makedirs(os.path.dirname(module_path), exist_ok=True)
with open(module_path, "w") as f:
f.write(code)
return True
spark.sparkContext.parallelize(range(1)).map(lambda _: _write_code(bc_code)).collect()
return custom_path
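A hedged usage sketch for `broadcast_code`: pass the custom learner's source as text, and the helper writes it next to the flaml package on the driver and, when Spark is available, on every executor. The learner definition below is illustrative, adapted from the docstring's example, and assumes the helper's usual import path:

```python
from flaml.tune.spark.utils import broadcast_code

custom_code = """
from flaml.automl.model import LGBMEstimator

class MyLargeLGBM(LGBMEstimator):
    '''An illustrative custom learner.'''
"""

path = broadcast_code(custom_code=custom_code, file_name="mylearner")
print(f"custom learner module written to {path}")
```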


@@ -21,11 +21,11 @@ except (ImportError, AssertionError):
from .analysis import ExperimentAnalysis as EA
else:
ray_available = True
import logging
from flaml.tune.spark.utils import PySparkOvertimeMonitor, check_spark
from .logger import logger, logger_formatter
from .result import DEFAULT_METRIC
from .trial import Trial
@@ -41,8 +41,6 @@ except ImportError:
internal_mlflow = False
logger = logging.getLogger(__name__)
logger.propagate = False
_use_ray = True
_runner = None
_verbose = 0
@@ -197,9 +195,16 @@ def report(_metric=None, **kwargs):
global _training_iteration
if _use_ray:
try:
from ray import tune
from ray import __version__ as ray_version
return tune.report(_metric, **kwargs)
if ray_version.startswith("1."):
from ray import tune
return tune.report(_metric, **kwargs)
else: # ray>=2
from ray.air import session
return session.report(metrics={"metric": _metric, **kwargs})
except ImportError:
# calling tune.report() outside tune.run()
return
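The branch above keeps `report` working across Ray generations: Ray 1.x exposes `tune.report`, while Ray >= 2 moved reporting to `ray.air.session`. From the caller's side nothing changes; a minimal sketch of a trainable reporting through `flaml.tune`, run locally for simplicity:

```python
from flaml import tune

def train(config):
    loss = (config["x"] - 3) ** 2
    tune.report(loss=loss)  # forwarded to the right Ray API, or handled locally

analysis = tune.run(
    train,
    config={"x": tune.uniform(0, 10)},
    num_samples=4,
    metric="loss",
    mode="min",
    use_ray=False,  # exercise the non-Ray code path
)
print(analysis.best_config)
```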
@@ -514,10 +519,6 @@ def run(
elif not logger.hasHandlers():
# Add the console handler.
_ch = logging.StreamHandler(stream=sys.stdout)
logger_formatter = logging.Formatter(
"[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s",
"%m-%d %H:%M:%S",
)
_ch.setFormatter(logger_formatter)
logger.addHandler(_ch)
if verbose <= 2:
@@ -745,10 +746,16 @@ def run(
max_concurrent = max(1, search_alg.max_concurrent)
else:
max_concurrent = max(1, max_spark_parallelism)
passed_in_n_concurrent_trials = max(n_concurrent_trials, max_concurrent)
n_concurrent_trials = min(
n_concurrent_trials if n_concurrent_trials > 0 else num_executors,
max_concurrent,
)
if n_concurrent_trials < passed_in_n_concurrent_trials:
logger.warning(
f"The actual number of concurrent trials is {n_concurrent_trials}. You can set the environment "
f"variable `FLAML_MAX_CONCURRENT` to '{passed_in_n_concurrent_trials}' to override the detected number of executors."
)
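As the warning suggests, the detected concurrency can be overridden before tuning starts; the Spark tests in this changeset pin it the same way:

```python
import os

# Set before tune.run / AutoML.fit; the Spark trial runner reads this cap.
os.environ["FLAML_MAX_CONCURRENT"] = "2"
```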
with parallel_backend("spark"):
with Parallel(n_jobs=n_concurrent_trials, verbose=max(0, (verbose - 1) * 50)) as parallel:
try:
@@ -770,7 +777,7 @@ def run(
and num_failures < upperbound_num_failures
):
if automl_info and automl_info[0] > 0 and time_budget_s < np.inf:
time_budget_s -= automl_info[0]
time_budget_s -= automl_info[0] * n_concurrent_trials
logger.debug(f"Remaining time budget with mlflow log latency: {time_budget_s} seconds.")
while len(_runner.running_trials) < n_concurrent_trials:
# suggest trials for spark
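The budget adjustment above now multiplies the measured mlflow log latency by the trial concurrency, since each of the `n_concurrent_trials` in-flight trials pays that latency within one scheduling round. A worked example with illustrative numbers:

```python
# 0.5 s average mlflow log latency and 4 concurrent trials cost ~2 s
# of wall-clock budget per scheduling round.
log_latency_s, n_concurrent_trials = 0.5, 4
time_budget_s = 60.0
time_budget_s -= log_latency_s * n_concurrent_trials
print(time_budget_s)  # 58.0
```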


@@ -1 +1 @@
__version__ = "2.3.3"
__version__ = "2.3.5"

pytest.ini (new file)

@@ -0,0 +1,3 @@
[pytest]
markers =
spark: mark a test as requiring Spark
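With the `spark` marker registered, Spark-dependent tests can be selected or excluded without unknown-marker warnings, e.g. through pytest's programmatic entry point:

```python
import pytest

# Run only Spark-marked tests; use "not spark" to skip them instead.
pytest.main(["-m", "spark"])
```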


@@ -73,7 +73,7 @@ setuptools.setup(
"psutil==5.8.0",
"dataclasses",
"transformers[torch]==4.26",
"datasets",
"datasets<=3.5.0",
"nltk<=3.8.1", # 3.8.2 doesn't work with mlflow
"rouge_score",
"hcrystalball==0.1.10",
@@ -170,10 +170,9 @@ setuptools.setup(
"Operating System :: OS Independent",
# Specify the Python versions you support here.
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
],
python_requires=">=3.8",
python_requires=">=3.9",
)
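With the floor moved from 3.8 to 3.9, downstream code that needs to fail fast on old interpreters might add a guard like this (the message is illustrative):

```python
import sys

if sys.version_info < (3, 9):
    raise RuntimeError("flaml 2.3.5+ requires Python 3.9 or newer")
```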


@@ -17,6 +17,8 @@ from flaml import AutoML
from flaml.automl.ml import sklearn_metric_loss_score
from flaml.tune.spark.utils import check_spark
pytestmark = pytest.mark.spark
leaderboard = defaultdict(dict)
warnings.simplefilter(action="ignore")


@@ -477,7 +477,10 @@ def test_forecast_classification(budget=5):
def get_stalliion_data():
from pytorch_forecasting.data.examples import get_stallion_data
data = get_stallion_data()
# data = get_stallion_data()
data = pd.read_parquet(
"https://raw.githubusercontent.com/sktime/pytorch-forecasting/refs/heads/main/examples/data/stallion.parquet"
)
# add time index; for datasets with no missing values, FLAML automates this step
data["time_idx"] = data["date"].dt.year * 12 + data["date"].dt.month
data["time_idx"] -= data["time_idx"].min()


@@ -0,0 +1,51 @@
import mlflow
import numpy as np
import pandas as pd
from flaml import AutoML
def test_max_iter_1():
date_rng = pd.date_range(start="2024-01-01", periods=100, freq="H")
X = pd.DataFrame({"ds": date_rng})
y_train_24h = np.random.rand(len(X)) * 100
# AutoML
settings = {
"max_iter": 1,
"estimator_list": ["xgboost", "lgbm"],
"starting_points": {"xgboost": {}, "lgbm": {}},
"task": "ts_forecast",
"log_file_name": "test_max_iter_1.log",
"seed": 41,
"mlflow_exp_name": "TestExp-max_iter-1",
"use_spark": False,
"n_concurrent_trials": 1,
"verbose": 1,
"featurization": "off",
"metric": "rmse",
"mlflow_logging": True,
}
automl = AutoML(**settings)
with mlflow.start_run(run_name="AutoMLModel-XGBoost-and-LGBM-max_iter_1"):
automl.fit(
X_train=X,
y_train=y_train_24h,
period=24,
X_val=X,
y_val=y_train_24h,
split_ratio=0,
force_cancel=False,
)
assert automl.model is not None, "AutoML failed to return a model"
assert automl.best_run_id is not None, "Best run ID should not be None with mlflow logging"
print("Best model:", automl.model)
print("Best run ID:", automl.best_run_id)
if __name__ == "__main__":
test_max_iter_1()


@@ -10,6 +10,18 @@ from flaml import AutoML
class TestMLFlowLoggingParam:
def test_update_and_install_requirements(self):
import mlflow
from sklearn import tree
from flaml.fabric.mlflow import update_and_install_requirements
with mlflow.start_run(run_name="test") as run:
sk_model = tree.DecisionTreeClassifier()
mlflow.sklearn.log_model(sk_model, "model", registered_model_name="test")
update_and_install_requirements(run_id=run.info.run_id)
def test_should_start_new_run_by_default(self, automl_settings):
with mlflow.start_run() as parent_run:
automl = AutoML()


@@ -143,4 +143,5 @@ def test_prep():
if __name__ == "__main__":
test_lrl2()
test_prep()


@@ -1,4 +1,5 @@
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml, load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GroupKFold, KFold, train_test_split
@@ -59,8 +60,6 @@ def test_groups_for_classification_task():
X, y = load_wine(return_X_y=True)
import numpy as np
automl = AutoML()
automl_settings = {
"time_budget": 2,
@@ -118,6 +117,43 @@ def test_groups_for_regression_task():
automl.fit(X_train, y_train, **automl_settings)
def test_groups_with_sample_weights():
"""Verify that sample weights can be used with group splits, i.e., that https://github.com/microsoft/FLAML/issues/1396 remains fixed."""
iris_dict_data = load_iris(as_frame=True) # sklearn Bunch with pandas frames when as_frame=True
iris_data = iris_dict_data["frame"] # pandas dataframe data + target
iris_data["cluster"] = np.random.randint(0, 5, iris_data.shape[0])
automl = AutoML()
X = iris_data[["sepal length (cm)", "sepal width (cm)", "petal length (cm)"]].to_numpy()
y = iris_data["petal width (cm)"]
sample_weight = pd.Series(np.random.rand(X.shape[0]))
(
X_train,
X_test,
y_train,
y_test,
groups_train,
groups_test,
sample_weight_train,
sample_weight_test,
) = train_test_split(X, y, iris_data["cluster"], sample_weight, random_state=42)
automl_settings = {
"max_iter": 5,
"time_budget": -1,
"metric": "r2",
"task": "regression",
"log_file_name": "error.log",
"log_type": "all",
"estimator_list": ["lgbm"],
"eval_method": "cv",
"split_type": "group",
"groups": groups_train,
"sample_weight": sample_weight_train,
}
automl.fit(X_train, y_train, **automl_settings)
assert automl.model is not None
def test_stratified_groupkfold():
from minio.error import ServerError
from sklearn.model_selection import StratifiedGroupKFold


@@ -24,6 +24,8 @@ model_path_list = [
if sys.platform.startswith("darwin") and sys.version_info[0] == 3 and sys.version_info[1] == 11:
pytest.skip("skipping Python 3.11 on macOS", allow_module_level=True)
pytestmark = pytest.mark.spark # set to spark as parallel testing raised a RuntimeError
def test_switch_1_1():
data_idx, model_path_idx = 0, 0


@@ -5,6 +5,8 @@ import sys
import pytest
from utils import get_automl_settings, get_toy_data_seqclassification
pytestmark = pytest.mark.spark # set to spark as parallel testing raised an MlflowException about a changed parameter
@pytest.mark.skipif(sys.platform in ["darwin", "win32"], reason="do not run on mac os or windows")
def test_cv():


@@ -10,6 +10,10 @@ from flaml.default import portfolio
if sys.platform.startswith("darwin") and sys.version_info[0] == 3 and sys.version_info[1] == 11:
pytest.skip("skipping Python 3.11 on macOS", allow_module_level=True)
pytestmark = (
pytest.mark.spark
) # set to spark as parallel testing raised ValueError: Feature NonExisting not implemented.
def pop_args(fit_kwargs):
fit_kwargs.pop("max_iter", None)


@@ -3,11 +3,13 @@ import sys
import warnings
import mlflow
import numpy as np
import pytest
import sklearn.datasets as skds
from packaging.version import Version
from flaml import AutoML
from flaml.automl.data import auto_convert_dtypes_pandas, auto_convert_dtypes_spark, get_random_dataframe
from flaml.tune.spark.utils import check_spark
warnings.simplefilter(action="ignore")
@@ -58,7 +60,7 @@ if sys.version_info >= (3, 11):
else:
skip_py311 = False
pytestmark = pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
pytestmark = [pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests."), pytest.mark.spark]
def _test_spark_synapseml_lightgbm(spark=None, task="classification"):
@@ -296,11 +298,88 @@ def _test_spark_large_df():
print("time cost in minutes: ", (end_time - start_time) / 60)
def test_get_random_dataframe():
# Test with an explicit row count, None ratio, and seed
df = get_random_dataframe(n_rows=50, ratio_none=0.2, seed=123)
assert df.shape == (50, 14) # 50 rows as requested; 14 columns by default
# Test column types
assert "timestamp" in df.columns and np.issubdtype(df["timestamp"].dtype, np.datetime64)
assert "id" in df.columns and np.issubdtype(df["id"].dtype, np.integer)
assert "score" in df.columns and np.issubdtype(df["score"].dtype, np.floating)
assert "category" in df.columns and df["category"].dtype.name == "category"
def test_auto_convert_dtypes_pandas():
# Create a test DataFrame with various types
import pandas as pd
test_df = pd.DataFrame(
{
"int_col": ["1", "2", "3", "4", "5", "6", "6"],
"float_col": ["1.1", "2.2", "3.3", "NULL", "5.5", "6.6", "6.6"],
"date_col": ["2021-01-01", "2021-02-01", "NA", "2021-04-01", "2021-05-01", "2021-06-01", "2021-06-01"],
"cat_col": ["A", "B", "A", "A", "B", "A", "B"],
"string_col": ["text1", "text2", "text3", "text4", "text5", "text6", "text7"],
}
)
# Convert dtypes
converted_df, schema = auto_convert_dtypes_pandas(test_df)
# Check conversions
assert schema["int_col"] == "int"
assert schema["float_col"] == "double"
assert schema["date_col"] == "timestamp"
assert schema["cat_col"] == "category"
assert schema["string_col"] == "string"
def test_auto_convert_dtypes_spark():
"""Test auto_convert_dtypes_spark function with various data types."""
import pandas as pd
# Create a test DataFrame with various types
test_pdf = pd.DataFrame(
{
"int_col": ["1", "2", "3", "4", "NA"],
"float_col": ["1.1", "2.2", "3.3", "NULL", "5.5"],
"date_col": ["2021-01-01", "2021-02-01", "NA", "2021-04-01", "2021-05-01"],
"cat_col": ["A", "B", "A", "C", "B"],
"string_col": ["text1", "text2", "text3", "text4", "text5"],
}
)
# Convert pandas DataFrame to Spark DataFrame
test_df = spark.createDataFrame(test_pdf)
# Convert dtypes
converted_df, schema = auto_convert_dtypes_spark(test_df)
# Check conversions
assert schema["int_col"] == "int"
assert schema["float_col"] == "double"
assert schema["date_col"] == "timestamp"
assert schema["cat_col"] == "string" # Conceptual category in schema
assert schema["string_col"] == "string"
# Verify the actual data types from the Spark DataFrame
spark_dtypes = dict(converted_df.dtypes)
assert spark_dtypes["int_col"] == "int"
assert spark_dtypes["float_col"] == "double"
assert spark_dtypes["date_col"] == "timestamp"
assert spark_dtypes["cat_col"] == "string" # In Spark, categories are still strings
assert spark_dtypes["string_col"] == "string"
if __name__ == "__main__":
test_spark_synapseml_classification()
test_spark_synapseml_regression()
test_spark_synapseml_rank()
test_spark_input_df()
test_get_random_dataframe()
test_auto_convert_dtypes_pandas()
test_auto_convert_dtypes_spark()
# import cProfile
# import pstats


@@ -25,7 +25,7 @@ os.environ["FLAML_MAX_CONCURRENT"] = "2"
spark_available, _ = check_spark()
skip_spark = not spark_available
pytestmark = pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
pytestmark = [pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests."), pytest.mark.spark]
def test_parallel_xgboost(hpo_method=None, data_size=1000):


@@ -1,6 +1,7 @@
import os
import unittest
import pytest
from sklearn.datasets import load_wine
from flaml import AutoML
@@ -24,6 +25,8 @@ if os.path.exists(os.path.join(os.getcwd(), "test", "spark", "custom_mylearner.p
else:
skip_my_learner = True
pytestmark = pytest.mark.spark
class TestEnsemble(unittest.TestCase):
def setUp(self) -> None:


@@ -9,7 +9,7 @@ from flaml.tune.spark.utils import check_spark
spark_available, _ = check_spark()
skip_spark = not spark_available
pytestmark = pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
pytestmark = [pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests."), pytest.mark.spark]
os.environ["FLAML_MAX_CONCURRENT"] = "2"


@@ -21,6 +21,7 @@ try:
from pyspark.ml.feature import VectorAssembler
except ImportError:
pass
pytestmark = pytest.mark.spark
warnings.filterwarnings("ignore")
skip_spark = importlib.util.find_spec("pyspark") is None


@@ -2,6 +2,7 @@ import os
import unittest
import numpy as np
import pytest
import scipy.sparse
from sklearn.datasets import load_iris, load_wine
@@ -12,6 +13,7 @@ from flaml.tune.spark.utils import check_spark
spark_available, _ = check_spark()
skip_spark = not spark_available
pytestmark = pytest.mark.spark
os.environ["FLAML_MAX_CONCURRENT"] = "2"


@@ -9,7 +9,7 @@ from flaml.tune.spark.utils import check_spark
spark_available, _ = check_spark()
skip_spark = not spark_available
pytestmark = pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
pytestmark = [pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests."), pytest.mark.spark]
here = os.path.abspath(os.path.dirname(__file__))
os.environ["FLAML_MAX_CONCURRENT"] = "2"


@@ -25,7 +25,7 @@ try:
except ImportError:
skip_spark = True
pytestmark = pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
pytestmark = [pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests."), pytest.mark.spark]
def test_overtime():


@@ -11,7 +11,7 @@ from flaml.tune.spark.utils import check_spark
spark_available, _ = check_spark()
skip_spark = not spark_available
pytestmark = pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
pytestmark = [pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests."), pytest.mark.spark]
os.environ["FLAML_MAX_CONCURRENT"] = "2"


@@ -14,7 +14,7 @@ from flaml.tune.spark.utils import check_spark
spark_available, _ = check_spark()
skip_spark = not spark_available
pytestmark = pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
pytestmark = [pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests."), pytest.mark.spark]
os.environ["FLAML_MAX_CONCURRENT"] = "2"
X, y = load_breast_cancer(return_X_y=True)


@@ -36,7 +36,7 @@ except ImportError:
print("Spark is not installed. Skip all spark tests.")
skip_spark = True
pytestmark = pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
pytestmark = [pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests."), pytest.mark.spark]
def test_with_parameters_spark():


@@ -59,6 +59,17 @@ def _test_hf_data():
except requests.exceptions.ConnectionError:
return
# Tests will only run if there is a GPU available
try:
import ray
pg = ray.util.placement_group([{"CPU": 1, "GPU": 1}])
if not pg.wait(timeout_seconds=10): # Wait 10 seconds for resources
raise RuntimeError("No available node types can fulfill resource request!")
except RuntimeError:
return
custom_sent_keys = ["sentence1", "sentence2"]
label_key = "label"


@@ -2,9 +2,9 @@
"name": "website",
"version": "0.0.0",
"private": true,
"resolutions" :{
"nth-check":"2.0.1",
"trim":"0.0.3",
"resolutions": {
"nth-check": "2.0.1",
"trim": "0.0.3",
"got": "11.8.5",
"node-forge": "1.3.0",
"minimatch": "3.0.5",
@@ -12,7 +12,7 @@
"eta": "2.0.0",
"@sideway/formula": "3.0.1",
"http-cache-semantics": "4.1.1"
},
},
"scripts": {
"docusaurus": "docusaurus",
"start": "docusaurus start",
@@ -33,13 +33,13 @@
"clsx": "^1.1.1",
"file-loader": "^6.2.0",
"hast-util-is-element": "1.1.0",
"minimatch": "3.0.5",
"react": "^17.0.1",
"react-dom": "^17.0.1",
"rehype-katex": "4",
"remark-math": "3",
"trim": "^0.0.3",
"url-loader": "^4.1.1",
"minimatch": "3.0.5"
"url-loader": "^4.1.1"
},
"browserslist": {
"production": [


@@ -153,6 +153,15 @@
"@babel/highlight" "^7.23.4"
chalk "^2.4.2"
"@babel/code-frame@^7.26.2":
version "7.26.2"
resolved "https://registry.yarnpkg.com/@babel/code-frame/-/code-frame-7.26.2.tgz#4b5fab97d33338eff916235055f0ebc21e573a85"
integrity sha512-RJlIHRueQgwWitWgF8OdFYGZX328Ax5BCemNGlqHfplnRT9ESi8JkFlvaVYbS+UubVY6dpv87Fs2u5M29iNFVQ==
dependencies:
"@babel/helper-validator-identifier" "^7.25.9"
js-tokens "^4.0.0"
picocolors "^1.0.0"
"@babel/compat-data@^7.17.7", "@babel/compat-data@^7.20.0", "@babel/compat-data@^7.20.1":
version "7.20.1"
resolved "https://registry.npmmirror.com/@babel/compat-data/-/compat-data-7.20.1.tgz#f2e6ef7790d8c8dbf03d379502dcc246dcce0b30"
@@ -429,6 +438,11 @@
resolved "https://registry.yarnpkg.com/@babel/helper-string-parser/-/helper-string-parser-7.23.4.tgz#9478c707febcbbe1ddb38a3d91a2e054ae622d83"
integrity sha512-803gmbQdqwdf4olxrX4AJyFBV/RTr3rSmOj0rKwesmzlfhYNDEs+/iOcznzpNWlJlIlTJC2QfPFcHB6DlzdVLQ==
"@babel/helper-string-parser@^7.25.9":
version "7.25.9"
resolved "https://registry.yarnpkg.com/@babel/helper-string-parser/-/helper-string-parser-7.25.9.tgz#1aabb72ee72ed35789b4bbcad3ca2862ce614e8c"
integrity sha512-4A/SCr/2KLd5jrtOMFzaKjVtAei3+2r/NChoBNoZ3EyP/+GlhoaEGoWOZUmFmoITP7zOJyHIMm+DYRd8o3PvHA==
"@babel/helper-validator-identifier@^7.18.6", "@babel/helper-validator-identifier@^7.19.1":
version "7.19.1"
resolved "https://registry.npmmirror.com/@babel/helper-validator-identifier/-/helper-validator-identifier-7.19.1.tgz#7eea834cf32901ffdc1a7ee555e2f9c27e249ca2"
@@ -439,6 +453,11 @@
resolved "https://registry.yarnpkg.com/@babel/helper-validator-identifier/-/helper-validator-identifier-7.22.20.tgz#c4ae002c61d2879e724581d96665583dbc1dc0e0"
integrity sha512-Y4OZ+ytlatR8AI+8KZfKuL5urKp7qey08ha31L8b3BwewJAoJamTzyvxPR/5D+KkdJCGPq/+8TukHBlY10FX9A==
"@babel/helper-validator-identifier@^7.25.9":
version "7.25.9"
resolved "https://registry.yarnpkg.com/@babel/helper-validator-identifier/-/helper-validator-identifier-7.25.9.tgz#24b64e2c3ec7cd3b3c547729b8d16871f22cbdc7"
integrity sha512-Ed61U6XJc3CVRfkERJWDz4dJwKe7iLmmJsbOGu9wSloNSFttHV0I8g6UAgb7qnK5ly5bGLPd4oXZlxCdANBOWQ==
"@babel/helper-validator-option@^7.18.6":
version "7.18.6"
resolved "https://registry.npmmirror.com/@babel/helper-validator-option/-/helper-validator-option-7.18.6.tgz#bf0d2b5a509b1f336099e4ff36e1a63aa5db4db8"
@@ -455,13 +474,12 @@
"@babel/types" "^7.19.0"
"@babel/helpers@^7.12.5", "@babel/helpers@^7.20.1":
version "7.20.1"
resolved "https://registry.npmmirror.com/@babel/helpers/-/helpers-7.20.1.tgz#2ab7a0fcb0a03b5bf76629196ed63c2d7311f4c9"
integrity sha512-J77mUVaDTUJFZ5BpP6mMn6OIl3rEWymk2ZxDBQJUG3P+PbmyMcF3bYWvz0ma69Af1oobDqT/iAsvzhB58xhQUg==
version "7.26.10"
resolved "https://registry.yarnpkg.com/@babel/helpers/-/helpers-7.26.10.tgz#6baea3cd62ec2d0c1068778d63cb1314f6637384"
integrity sha512-UPYc3SauzZ3JGgj87GgZ89JVdC5dj0AoetR5Bw6wj4niittNyFh6+eOGonYvJ1ao6B8lEa3Q3klS7ADZ53bc5g==
dependencies:
"@babel/template" "^7.18.10"
"@babel/traverse" "^7.20.1"
"@babel/types" "^7.20.0"
"@babel/template" "^7.26.9"
"@babel/types" "^7.26.10"
"@babel/highlight@^7.18.6":
version "7.18.6"
@@ -491,6 +509,13 @@
resolved "https://registry.yarnpkg.com/@babel/parser/-/parser-7.23.6.tgz#ba1c9e512bda72a47e285ae42aff9d2a635a9e3b"
integrity sha512-Z2uID7YJ7oNvAI20O9X0bblw7Qqs8Q2hFy0R9tAfnfLkp5MW0UH9eUvnDSnFwKZ0AvgS1ucqR4KzvVHgnke1VQ==
"@babel/parser@^7.26.9":
version "7.26.10"
resolved "https://registry.yarnpkg.com/@babel/parser/-/parser-7.26.10.tgz#e9bdb82f14b97df6569b0b038edd436839c57749"
integrity sha512-6aQR2zGE/QFi8JpDLjUZEPYOs7+mhKXm86VaKFiLP35JQwQb6bwUE+XbvkH0EptsYhbNBSUGaUBLKqxH1xSgsA==
dependencies:
"@babel/types" "^7.26.10"
"@babel/plugin-bugfix-safari-id-destructuring-collision-in-function-expression@^7.18.6":
version "7.18.6"
resolved "https://registry.npmmirror.com/@babel/plugin-bugfix-safari-id-destructuring-collision-in-function-expression/-/plugin-bugfix-safari-id-destructuring-collision-in-function-expression-7.18.6.tgz#da5b8f9a580acdfbe53494dba45ea389fb09a4d2"
@@ -1196,19 +1221,19 @@
"@babel/plugin-transform-typescript" "^7.18.6"
"@babel/runtime-corejs3@^7.15.4":
version "7.20.1"
resolved "https://registry.npmmirror.com/@babel/runtime-corejs3/-/runtime-corejs3-7.20.1.tgz#d0775a49bb5fba77e42cbb7276c9955c7b05af8d"
integrity sha512-CGulbEDcg/ND1Im7fUNRZdGXmX2MTWVVZacQi/6DiKE5HNwZ3aVTm5PV4lO8HHz0B2h8WQyvKKjbX5XgTtydsg==
version "7.26.10"
resolved "https://registry.yarnpkg.com/@babel/runtime-corejs3/-/runtime-corejs3-7.26.10.tgz#5a3185ca2813f8de8ae68622572086edf5cf51f2"
integrity sha512-uITFQYO68pMEYR46AHgQoyBg7KPPJDAbGn4jUTIRgCFJIp88MIBUianVOplhZDEec07bp9zIyr4Kp0FCyQzmWg==
dependencies:
core-js-pure "^3.25.1"
regenerator-runtime "^0.13.10"
core-js-pure "^3.30.2"
regenerator-runtime "^0.14.0"
"@babel/runtime@^7.1.2", "@babel/runtime@^7.10.2", "@babel/runtime@^7.10.3", "@babel/runtime@^7.12.13", "@babel/runtime@^7.15.4", "@babel/runtime@^7.8.4":
version "7.20.1"
resolved "https://registry.npmmirror.com/@babel/runtime/-/runtime-7.20.1.tgz#1148bb33ab252b165a06698fde7576092a78b4a9"
integrity sha512-mrzLkl6U9YLF8qpqI7TB82PESyEGjm/0Ly91jG575eVxMMlb8fYfOXFZIJ8XfLrJZQbm7dlKry2bJmXBUEkdFg==
version "7.26.10"
resolved "https://registry.yarnpkg.com/@babel/runtime/-/runtime-7.26.10.tgz#a07b4d8fa27af131a633d7b3524db803eb4764c2"
integrity sha512-2WJMeRQPHKSPemqk/awGrAiuFfzBmOIPXKizAsVhWH9YJqLZ0H+HS4c8loHGgW6utJ3E/ejXQUsiGaQy2NZ9Fw==
dependencies:
regenerator-runtime "^0.13.10"
regenerator-runtime "^0.14.0"
"@babel/template@^7.12.7", "@babel/template@^7.18.10":
version "7.18.10"
@@ -1228,6 +1253,15 @@
"@babel/parser" "^7.22.15"
"@babel/types" "^7.22.15"
"@babel/template@^7.26.9":
version "7.26.9"
resolved "https://registry.yarnpkg.com/@babel/template/-/template-7.26.9.tgz#4577ad3ddf43d194528cff4e1fa6b232fa609bb2"
integrity sha512-qyRplbeIpNZhmzOysF/wFMuP9sctmh2cFzRAZOn1YapxBsE1i9bJIY586R/WBLfLcmcBlM8ROBiQURnnNy+zfA==
dependencies:
"@babel/code-frame" "^7.26.2"
"@babel/parser" "^7.26.9"
"@babel/types" "^7.26.9"
"@babel/traverse@^7.12.13", "@babel/traverse@^7.12.9", "@babel/traverse@^7.19.0", "@babel/traverse@^7.19.1", "@babel/traverse@^7.20.1":
version "7.23.6"
resolved "https://registry.yarnpkg.com/@babel/traverse/-/traverse-7.23.6.tgz#b53526a2367a0dd6edc423637f3d2d0f2521abc5"
@@ -1262,6 +1296,14 @@
"@babel/helper-validator-identifier" "^7.22.20"
to-fast-properties "^2.0.0"
"@babel/types@^7.26.10", "@babel/types@^7.26.9":
version "7.26.10"
resolved "https://registry.yarnpkg.com/@babel/types/-/types-7.26.10.tgz#396382f6335bd4feb65741eacfc808218f859259"
integrity sha512-emqcG3vHrpxUKTrxcblR36dcrcoRDvKmnL/dCL6ZsHaShW80qxCAcNhzQZrpeM765VzEos+xOi4s+r4IXzTwdQ==
dependencies:
"@babel/helper-string-parser" "^7.25.9"
"@babel/helper-validator-identifier" "^7.25.9"
"@docsearch/css@3.3.0":
version "3.3.0"
resolved "https://registry.npmmirror.com/@docsearch/css/-/css-3.3.0.tgz#d698e48302d12240d7c2f7452ccb2d2239a8cd80"
@@ -2995,15 +3037,10 @@ caniuse-api@^3.0.0:
lodash.memoize "^4.1.2"
lodash.uniq "^4.5.0"
caniuse-lite@^1.0.0, caniuse-lite@^1.0.30001400, caniuse-lite@^1.0.30001426:
version "1.0.30001430"
resolved "https://registry.npmmirror.com/caniuse-lite/-/caniuse-lite-1.0.30001430.tgz#638a8ae00b5a8a97e66ff43733b2701f81b101fa"
integrity sha512-IB1BXTZKPDVPM7cnV4iaKaHxckvdr/3xtctB3f7Hmenx3qYBhGtTZ//7EllK66aKXW98Lx0+7Yr0kxBtIt3tzg==
caniuse-lite@^1.0.30001646:
version "1.0.30001657"
resolved "https://registry.yarnpkg.com/caniuse-lite/-/caniuse-lite-1.0.30001657.tgz#29fd504bffca719d1c6b63a1f6f840be1973a660"
integrity sha512-DPbJAlP8/BAXy3IgiWmZKItubb3TYGP0WscQQlVGIfT4s/YlFYVuJgyOsQNP7rJRChx/qdMeLJQJP0Sgg2yjNA==
caniuse-lite@^1.0.0, caniuse-lite@^1.0.30001400, caniuse-lite@^1.0.30001426, caniuse-lite@^1.0.30001646:
version "1.0.30001718"
resolved "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001718.tgz"
integrity sha512-AflseV1ahcSunK53NfEs9gFWgOEmzr0f+kaMFA4xiLZlr9Hzt7HxcSpIFcnNCUkz6R6dWKa54rUz3HUmI3nVcw==
ccount@^1.0.0, ccount@^1.0.3:
version "1.1.0"
@@ -3326,10 +3363,10 @@ core-js-compat@^3.25.1:
dependencies:
browserslist "^4.21.4"
core-js-pure@^3.25.1:
version "3.26.0"
resolved "https://registry.npmmirror.com/core-js-pure/-/core-js-pure-3.26.0.tgz#7ad8a5dd7d910756f3124374b50026e23265ca9a"
integrity sha512-LiN6fylpVBVwT8twhhluD9TzXmZQQsr2I2eIKtWNbZI1XMfBT7CV18itaN6RA7EtQd/SDdRx/wzvAShX2HvhQA==
core-js-pure@^3.30.2:
version "3.41.0"
resolved "https://registry.yarnpkg.com/core-js-pure/-/core-js-pure-3.41.0.tgz#349fecad168d60807a31e83c99d73d786fe80811"
integrity sha512-71Gzp96T9YPk63aUvE5Q5qP+DryB4ZloUZPSOebGM88VNw8VNfvdA7z6kGA8iGOTEzAomsRidp4jXSmUIJsL+Q==
core-js@^3.18.0:
version "3.26.0"
@@ -4830,9 +4867,9 @@ http-parser-js@>=0.5.1:
integrity sha512-SGeBX54F94Wgu5RH3X5jsDtf4eHyRogWX1XGT3b4HuW3tQPM4AaBzoUji/4AAJNXCEOWZ5O0DgZmJw1947gD5Q==
http-proxy-middleware@^2.0.3:
version "2.0.7"
resolved "https://registry.yarnpkg.com/http-proxy-middleware/-/http-proxy-middleware-2.0.7.tgz#915f236d92ae98ef48278a95dedf17e991936ec6"
integrity sha512-fgVY8AV7qU7z/MmXJ/rxwbrtQH4jBQ9m7kp3llF0liB7glmFeVZFBepQb32T3y8n8k2+AEYuMPCpinYW+/CuRA==
version "2.0.9"
resolved "https://registry.yarnpkg.com/http-proxy-middleware/-/http-proxy-middleware-2.0.9.tgz#e9e63d68afaa4eee3d147f39149ab84c0c2815ef"
integrity sha512-c1IyJYLYppU574+YI7R4QyX2ystMtVXZwIdzazUIPIJsHuWNd+mho2j+bKoHftndicGj9yh+xjd+l0yj7VeT1Q==
dependencies:
"@types/http-proxy" "^1.17.8"
http-proxy "^1.18.1"
@@ -6441,9 +6478,9 @@ prism-react-renderer@^1.2.1:
integrity sha512-IJ+MSwBWKG+SM3b2SUfdrhC+gu01QkV2KmRQgREThBfSQRoufqRfxfHUxpG1WcaFjP+kojcFyO9Qqtpgt3qLCg==
prismjs@^1.23.0:
version "1.29.0"
resolved "https://registry.npmmirror.com/prismjs/-/prismjs-1.29.0.tgz#f113555a8fa9b57c35e637bba27509dcf802dd12"
integrity sha512-Kx/1w86q/epKcmte75LNrEoT+lX8pBpavuAbvJWRXar7Hz8jrtF+e3vY751p0R8H9HdArwaCTNDDzHg/ScJK1Q==
version "1.30.0"
resolved "https://registry.yarnpkg.com/prismjs/-/prismjs-1.30.0.tgz#d9709969d9d4e16403f6f348c63553b19f0975a9"
integrity sha512-DEvV2ZF2r2/63V+tK8hQvrR2ZGn10srHbXviTlcv7Kpzw8jWiNTqbVgjO3IY8RxrrOUF8VPMQQFysYYYv0YZxw==
process-nextick-args@~2.0.0:
version "2.0.1"
@@ -6816,10 +6853,10 @@ regenerate@^1.4.2:
resolved "https://registry.npmmirror.com/regenerate/-/regenerate-1.4.2.tgz#b9346d8827e8f5a32f7ba29637d398b69014848a"
integrity sha512-zrceR/XhGYU/d/opr2EKO7aRHUeiBI8qjtfHqADTwZd6Szfy16la6kqD0MIUs5z5hx6AaKa+PixpPrR289+I0A==
regenerator-runtime@^0.13.10:
version "0.13.10"
resolved "https://registry.npmmirror.com/regenerator-runtime/-/regenerator-runtime-0.13.10.tgz#ed07b19616bcbec5da6274ebc75ae95634bfc2ee"
integrity sha512-KepLsg4dU12hryUO7bp/axHAKvwGOCV0sGloQtpagJ12ai+ojVDqkeGSiRX1zlq+kjIMZ1t7gpze+26QqtdGqw==
regenerator-runtime@^0.14.0:
version "0.14.1"
resolved "https://registry.yarnpkg.com/regenerator-runtime/-/regenerator-runtime-0.14.1.tgz#356ade10263f685dda125100cd862c1db895327f"
integrity sha512-dYnhHh0nJoMfnkZs6GmmhFknAGRrLznOu5nc9ML+EJxGvrx6H7teuevqVqCuPcPK//3eDrrjQhehXVx9cnkGdw==
regenerator-transform@^0.15.0:
version "0.15.0"
@@ -7272,14 +7309,7 @@ send@0.19.0:
range-parser "~1.2.1"
statuses "2.0.1"
serialize-javascript@^6.0.0:
version "6.0.0"
resolved "https://registry.npmmirror.com/serialize-javascript/-/serialize-javascript-6.0.0.tgz#efae5d88f45d7924141da8b5c3a7a7e663fefeb8"
integrity sha512-Qr3TosvguFt8ePWqsvRfrKyQXIiW+nGbYpy8XK24NQHE83caxWt+mIymTT19DGFbNWNLfEwsrkSmN64lVWB9ag==
dependencies:
randombytes "^2.1.0"
serialize-javascript@^6.0.1:
serialize-javascript@^6.0.0, serialize-javascript@^6.0.1:
version "6.0.2"
resolved "https://registry.yarnpkg.com/serialize-javascript/-/serialize-javascript-6.0.2.tgz#defa1e055c83bf6d59ea805d8da862254eb6a6c2"
integrity sha512-Saa1xPByTTq2gdeFZYLLo+RFE35NHZkAbqZeWNd3BpzppeVisAqpDjcp8dyf6uIvEqJRd46jemmyA4iFIeVk8g==