Update readme, bump version to 2.4.0, fix CI errors (#1466)

* Update gitignore * Bump version to 2.4.0 * Update readme * Pre-download california housing data * Use pre-downloaded california housing data * Pin lightning<=2.5.6 * Fix typo in find and replace * Fix estimators has no attribute __sklearn_tags__ * Pin torch to 2.2.2 in tests * Fix conflict * Update pytorch-forecasting * Update pytorch-forecasting * Update pytorch-forecasting * Use numpy<2 for testing * Update scikit-learn * Run Build and UT every other day * Pin pip<24.1 * Pin pip<24.1 in pipeline * Loosen pip, install pytorch_forecasting only in py311 * Add support to new versions of nlp dependencies * Fix formats * Remove redefinition * Update mlflow versions * Fix mlflow version syntax * Update gitignore * Clean up cache to free space * Remove clean up action cache * Fix blendsearch * Update test workflow * Update setup.py * Fix catboost version * Update workflow * Prepare for python 3.14 * Support no catboost * Fix tests * Fix python_requires * Update test workflow * Fix vw tests * Remove python 3.9 * Fix nlp tests * Fix prophet * Print pip freeze for better debugging * Fix Optuna search does not support parameters of type Float with samplers of type Quantized * Save dependencies for later inspection * Fix coverage.xml not exists * Fix github action permission * Handle python 3.13 * Address openml is not installed * Check dependencies before run tests * Update dependencies * Fix syntax error * Use bash * Update dependencies * Fix git error * Loose mlflow constraints * Add rerun, use mlflow-skinny * Fix git error * Remove ray tests * Update xgboost versions * Fix automl pickle error * Don't test python 3.10 on macos as it's stuck * Rebase before push * Reduce number of branches
2026-02-09 02:09:16 +08:00 · 2026-01-09 13:40:52 +08:00
parent 7f42bece89
commit 1285700d7a
31 changed files with 543 additions and 237 deletions
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -22,8 +22,12 @@ on:
      - 'setup.py'
  merge_group:
    types: [checks_requested]
+  schedule:
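+    # Odd days of each month (1, 3, ..., 31) at 02:00 UTC -- roughly every other day; runs on consecutive days across a 31st -> 1st boundary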
+    - cron: '0 2 */2 * *'

-permissions: {}
+permissions:
+  contents: write
 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
@@ -36,7 +40,10 @@ jobs:
      fail-fast: false
      matrix:
        os: [ubuntu-latest, macos-latest, windows-latest]
-        python-version: ["3.9", "3.10", "3.11"]
+        python-version: ["3.10", "3.11"]
+        exclude:
+          - os: macos-latest
+            python-version: "3.10"
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python ${{ matrix.python-version }}
@@ -44,7 +51,7 @@ jobs:
        with:
          python-version: ${{ matrix.python-version }}
      - name: On mac, install libomp to facilitate lgbm and xgboost install
-        if: matrix.os == 'macOS-latest'
+        if: matrix.os == 'macos-latest'
        run: |
          brew update
          brew install libomp
@@ -70,33 +77,43 @@ jobs:
        run: |
          pip install pyspark==3.5.1
          pip list | grep "pyspark"
-      - name: If linux and python<3.11, install ray 2
-        if: matrix.os == 'ubuntu-latest' && matrix.python-version != '3.11'
+      - name: On Ubuntu python 3.12, install pyspark 4.0.1
+        if: matrix.python-version == '3.12' && matrix.os == 'ubuntu-latest'
        run: |
-          pip install "ray[tune]<2.5.0"
-      - name: If mac and python 3.10, install ray and xgboost 1
-        if: matrix.os == 'macOS-latest' && matrix.python-version == '3.10'
-        run: |
-          pip install -e .[ray]
-          # use macOS to test xgboost 1, but macOS also supports xgboost 2
-          pip install "xgboost<2"
-      - name: If linux, install prophet on python < 3.9
-        if: matrix.os == 'ubuntu-latest' && matrix.python-version == '3.8'
+          pip install pyspark==4.0.1
+          pip list | grep "pyspark"
+      # # TODO: support ray
+      # - name: If linux and python<3.11, install ray 2
+      #   if: matrix.os == 'ubuntu-latest' && matrix.python-version < '3.11'
+      #   run: |
+      #     pip install "ray[tune]<2.5.0"
+      - name: Install prophet when on linux
+        if: matrix.os == 'ubuntu-latest'
        run: |
          pip install -e .[forecast]
-      - name: Install vw on python < 3.10
-        if: matrix.python-version == '3.8' || matrix.python-version == '3.9'
+      # TODO: support vw for python 3.10+
+      - name:  If linux and python<3.10, install vw
+        if: matrix.os == 'ubuntu-latest' && matrix.python-version < '3.10'
        run: |
          pip install -e .[vw]
+      - name: Pip freeze
+        run: |
+          pip freeze
+      - name: Check dependencies
+        run: |
+          python test/check_dependency.py
+      - name: Clear pip cache
+        run: |
+          pip cache purge
      - name: Test with pytest
        if: matrix.python-version != '3.10'
        run: |
-          pytest test/ --ignore=test/autogen
+          pytest test/ --ignore=test/autogen --reruns 2 --reruns-delay 10
      - name: Coverage
        if: matrix.python-version == '3.10'
        run: |
          pip install coverage
-          coverage run -a -m pytest test --ignore=test/autogen
+          coverage run -a -m pytest test --ignore=test/autogen --reruns 2 --reruns-delay 10
          coverage xml
      - name: Upload coverage to Codecov
        if: matrix.python-version == '3.10'
@@ -104,28 +121,24 @@ jobs:
        with:
          file: ./coverage.xml
          flags: unittests
+      - name: Save dependencies
+        shell: bash
+        run: |
+          git config --global user.name 'github-actions[bot]'
+          git config --global user.email 'github-actions[bot]@users.noreply.github.com'
+          git config advice.addIgnoredFile false

-  # docs:
+          BRANCH=unit-tests-installed-dependencies
+          git fetch origin
+          git checkout -B "$BRANCH"
+          if git show-ref --verify --quiet "refs/remotes/origin/$BRANCH"; then
+            git rebase "origin/$BRANCH"
+          fi

-  #   runs-on: ubuntu-latest
-
-  #   steps:
-  #     - uses: actions/checkout@v3
-  #     - name: Setup Python
-  #       uses: actions/setup-python@v4
-  #       with:
-  #         python-version: '3.8'
-  #     - name: Compile documentation
-  #       run: |
-  #           pip install -e .
-  #           python -m pip install sphinx sphinx_rtd_theme
-  #           cd docs
-  #           make html
-  #     - name: Deploy to GitHub pages
-  #       if: ${{ github.ref == 'refs/heads/main' }}
-  #       uses: JamesIves/github-pages-deploy-action@3.6.2
-  #       with:
-  #         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-  #         BRANCH: gh-pages
-  #         FOLDER: docs/_build/html
-  #         CLEAN: true
+          pip freeze > installed_all_dependencies_${{ matrix.python-version }}_${{ matrix.os }}.txt
+          python test/check_dependency.py > installed_first_tier_dependencies_${{ matrix.python-version }}_${{ matrix.os }}.txt
+          git add installed_*dependencies*.txt
+          mv coverage.xml ./coverage_${{ matrix.python-version }}_${{ matrix.os }}.xml || true
+          git add -f ./coverage_${{ matrix.python-version }}_${{ matrix.os }}.xml || true
+          git commit -m "Update installed dependencies for Python ${{ matrix.python-version }} on ${{ matrix.os }}" || exit 0
+          git push origin "$BRANCH"
--- a/.gitignore
+++ b/.gitignore
@@ -172,7 +172,7 @@ test/default
 test/housing.json
 test/nlp/default/transformer_ms/seq-classification.json

-flaml/fabric/fanova/_fanova.c
+flaml/fabric/fanova/*fanova.c
 # local config files
 *.config.local

@@ -184,3 +184,7 @@ notebook/lightning_logs/
 lightning_logs/
 flaml/autogen/extensions/tmp/
 test/autogen/my_tmp/
+catboost_*
+
+# Internal configs
+.pypirc
--- a/README.md
+++ b/README.md
@@ -14,15 +14,9 @@
    <br>
 </p>

-:fire: FLAML supports AutoML and Hyperparameter Tuning in [Microsoft Fabric Data Science](https://learn.microsoft.com/en-us/fabric/data-science/automated-machine-learning-fabric). In addition, we've introduced Python 3.11 support, along with a range of new estimators, and comprehensive integration with MLflow—thanks to contributions from the Microsoft Fabric product team.
+:fire: FLAML supports AutoML and Hyperparameter Tuning in [Microsoft Fabric Data Science](https://learn.microsoft.com/en-us/fabric/data-science/automated-machine-learning-fabric). In addition, we've introduced Python 3.11 and 3.12 support, along with a range of new estimators, and comprehensive integration with MLflow—thanks to contributions from the Microsoft Fabric product team.

-:fire: Heads-up: We have migrated [AutoGen](https://microsoft.github.io/autogen/) into a dedicated [github repository](https://github.com/microsoft/autogen). Alongside this move, we have also launched a dedicated [Discord](https://discord.gg/pAbnFJrkgZ) server and a [website](https://microsoft.github.io/autogen/) for comprehensive documentation.
-
-:fire: The automated multi-agent chat framework in [AutoGen](https://microsoft.github.io/autogen/) is in preview from v2.0.0.
-
-:fire: FLAML is highlighted in OpenAI's [cookbook](https://github.com/openai/openai-cookbook#related-resources-from-around-the-web).
-
-:fire: [autogen](https://microsoft.github.io/autogen/) is released with support for ChatGPT and GPT-4, based on [Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference](https://arxiv.org/abs/2303.04673).
+:fire: Heads-up: [AutoGen](https://microsoft.github.io/autogen/) has moved to a dedicated [GitHub repository](https://github.com/microsoft/autogen). FLAML no longer includes the `autogen` module—please use AutoGen directly.

 ## What is FLAML

@@ -30,7 +24,7 @@ FLAML is a lightweight Python library for efficient automation of machine
 learning and AI operations. It automates workflow based on large language models, machine learning models, etc.
 and optimizes their performance.

- FLAML enables building next-gen GPT-X applications based on multi-agent conversations with minimal effort. It simplifies the orchestration, automation and optimization of a complex GPT-X workflow. It maximizes the performance of GPT-X models and augments their weakness.
+- FLAML enables economical automation and tuning for ML/AI workflows, including model selection and hyperparameter optimization under resource constraints.
 - For common machine learning tasks like classification and regression, it quickly finds quality models for user-provided data with low computational resources. It is easy to customize or extend. Users can find their desired customizability from a smooth range.
 - It supports fast and economical automatic tuning (e.g., inference hyperparameters for foundation models, configurations in MLOps/LMOps workflows, pipelines, mathematical/statistical models, algorithms, computing experiments, software configurations), capable of handling large search space with heterogeneous evaluation cost and complex constraints/guidance/early stopping.

@@ -46,10 +40,10 @@ FLAML requires **Python version >= 3.9**. It can be installed from pip:
 pip install flaml
 ```

-Minimal dependencies are installed without extra options. You can install extra options based on the feature you need. For example, use the following to install the dependencies needed by the [`autogen`](https://microsoft.github.io/autogen/) package.
+Minimal dependencies are installed without extra options. You can install extra options based on the feature you need. For example, use the following to install the dependencies needed by the [`automl`](https://microsoft.github.io/FLAML/docs/Use-Cases/Task-Oriented-AutoML) module.

 ```bash
-pip install "flaml[autogen]"
+pip install "flaml[automl]"
 ```

 Find more options in [Installation](https://microsoft.github.io/FLAML/docs/Installation).
@@ -57,39 +51,6 @@ Each of the [`notebook examples`](https://github.com/microsoft/FLAML/tree/main/n

 ## Quickstart

- (New) The [autogen](https://microsoft.github.io/autogen/) package enables the next-gen GPT-X applications with a generic multi-agent conversation framework.
-  It offers customizable and conversable agents which integrate LLMs, tools and human.
-  By automating chat among multiple capable agents, one can easily make them collectively perform tasks autonomously or with human feedback, including tasks that require using tools via code. For example,
-
-```python
-from flaml import autogen
-
-assistant = autogen.AssistantAgent("assistant")
-user_proxy = autogen.UserProxyAgent("user_proxy")
-user_proxy.initiate_chat(
-    assistant,
-    message="Show me the YTD gain of 10 largest technology companies as of today.",
-)
-# This initiates an automated chat between the two agents to solve the task
-```
-
-Autogen also helps maximize the utility out of the expensive LLMs such as ChatGPT and GPT-4. It offers a drop-in replacement of `openai.Completion` or `openai.ChatCompletion` with powerful functionalites like tuning, caching, templating, filtering. For example, you can optimize generations by LLM with your own tuning data, success metrics and budgets.
-
-```python
-# perform tuning
-config, analysis = autogen.Completion.tune(
-    data=tune_data,
-    metric="success",
-    mode="max",
-    eval_func=eval_func,
-    inference_budget=0.05,
-    optimization_budget=3,
-    num_samples=-1,
-)
-# perform inference for a test instance
-response = autogen.Completion.create(context=test_instance, **config)
-```
-
 - With three lines of code, you can start using this economical and fast
  AutoML engine as a [scikit-learn style estimator](https://microsoft.github.io/FLAML/docs/Use-Cases/Task-Oriented-AutoML).

--- a/flaml/automl/automl.py
+++ b/flaml/automl/automl.py
@@ -401,6 +401,24 @@ class AutoML(BaseEstimator):
        self._estimator_type = "classifier" if settings["task"] in CLASSIFICATION else "regressor"
        self.best_run_id = None

+    def __getstate__(self):
+        """Customize pickling to avoid serializing runtime-only objects.
+
+        MLflow's sklearn flavor serializes estimators via (cloud)pickle. During
+        AutoML fitting we may attach an internal mlflow integration instance
+        which holds `concurrent.futures.Future` objects and executors containing
+        thread locks, which are not picklable.
+        """
+
+        state = self.__dict__.copy()
+        state.pop("mlflow_integration", None)
+        return state
+
+    def __setstate__(self, state):
+        self.__dict__.update(state)
+        # Ensure attribute exists post-unpickle.
+        self.mlflow_integration = None
+
    def get_params(self, deep: bool = False) -> dict:
        return self._settings.copy()

--- a/flaml/automl/data.py
+++ b/flaml/automl/data.py
@@ -50,7 +50,10 @@ def load_openml_dataset(dataset_id, data_dir=None, random_state=0, dataset_forma
    """
    import pickle

+    try:
        import openml
+    except ImportError:
+        openml = None
    from sklearn.model_selection import train_test_split

    filename = "openml_ds" + str(dataset_id) + ".pkl"
@@ -61,15 +64,15 @@ def load_openml_dataset(dataset_id, data_dir=None, random_state=0, dataset_forma
            dataset = pickle.load(f)
    else:
        print("download dataset from openml")
-        dataset = openml.datasets.get_dataset(dataset_id)
+        dataset = openml.datasets.get_dataset(dataset_id) if openml else None
        if not os.path.exists(data_dir):
            os.makedirs(data_dir)
        with open(filepath, "wb") as f:
            pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
-    print("Dataset name:", dataset.name)
+    print("Dataset name:", dataset.name) if dataset else None
    try:
        X, y, *__ = dataset.get_data(target=dataset.default_target_attribute, dataset_format=dataset_format)
-    except ValueError:
+    except (ValueError, AttributeError, TypeError):
        from sklearn.datasets import fetch_openml

        X, y = fetch_openml(data_id=dataset_id, return_X_y=True)
--- a/flaml/automl/ml.py
+++ b/flaml/automl/ml.py
@@ -127,9 +127,21 @@ def metric_loss_score(
            import datasets

            datasets_metric_name = huggingface_submetric_to_metric.get(metric_name, metric_name.split(":")[0])
-            metric = datasets.load_metric(datasets_metric_name, trust_remote_code=True)
            metric_mode = huggingface_metric_to_mode[datasets_metric_name]

+            # datasets>=3 removed load_metric; prefer evaluate if available
+            try:
+                import evaluate
+
+                metric = evaluate.load(datasets_metric_name, trust_remote_code=True)
+            except Exception:
+                if hasattr(datasets, "load_metric"):
+                    metric = datasets.load_metric(datasets_metric_name, trust_remote_code=True)
+                else:
+                    from datasets import load_metric as _load_metric  # older datasets
+
+                    metric = _load_metric(datasets_metric_name, trust_remote_code=True)
+
            if metric_name.startswith("seqeval"):
                y_processed_true = [[labels[tr] for tr in each_list] for each_list in y_processed_true]
            elif metric in ("pearsonr", "spearmanr"):
--- a/flaml/automl/model.py
+++ b/flaml/automl/model.py
@@ -111,7 +111,7 @@ def limit_resource(memory_limit, time_limit):
                pass


-class BaseEstimator:
+class BaseEstimator(sklearn.base.ClassifierMixin, sklearn.base.BaseEstimator):
    """The abstract class for all learners.

    Typical examples:
--- a/flaml/automl/nlp/huggingface/training_args.py
+++ b/flaml/automl/nlp/huggingface/training_args.py
@@ -77,6 +77,14 @@ class TrainingArgumentsForAuto(TrainingArguments):

    logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."})

+    # Newer versions of HuggingFace Transformers may access `TrainingArguments.generation_config`
+    # (e.g., in generation-aware trainers/callbacks). Keep this attribute to remain compatible
+    # while defaulting to None for non-generation tasks.
+    generation_config: Optional[object] = field(
+        default=None,
+        metadata={"help": "Optional generation config (or path) used by generation-aware trainers."},
+    )
+
    @staticmethod
    def load_args_from_console():
        from dataclasses import fields
--- a/flaml/automl/time_series/tft.py
+++ b/flaml/automl/time_series/tft.py
@@ -1,3 +1,4 @@
+import inspect
 import time

 try:
@@ -106,12 +107,17 @@ class TemporalFusionTransformerEstimator(TimeSeriesEstimator):
    def fit(self, X_train, y_train, budget=None, **kwargs):
        import warnings

+        try:
+            import lightning.pytorch as pl
+            from lightning.pytorch.callbacks import EarlyStopping, LearningRateMonitor
+            from lightning.pytorch.loggers import TensorBoardLogger
+        except ImportError:
            import pytorch_lightning as pl
+            from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
+            from pytorch_lightning.loggers import TensorBoardLogger
        import torch
        from pytorch_forecasting import TemporalFusionTransformer
        from pytorch_forecasting.metrics import QuantileLoss
-        from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
-        from pytorch_lightning.loggers import TensorBoardLogger

        # a bit of monkey patching to fix the MacOS test
        # all the log_prediction method appears to do is plot stuff, which ?breaks github tests
@@ -132,12 +138,26 @@ class TemporalFusionTransformerEstimator(TimeSeriesEstimator):
        lr_logger = LearningRateMonitor()  # log the learning rate
        logger = TensorBoardLogger(kwargs.get("log_dir", "lightning_logs"))  # logging results to a tensorboard
        default_trainer_kwargs = dict(
-            gpus=self._kwargs.get("gpu_per_trial", [0]) if torch.cuda.is_available() else None,
            max_epochs=max_epochs,
            gradient_clip_val=gradient_clip_val,
            callbacks=[lr_logger, early_stop_callback],
            logger=logger,
        )
+
+        # PyTorch Lightning >=2.0 replaced `gpus` with `accelerator`/`devices`.
+        # Also, passing `gpus=None` is not accepted on newer versions.
+        trainer_sig_params = inspect.signature(pl.Trainer.__init__).parameters
+        if torch.cuda.is_available() and "gpus" in trainer_sig_params:
+            gpus = self._kwargs.get("gpu_per_trial", None)
+            if gpus is not None:
+                default_trainer_kwargs["gpus"] = gpus
+        elif torch.cuda.is_available() and "devices" in trainer_sig_params:
+            devices = self._kwargs.get("gpu_per_trial", None)
+            if devices == -1:
+                devices = "auto"
+            if devices is not None:
+                default_trainer_kwargs["accelerator"] = "gpu"
+                default_trainer_kwargs["devices"] = devices
        trainer = pl.Trainer(
            **default_trainer_kwargs,
        )
@@ -157,6 +177,13 @@ class TemporalFusionTransformerEstimator(TimeSeriesEstimator):
            val_dataloaders=val_dataloader,
        )
        best_model_path = trainer.checkpoint_callback.best_model_path
+        # PyTorch 2.6 changed `torch.load` default `weights_only` from False -> True.
+        # Some Lightning checkpoints (including those produced here) can require full unpickling.
+        # This path is generated locally during training, so it's trusted.
+        load_sig_params = inspect.signature(TemporalFusionTransformer.load_from_checkpoint).parameters
+        if "weights_only" in load_sig_params:
+            best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path, weights_only=False)
+        else:
            best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path)
        train_time = time.time() - current_time
        self._model = best_tft
--- a/flaml/automl/time_series/ts_data.py
+++ b/flaml/automl/time_series/ts_data.py
@@ -9,6 +9,7 @@ import numpy as np
 try:
    import pandas as pd
    from pandas import DataFrame, Series, to_datetime
+    from pandas.api.types import is_datetime64_any_dtype
    from scipy.sparse import issparse
    from sklearn.compose import ColumnTransformer
    from sklearn.impute import SimpleImputer
@@ -392,6 +393,15 @@ class DataTransformerTS:
        assert len(self.num_columns) == 0, "Trying to call fit() twice, something is wrong"

        for column in X.columns:
+            # Never treat the time column as a feature for sklearn preprocessing
+            if column == self.time_col:
+                continue
+
+            # Robust datetime detection (covers datetime64[ms/us/ns], tz-aware, etc.)
+            if is_datetime64_any_dtype(X[column]):
+                self.datetime_columns.append(column)
+                continue
+
            # sklearn/utils/validation.py needs int/float values
            if X[column].dtype.name in ("object", "category", "string"):
                if (
--- a/flaml/tune/searcher/blendsearch.py
+++ b/flaml/tune/searcher/blendsearch.py
@@ -244,6 +244,7 @@ class BlendSearch(Searcher):
                    evaluated_rewards=evaluated_rewards,
                )
            except (AssertionError, ValueError):
+                try:
                    self._gs = GlobalSearch(
                        space=gs_space,
                        metric=metric,
@@ -251,6 +252,24 @@ class BlendSearch(Searcher):
                        seed=gs_seed,
                        sampler=sampler,
                    )
+                except ValueError:
+                    # Ray Tune's OptunaSearch converts Tune domains into Optuna
+                    # distributions. Optuna disallows integer log distributions
+                    # with step != 1 (e.g., qlograndint with q>1), which can
+                    # raise here. Fall back to FLAML's OptunaSearch wrapper,
+                    # which handles these spaces more permissively.
+                    if getattr(GlobalSearch, "__module__", "").startswith("ray.tune"):
+                        from .suggestion import OptunaSearch as _FallbackOptunaSearch
+
+                        self._gs = _FallbackOptunaSearch(
+                            space=gs_space,
+                            metric=metric,
+                            mode=mode,
+                            seed=gs_seed,
+                            sampler=sampler,
+                        )
+                    else:
+                        raise
            self._gs.space = space
        else:
            self._gs = None
--- a/flaml/tune/searcher/suggestion.py
+++ b/flaml/tune/searcher/suggestion.py
@@ -35,6 +35,73 @@ from ..sample import (
    Quantized,
    Uniform,
 )
+
+# If Ray is installed, flaml.tune may re-export Ray Tune sampling functions.
+# In that case, the search space contains Ray Tune Domain/Sampler objects,
+# which should be accepted by our Optuna search-space conversion.
+try:
+    from ray import __version__ as _ray_version  # type: ignore
+
+    if str(_ray_version).startswith("1."):
+        from ray.tune.sample import (  # type: ignore
+            Categorical as _RayCategorical,
+        )
+        from ray.tune.sample import (
+            Domain as _RayDomain,
+        )
+        from ray.tune.sample import (
+            Float as _RayFloat,
+        )
+        from ray.tune.sample import (
+            Integer as _RayInteger,
+        )
+        from ray.tune.sample import (
+            LogUniform as _RayLogUniform,
+        )
+        from ray.tune.sample import (
+            Quantized as _RayQuantized,
+        )
+        from ray.tune.sample import (
+            Uniform as _RayUniform,
+        )
+    else:
+        from ray.tune.search.sample import (  # type: ignore
+            Categorical as _RayCategorical,
+        )
+        from ray.tune.search.sample import (
+            Domain as _RayDomain,
+        )
+        from ray.tune.search.sample import (
+            Float as _RayFloat,
+        )
+        from ray.tune.search.sample import (
+            Integer as _RayInteger,
+        )
+        from ray.tune.search.sample import (
+            LogUniform as _RayLogUniform,
+        )
+        from ray.tune.search.sample import (
+            Quantized as _RayQuantized,
+        )
+        from ray.tune.search.sample import (
+            Uniform as _RayUniform,
+        )
+
+    _FLOAT_TYPES = (Float, _RayFloat)
+    _INTEGER_TYPES = (Integer, _RayInteger)
+    _CATEGORICAL_TYPES = (Categorical, _RayCategorical)
+    _DOMAIN_TYPES = (Domain, _RayDomain)
+    _QUANTIZED_TYPES = (Quantized, _RayQuantized)
+    _UNIFORM_TYPES = (Uniform, _RayUniform)
+    _LOGUNIFORM_TYPES = (LogUniform, _RayLogUniform)
+except Exception:  # pragma: no cover
+    _FLOAT_TYPES = (Float,)
+    _INTEGER_TYPES = (Integer,)
+    _CATEGORICAL_TYPES = (Categorical,)
+    _DOMAIN_TYPES = (Domain,)
+    _QUANTIZED_TYPES = (Quantized,)
+    _UNIFORM_TYPES = (Uniform,)
+    _LOGUNIFORM_TYPES = (LogUniform,)
 from ..trial import flatten_dict, unflatten_dict
 from .variant_generator import parse_spec_vars

@@ -850,19 +917,22 @@ class OptunaSearch(Searcher):
        def resolve_value(domain: Domain) -> ot.distributions.BaseDistribution:
            quantize = None

-            sampler = domain.get_sampler()
-            if isinstance(sampler, Quantized):
+            # Ray Tune Domains and FLAML Domains both provide get_sampler(), but
+            # fall back to the .sampler attribute for robustness.
+            sampler = domain.get_sampler() if hasattr(domain, "get_sampler") else getattr(domain, "sampler", None)
+
+            if isinstance(sampler, _QUANTIZED_TYPES) or type(sampler).__name__ == "Quantized":
                quantize = sampler.q
-                sampler = sampler.sampler
-                if isinstance(sampler, LogUniform):
+                sampler = getattr(sampler, "sampler", None) or sampler.get_sampler()
+                if isinstance(sampler, _LOGUNIFORM_TYPES) or type(sampler).__name__ == "LogUniform":
                    logger.warning(
                        "Optuna does not handle quantization in loguniform "
                        "sampling. The parameter will be passed but it will "
                        "probably be ignored."
                    )

-            if isinstance(domain, Float):
-                if isinstance(sampler, LogUniform):
+            if isinstance(domain, _FLOAT_TYPES) or type(domain).__name__ == "Float":
+                if isinstance(sampler, _LOGUNIFORM_TYPES) or type(sampler).__name__ == "LogUniform":
                    if quantize:
                        logger.warning(
                            "Optuna does not support both quantization and "
@@ -870,17 +940,17 @@ class OptunaSearch(Searcher):
                        )
                    return ot.distributions.LogUniformDistribution(domain.lower, domain.upper)

-                elif isinstance(sampler, Uniform):
+                elif isinstance(sampler, _UNIFORM_TYPES) or type(sampler).__name__ == "Uniform":
                    if quantize:
                        return ot.distributions.DiscreteUniformDistribution(domain.lower, domain.upper, quantize)
                    return ot.distributions.UniformDistribution(domain.lower, domain.upper)

-            elif isinstance(domain, Integer):
-                if isinstance(sampler, LogUniform):
+            elif isinstance(domain, _INTEGER_TYPES) or type(domain).__name__ == "Integer":
+                if isinstance(sampler, _LOGUNIFORM_TYPES) or type(sampler).__name__ == "LogUniform":
                    # ``step`` argument Deprecated in v2.0.0. ``step`` argument should be 1 in Log Distribution
                    # The removal of this feature is currently scheduled for v4.0.0,
                    return ot.distributions.IntLogUniformDistribution(domain.lower, domain.upper - 1, step=1)
-                elif isinstance(sampler, Uniform):
+                elif isinstance(sampler, _UNIFORM_TYPES) or type(sampler).__name__ == "Uniform":
                    # Upper bound should be inclusive for quantization and
                    # exclusive otherwise
                    return ot.distributions.IntUniformDistribution(
@@ -888,13 +958,13 @@ class OptunaSearch(Searcher):
                        domain.upper - int(bool(not quantize)),
                        step=quantize or 1,
                    )
-            elif isinstance(domain, Categorical):
-                if isinstance(sampler, Uniform):
+            elif isinstance(domain, _CATEGORICAL_TYPES) or type(domain).__name__ == "Categorical":
+                if isinstance(sampler, _UNIFORM_TYPES) or type(sampler).__name__ == "Uniform":
                    return ot.distributions.CategoricalDistribution(domain.categories)

            raise ValueError(
                "Optuna search does not support parameters of type "
-                "`{}` with samplers of type `{}`".format(type(domain).__name__, type(domain.sampler).__name__)
+                "`{}` with samplers of type `{}`".format(type(domain).__name__, type(sampler).__name__)
            )

        # Parameter name is e.g. "a/b/c" for nested dicts
--- a/flaml/version.py
+++ b/flaml/version.py
@@ -1 +1 @@
-__version__ = "2.3.7"
+__version__ = "2.4.0"
--- a/setup.py
+++ b/setup.py
@@ -51,60 +51,59 @@ setuptools.setup(
            "joblib<=1.3.2",
        ],
        "test": [
-            "jupyter",
+            "numpy>=1.17,<2.0.0; python_version<'3.13'",
+            "numpy>2.0.0; python_version>='3.13'",
+            "jupyter; python_version<'3.13'",
            "lightgbm>=2.3.1",
-            "xgboost>=0.90,<2.0.0",
+            "xgboost>=0.90,<2.0.0; python_version<'3.11'",
+            "xgboost>=2.0.0; python_version>='3.11'",
            "scipy>=1.4.1",
            "pandas>=1.1.4,<2.0.0; python_version<'3.10'",
            "pandas>=1.1.4; python_version>='3.10'",
-            "scikit-learn>=1.0.0",
+            "scikit-learn>=1.2.0",
            "thop",
            "pytest>=6.1.1",
+            "pytest-rerunfailures>=13.0",
            "coverage>=5.3",
            "pre-commit",
            "torch",
            "torchvision",
-            "catboost>=0.26,<1.2; python_version<'3.11'",
-            "catboost>=0.26; python_version>='3.11'",
+            "catboost>=0.26; python_version<'3.13'",
            "rgf-python",
            "optuna>=2.8.0,<=3.6.1",
-            "openml",
+            "openml; python_version<'3.13'",
            "statsmodels>=0.12.2",
-            "psutil==5.8.0",
+            "psutil",
            "dataclasses",
-            "transformers[torch]==4.26",
-            "datasets<=3.5.0",
-            "nltk<=3.8.1",  # 3.8.2 doesn't work with mlflow
+            "transformers[torch]",
+            "datasets",
+            "evaluate",
+            "nltk!=3.8.2",  # 3.8.2 doesn't work with mlflow
            "rouge_score",
-            "hcrystalball==0.1.10",
+            "hcrystalball",
            "seqeval",
-            "pytorch-forecasting>=0.9.0,<=0.10.1; python_version<'3.11'",
-            # "pytorch-forecasting==0.10.1; python_version=='3.11'",
-            "mlflow==2.15.1",
+            "pytorch-forecasting; python_version<'3.13'",
+            "mlflow-skinny<=2.22.1",  # Refer to https://mvnrepository.com/artifact/org.mlflow/mlflow-spark
            "joblibspark>=0.5.0",
            "joblib<=1.3.2",
            "nbconvert",
            "nbformat",
            "ipykernel",
-            "pytorch-lightning<1.9.1",  # test_forecast_panel
-            "tensorboardX==2.6",  # test_forecast_panel
-            "requests<2.29.0",  # https://github.com/docker/docker-py/issues/3113
+            "pytorch-lightning",  # test_forecast_panel
+            "tensorboardX",  # test_forecast_panel
+            "requests",  # https://github.com/docker/docker-py/issues/3113
            "packaging",
-            "pydantic==1.10.9",
-            "sympy",
-            "wolframalpha",
            "dill",  # a drop in replacement of pickle
        ],
        "catboost": [
-            "catboost>=0.26,<1.2; python_version<'3.11'",
-            "catboost>=0.26,<=1.2.5; python_version>='3.11'",
+            "catboost>=0.26",
        ],
        "blendsearch": [
            "optuna>=2.8.0,<=3.6.1",
            "packaging",
        ],
        "ray": [
-            "ray[tune]~=1.13",
+            "ray[tune]>=1.13,<2.5.0",
        ],
        "azureml": [
            "azureml-mlflow",
@@ -131,33 +130,21 @@ setuptools.setup(
            "seqeval",
        ],
        "ts_forecast": [
-            "holidays<0.14",  # to prevent installation error for prophet
-            "prophet>=1.0.1",
+            "holidays",
+            "prophet>=1.1.5",
            "statsmodels>=0.12.2",
-            "hcrystalball==0.1.10",
+            "hcrystalball>=0.1.10",
        ],
        "forecast": [
-            "holidays<0.14",  # to prevent installation error for prophet
-            "prophet>=1.0.1",
+            "holidays",
+            "prophet>=1.1.5",
            "statsmodels>=0.12.2",
-            "hcrystalball==0.1.10",
-            "pytorch-forecasting>=0.9.0; python_version<'3.11'",
-            # "pytorch-forecasting==0.10.1; python_version=='3.11'",
-            "pytorch-lightning==1.9.0",
-            "tensorboardX==2.6",
+            "hcrystalball>=0.1.10",
+            "pytorch-forecasting>=0.10.4; python_version<'3.13'",
+            "pytorch-lightning>=1.9.0",
+            "tensorboardX>=2.6",
        ],
        "benchmark": ["catboost>=0.26", "psutil==5.8.0", "xgboost==1.3.3", "pandas==1.1.4"],
-        "openai": ["openai==0.27.8", "diskcache"],
-        "autogen": ["openai==0.27.8", "diskcache", "termcolor"],
-        "mathchat": ["openai==0.27.8", "diskcache", "termcolor", "sympy", "pydantic==1.10.9", "wolframalpha"],
-        "retrievechat": [
-            "openai==0.27.8",
-            "diskcache",
-            "termcolor",
-            "chromadb",
-            "tiktoken",
-            "sentence_transformers",
-        ],
        "synapse": [
            "joblibspark>=0.5.0",
            "optuna>=2.8.0,<=3.6.1",
@@ -170,9 +157,9 @@ setuptools.setup(
        "Operating System :: OS Independent",
        # Specify the Python versions you support here.
        "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
+        "Programming Language :: Python :: 3.12",
    ],
-    python_requires=">=3.9",
+    python_requires=">=3.10",
 )
--- a/test/automl/test_notebook_example.py
+++ b/test/automl/test_notebook_example.py
@@ -1,8 +1,23 @@
 import sys

 import pytest
+
+try:
    from minio.error import ServerError
+except ImportError:
+
+    class ServerError(Exception):
+        pass
+
+
+try:
    from openml.exceptions import OpenMLServerException
+except ImportError:
+
+    class OpenMLServerException(Exception):
+        pass
+
+
 from requests.exceptions import ChunkedEncodingError, SSLError


--- a/test/automl/test_python_log.py
+++ b/test/automl/test_python_log.py
@@ -38,7 +38,7 @@ class TestLogging(unittest.TestCase):
                "keep_search_state": True,
                "learner_selector": "roundrobin",
            }
-            X_train, y_train = fetch_california_housing(return_X_y=True)
+            X_train, y_train = fetch_california_housing(return_X_y=True, data_home="test")
            n = len(y_train) >> 1
            print(automl.model, automl.classes_, automl.predict(X_train))
            automl.fit(
--- a/test/automl/test_regression.py
+++ b/test/automl/test_regression.py
@@ -47,7 +47,7 @@ class TestRegression(unittest.TestCase):
            "n_jobs": 1,
            "model_history": True,
        }
-        X_train, y_train = fetch_california_housing(return_X_y=True)
+        X_train, y_train = fetch_california_housing(return_X_y=True, data_home="test")
        n = int(len(y_train) * 9 // 10)
        automl.fit(X_train=X_train[:n], y_train=y_train[:n], X_val=X_train[n:], y_val=y_train[n:], **automl_settings)
        assert automl._state.eval_method == "holdout"
@@ -141,7 +141,7 @@ class TestRegression(unittest.TestCase):
            "n_concurrent_trials": 10,
            "hpo_method": hpo_method,
        }
-        X_train, y_train = fetch_california_housing(return_X_y=True)
+        X_train, y_train = fetch_california_housing(return_X_y=True, data_home="test")
        try:
            automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
            print(automl_experiment.predict(X_train))
@@ -268,7 +268,7 @@ def test_reproducibility_of_regression_models(estimator: str):
        "skip_transform": True,
        "retrain_full": True,
    }
-    X, y = fetch_california_housing(return_X_y=True, as_frame=True)
+    X, y = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
    automl.fit(X_train=X, y_train=y, **automl_settings)
    best_model = automl.model
    assert best_model is not None
@@ -314,7 +314,7 @@ def test_reproducibility_of_catboost_regression_model():
        "skip_transform": True,
        "retrain_full": True,
    }
-    X, y = fetch_california_housing(return_X_y=True, as_frame=True)
+    X, y = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
    automl.fit(X_train=X, y_train=y, **automl_settings)
    best_model = automl.model
    assert best_model is not None
@@ -360,7 +360,7 @@ def test_reproducibility_of_lgbm_regression_model():
        "skip_transform": True,
        "retrain_full": True,
    }
-    X, y = fetch_california_housing(return_X_y=True, as_frame=True)
+    X, y = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
    automl.fit(X_train=X, y_train=y, **automl_settings)
    best_model = automl.model
    assert best_model is not None
@@ -424,7 +424,7 @@ def test_reproducibility_of_underlying_regression_models(estimator: str):
        "skip_transform": True,
        "retrain_full": False,
    }
-    X, y = fetch_california_housing(return_X_y=True, as_frame=True)
+    X, y = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
    automl.fit(X_train=X, y_train=y, **automl_settings)
    best_model = automl.model
    assert best_model is not None
--- a/test/automl/test_score.py
+++ b/test/automl/test_score.py
@@ -142,7 +142,7 @@ class TestScore:
    def test_regression(self):
        automl_experiment = AutoML()

-        X_train, y_train = fetch_california_housing(return_X_y=True)
+        X_train, y_train = fetch_california_housing(return_X_y=True, data_home="test")
        n = int(len(y_train) * 9 // 10)

        for each_estimator in [
--- a/test/automl/test_training_log.py
+++ b/test/automl/test_training_log.py
@@ -30,7 +30,7 @@ class TestTrainingLog(unittest.TestCase):
                "keep_search_state": True,
                "estimator_list": estimator_list,
            }
-            X_train, y_train = fetch_california_housing(return_X_y=True)
+            X_train, y_train = fetch_california_housing(return_X_y=True, data_home="test")
            automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
            # Check if the training log file is populated.
            self.assertTrue(os.path.exists(filename))
--- a/test/automl/test_warmstart.py
+++ b/test/automl/test_warmstart.py
@@ -108,7 +108,14 @@ class TestWarmStart(unittest.TestCase):

    def test_FLAML_sample_size_in_starting_points(self):
        from minio.error import ServerError
+
+        try:
            from openml.exceptions import OpenMLServerException
+        except ImportError:
+
+            class OpenMLServerException(Exception):
+                pass
+
        from requests.exceptions import ChunkedEncodingError, SSLError

        from flaml import AutoML
--- a/test/cal_housing_py3.pkz
+++ b/test/cal_housing_py3.pkz
--- a/test/check_dependency.py
+++ b/test/check_dependency.py
@@ -0,0 +1,83 @@
+"""Print the installed version of each first-tier dependency and the current commit.
+
+Useful in CI logs, so a test run records which package versions and which
+commit it exercised.
+"""
+
+import re
+import subprocess
+from importlib.metadata import distributions
+
+first_tier_dependencies = [
+    "numpy",
+    "jupyter",
+    "lightgbm",
+    "xgboost",
+    "scipy",
+    "pandas",
+    "scikit-learn",
+    "thop",
+    "pytest",
+    "pytest-rerunfailures",
+    "coverage",
+    "pre-commit",
+    "torch",
+    "torchvision",
+    "catboost",
+    "rgf-python",
+    "optuna",
+    "openml",
+    "statsmodels",
+    "psutil",
+    "dataclasses",
+    "transformers[torch]",
+    "transformers",
+    "datasets",
+    "evaluate",
+    "nltk",
+    "rouge_score",
+    "hcrystalball",
+    "seqeval",
+    "pytorch-forecasting",
+    "mlflow-skinny",
+    "joblibspark",
+    "joblib",
+    "nbconvert",
+    "nbformat",
+    "ipykernel",
+    "pytorch-lightning",
+    "tensorboardX",
+    "requests",
+    "packaging",
+    "dill",
+    "ray",
+    "prophet",
+]
+
+
+def _canonical_name(requirement: str) -> str:
+    """Return the PEP 503 normalized project name of *requirement*, extras dropped."""
+    project = requirement.split("[", 1)[0]
+    return re.sub(r"[-_.]+", "-", project).lower()
+
+
+# Distribution metadata does not spell names consistently (case, ``-`` vs ``_``
+# vs ``.``), and never includes extras such as ``[torch]``, so match on the
+# canonical form instead of the raw string.
+_first_tier_names = {_canonical_name(dep) for dep in first_tier_dependencies}
+
+installed_libs = sorted(f"{dist.metadata['Name']}=={dist.version}" for dist in distributions())
+
+for lib in installed_libs:
+    lib_name = lib.split("==")[0]
+    if _canonical_name(lib_name) in _first_tier_names:
+        print(lib)
+
+# print current commit hash
+try:
+    commit_hash = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
+except (OSError, subprocess.CalledProcessError) as e:
+    # git missing or not inside a work tree: report it instead of aborting the check
+    print(f"Current commit hash: unavailable ({e})")
+else:
+    print(f"Current commit hash: {commit_hash}")
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -2,11 +2,24 @@ from typing import Any, Dict, List, Union

 import numpy as np
 import pandas as pd
-from catboost import CatBoostClassifier, CatBoostRegressor, Pool
+import pytest
 from sklearn.metrics import f1_score, r2_score

+try:
+    from catboost import CatBoostClassifier, CatBoostRegressor, Pool
+except ImportError:  # pragma: no cover
+    CatBoostClassifier = None
+    CatBoostRegressor = None
+    Pool = None

-def evaluate_cv_folds_with_underlying_model(X_train_all, y_train_all, kf, model: Any, task: str) -> pd.DataFrame:
+
+def _is_catboost_model_type(model_type: type) -> bool:  # whether model_type is a CatBoost estimator class
+    if CatBoostClassifier is None or CatBoostRegressor is None:  # catboost import failed: go by module name
+        return getattr(model_type, "__module__", "").startswith("catboost")
+    return model_type is CatBoostClassifier or model_type is CatBoostRegressor
+
+
+def evaluate_cv_folds_with_underlying_model(X_train_all, y_train_all, kf, model: Any, task: str) -> List[float]:
    """Mimic the FLAML CV process to calculate the metrics across each fold.

    :param X_train_all: X training data
@@ -17,7 +30,7 @@ def evaluate_cv_folds_with_underlying_model(X_train_all, y_train_all, kf, model:
    :return: An array containing the metrics
    """
    rng = np.random.RandomState(2020)
-    all_fold_metrics: List[Dict[str, Union[int, float]]] = []
+    all_fold_metrics: List[float] = []
    for train_index, val_index in kf.split(X_train_all, y_train_all):
        X_train_split, y_train_split = X_train_all, y_train_all
        train_index = rng.permutation(train_index)
@@ -25,9 +38,11 @@ def evaluate_cv_folds_with_underlying_model(X_train_all, y_train_all, kf, model:
        X_val = X_train_split.iloc[val_index]
        y_train, y_val = y_train_split[train_index], y_train_split[val_index]
        model_type = type(model)
-        if model_type is not CatBoostClassifier and model_type is not CatBoostRegressor:
+        if not _is_catboost_model_type(model_type):
            model.fit(X_train, y_train)
        else:
+            if Pool is None:
+                pytest.skip("catboost is not installed")
            use_best_model = True
            n = max(int(len(y_train) * 0.9), len(y_train) - 1000) if use_best_model else len(y_train)
            X_tr, y_tr = (X_train)[:n], y_train[:n]
@@ -38,5 +53,5 @@ def evaluate_cv_folds_with_underlying_model(X_train_all, y_train_all, kf, model:
            reproduced_metric = 1 - f1_score(y_val, y_pred_classes)
        else:
            reproduced_metric = 1 - r2_score(y_val, y_pred_classes)
-        all_fold_metrics.append(reproduced_metric)
+        all_fold_metrics.append(float(reproduced_metric))
    return all_fold_metrics
--- a/test/default/test_defaults.py
+++ b/test/default/test_defaults.py
@@ -60,7 +60,7 @@ def test_housing(as_frame=True):
        "starting_points": "data",
        "max_iter": 0,
    }
-    X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=as_frame)
+    X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=as_frame, data_home="test")
    automl.fit(X_train, y_train, **automl_settings)


@@ -115,7 +115,7 @@ def test_suggest_classification():

 def test_suggest_regression():
    location = "test/default"
-    X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True)
+    X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
    suggested = suggest_hyperparams("regression", X_train, y_train, "lgbm", location=location)
    print(suggested)
    suggested = preprocess_and_suggest_hyperparams("regression", X_train, y_train, "xgboost", location=location)
@@ -137,7 +137,7 @@ def test_rf():
    print(rf)

    location = "test/default"
-    X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True)
+    X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
    rf = RandomForestRegressor(default_location=location)
    rf.fit(X_train[:100], y_train[:100])
    rf.predict(X_train)
@@ -155,7 +155,7 @@ def test_extratrees():
    print(classifier)

    location = "test/default"
-    X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True)
+    X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
    regressor = ExtraTreesRegressor(default_location=location)
    regressor.fit(X_train[:100], y_train[:100])
    regressor.predict(X_train)
@@ -175,7 +175,7 @@ def test_lgbm():
    print(classifier.classes_)

    location = "test/default"
-    X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True)
+    X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
    regressor = LGBMRegressor(default_location=location)
    regressor.fit(X_train, y_train)
    regressor.predict(X_train)
@@ -194,7 +194,7 @@ def test_xgboost():
    print(classifier.classes_)

    location = "test/default"
-    X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True)
+    X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
    regressor = XGBRegressor(default_location=location)
    regressor.fit(X_train[:100], y_train[:100])
    regressor.predict(X_train)
--- a/test/nlp/test_default.py
+++ b/test/nlp/test_default.py
@@ -30,21 +30,33 @@ def test_build_portfolio(path="./test/nlp/default", strategy="greedy"):

@pytest.mark.skipif(sys.platform == "win32", reason="do not run on windows")
 def test_starting_point_not_in_search_space():
-    from flaml import AutoML
+    """Regression test for invalid starting points and custom_hp.
+
+    This test must not require network access to Hugging Face.
+    """

    """
        test starting_points located outside of the search space, and custom_hp is not set
    """
+    from flaml.automl.state import SearchState
+    from flaml.automl.task.factory import task_factory
+
    this_estimator_name = "transformer"
-    X_train, y_train, X_val, y_val, _ = get_toy_data_seqclassification()
+    X_train, y_train, _, _, _ = get_toy_data_seqclassification()
+    task = task_factory("seq-classification", X_train, y_train)
+    estimator_class = task.estimator_class_from_str(this_estimator_name)
+    estimator_class.init()

-    automl = AutoML()
-    automl_settings = get_automl_settings(estimator_name=this_estimator_name)
-
-    automl_settings["starting_points"] = {this_estimator_name: [{"learning_rate": 2e-3}]}
-
-    automl.fit(X_train, y_train, **automl_settings)
-    assert automl._search_states[this_estimator_name].init_config[0]["learning_rate"] != 2e-3
+    # SearchState is where invalid starting points are filtered out when max_iter > 1.
+    search_state = SearchState(
+        learner_class=estimator_class,
+        data=X_train,
+        task=task,
+        starting_point={"learning_rate": 2e-3},
+        max_iter=3,
+        budget=10,
+    )
+    assert search_state.init_config and search_state.init_config[0].get("learning_rate") != 2e-3

    """
        test starting_points located outside of the search space, and custom_hp is set
@@ -52,14 +64,14 @@ def test_starting_point_not_in_search_space():

    from flaml import tune

-    X_train, y_train, X_val, y_val, _ = get_toy_data_seqclassification()
+    X_train, y_train, _, _, _ = get_toy_data_seqclassification()

    this_estimator_name = "transformer_ms"
-    automl = AutoML()
-    automl_settings = get_automl_settings(estimator_name=this_estimator_name)
+    task = task_factory("seq-classification", X_train, y_train)
+    estimator_class = task.estimator_class_from_str(this_estimator_name)
+    estimator_class.init()

-    automl_settings["custom_hp"] = {
-        this_estimator_name: {
+    custom_hp = {
        "model_path": {
            "domain": "albert-base-v2",
        },
@@ -70,21 +82,42 @@ def test_starting_point_not_in_search_space():
            "domain": 2,
        },
    }
-    }
-    automl_settings["starting_points"] = "data:test/nlp/default/"

-    automl.fit(X_train, y_train, **automl_settings)
-    assert len(automl._search_states[this_estimator_name].init_config[0]) == len(
-        automl._search_states[this_estimator_name]._search_space_domain
-    ) - len(automl_settings["custom_hp"][this_estimator_name]), (
+    # Simulate a suggested starting point (e.g. from portfolio) which becomes invalid
+    # after custom_hp constrains the space.
+    invalid_starting_points = [
+        {
+            "learning_rate": 1e-5,
+            "num_train_epochs": 1.0,
+            "per_device_train_batch_size": 8,
+            "seed": 43,
+            "global_max_steps": 100,
+            "model_path": "google/electra-base-discriminator",
+        }
+    ]
+
+    search_state = SearchState(
+        learner_class=estimator_class,
+        data=X_train,
+        task=task,
+        starting_point=invalid_starting_points,
+        custom_hp=custom_hp,
+        max_iter=3,
+        budget=10,
+    )
+
+    assert search_state.init_config, "Expected a non-empty init_config list"
+    init_config0 = search_state.init_config[0]
+    assert init_config0 is not None
+    assert len(init_config0) == len(search_state._search_space_domain) - len(custom_hp), (
        "The search space is updated with the custom_hp on {} hyperparameters of "
        "the specified estimator without an initial value. Thus a valid init config "
        "should only contain the cardinality of the search space minus {}".format(
-            len(automl_settings["custom_hp"][this_estimator_name]),
-            len(automl_settings["custom_hp"][this_estimator_name]),
+            len(custom_hp),
+            len(custom_hp),
        )
    )
-    assert automl._search_states[this_estimator_name].search_space["model_path"] == "albert-base-v2"
+    assert search_state.search_space["model_path"] == "albert-base-v2"

    if os.path.exists("test/data/output/"):
        try:
@@ -106,7 +139,13 @@ def test_points_to_evaluate():

    automl_settings["custom_hp"] = {"transformer_ms": {"model_path": {"domain": "google/electra-small-discriminator"}}}

+    try:
        automl.fit(X_train, y_train, **automl_settings)
+    except OSError as e:
+        message = str(e)
+        if "Too Many Requests" in message or "rate limit" in message.lower():
+            pytest.skip(f"Skipping HF model load/training: {message}")
+        raise

    if os.path.exists("test/data/output/"):
        try:
@@ -141,7 +180,14 @@ def test_zero_shot_nomodel():
    fit_kwargs = automl_settings.pop("fit_kwargs_by_estimator", {}).get(estimator_name)
    fit_kwargs.update(automl_settings)
    pop_args(fit_kwargs)
+
+    try:
        model.fit(X_train, y_train, **fit_kwargs)
+    except OSError as e:
+        message = str(e)
+        if "Too Many Requests" in message or "rate limit" in message.lower():
+            pytest.skip(f"Skipping HF model load/training: {message}")
+        raise

    if os.path.exists("test/data/output/"):
        try:
--- a/test/object_store.py
+++ b/test/object_store.py
@@ -7,7 +7,7 @@ from sklearn.model_selection import train_test_split
 from flaml import tune
 from flaml.automl.model import LGBMEstimator

-data = fetch_california_housing(return_X_y=False, as_frame=True)
+data = fetch_california_housing(return_X_y=False, as_frame=True, data_home="test")
 X, y = data.data, data.target
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
 X_train_ref = ray.put(X_train)
--- a/test/reg.py
+++ b/test/reg.py
@@ -11,7 +11,7 @@ automl_settings = {
    "task": "regression",
    "log_file_name": "test/california.log",
 }
-X_train, y_train = fetch_california_housing(return_X_y=True)
+X_train, y_train = fetch_california_housing(return_X_y=True, data_home="test")
 # Train with labeled input data
 automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
 print(automl.model)
--- a/test/spark/test_exceptions.py
+++ b/test/spark/test_exceptions.py
@@ -22,7 +22,7 @@ def base_automl(n_concurrent_trials=1, use_ray=False, use_spark=False, verbose=0
    except (ServerError, Exception):
        from sklearn.datasets import fetch_california_housing

-        X_train, y_train = fetch_california_housing(return_X_y=True)
+        X_train, y_train = fetch_california_housing(return_X_y=True, data_home="test")
    automl = AutoML()
    settings = {
        "time_budget": 3,  # total running time in seconds
--- a/test/spark/test_performance.py
+++ b/test/spark/test_performance.py
@@ -2,8 +2,23 @@ import os
 import sys

 import pytest
+
+try:
    from minio.error import ServerError
+except ImportError:
+
+    class ServerError(Exception):
+        pass
+
+
+try:
    from openml.exceptions import OpenMLServerException
+except ImportError:
+
+    class OpenMLServerException(Exception):
+        pass
+
+
 from requests.exceptions import ChunkedEncodingError, SSLError

 from flaml.tune.spark.utils import check_spark
--- a/test/test_autovw.py
+++ b/test/test_autovw.py
@@ -5,17 +5,38 @@ import sys
 import unittest

 import numpy as np
+
+try:
    import openml
+except ImportError:
+    openml = None
 import pandas as pd
 import pytest
 import scipy.sparse
+
+try:
    from minio.error import ServerError
+except ImportError:
+
+    class ServerError(Exception):
+        pass
+
+
 from requests.exceptions import SSLError
 from sklearn.metrics import mean_absolute_error, mean_squared_error

 from flaml import AutoVW
 from flaml.tune import loguniform, polynomial_expansion_set

+try:
+    from vowpalwabbit import pyvw
+except ImportError:
+    skip_vw_test = True
+else:
+    skip_vw_test = False
+
+pytest.skip("skipping if no openml", allow_module_level=True) if openml is None else None
+
 VW_DS_DIR = "test/data/"
 NS_LIST = list(string.ascii_lowercase) + list(string.ascii_uppercase)
 logger = logging.getLogger(__name__)
@@ -351,14 +372,9 @@ def get_vw_tuning_problem(tuning_hp="NamesapceInteraction"):
    return vw_oml_problem_args, vw_online_aml_problem


-@pytest.mark.skipif(
-    "3.10" in sys.version or "3.11" in sys.version,
-    reason="do not run on py >= 3.10",
-)
+@pytest.mark.skipif(skip_vw_test, reason="vowpalwabbit not installed")
 class TestAutoVW(unittest.TestCase):
    def test_vw_oml_problem_and_vanilla_vw(self):
-        from vowpalwabbit import pyvw
-
        try:
            vw_oml_problem_args, vw_online_aml_problem = get_vw_tuning_problem()
        except (SSLError, ServerError, Exception) as e:
--- a/test/tune_example.py
+++ b/test/tune_example.py
@@ -6,12 +6,12 @@ from sklearn.model_selection import train_test_split
 from flaml import tune
 from flaml.automl.model import LGBMEstimator

-data = fetch_california_housing(return_X_y=False, as_frame=True)
+data = fetch_california_housing(return_X_y=False, as_frame=True, data_home="test")
 df, X, y = data.frame, data.data, data.target
 df_train, _, X_train, X_test, _, y_test = train_test_split(df, X, y, test_size=0.33, random_state=42)
 csv_file_name = "test/housing.csv"
 df_train.to_csv(csv_file_name, index=False)
-# X, y = fetch_california_housing(return_X_y=True, as_frame=True)
+# X, y = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
 # X_train, X_test, y_train, y_test = train_test_split(
 #     X, y, test_size=0.33, random_state=42
 # )