diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 38b98ff3d..9e88a4e8d 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -22,8 +22,12 @@ on:
       - 'setup.py'
   merge_group:
     types: [checks_requested]
+  schedule:
+    # Every other day at 02:00 UTC
+    - cron: '0 2 */2 * *'
 
-permissions: {}
+permissions:
+  contents: write
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
   cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
@@ -36,7 +40,10 @@ jobs:
       fail-fast: false
       matrix:
         os: [ubuntu-latest, macos-latest, windows-latest]
-        python-version: ["3.9", "3.10", "3.11"]
+        python-version: ["3.10", "3.11", "3.12"]
+        exclude:
+          - os: macos-latest
+            python-version: "3.10"
     steps:
       - uses: actions/checkout@v4
       - name: Set up Python ${{ matrix.python-version }}
@@ -44,7 +51,7 @@
         with:
           python-version: ${{ matrix.python-version }}
       - name: On mac, install libomp to facilitate lgbm and xgboost install
-        if: matrix.os == 'macOS-latest'
+        if: matrix.os == 'macos-latest'
        run: |
          brew update
          brew install libomp
@@ -70,33 +77,43 @@
        run: |
          pip install pyspark==3.5.1
          pip list | grep "pyspark"
-      - name: If linux and python<3.11, install ray 2
-        if: matrix.os == 'ubuntu-latest' && matrix.python-version != '3.11'
+      - name: On Ubuntu python 3.12, install pyspark 4.0.1
+        if: matrix.python-version == '3.12' && matrix.os == 'ubuntu-latest'
        run: |
-          pip install "ray[tune]<2.5.0"
-      - name: If mac and python 3.10, install ray and xgboost 1
-        if: matrix.os == 'macOS-latest' && matrix.python-version == '3.10'
-        run: |
-          pip install -e .[ray]
-          # use macOS to test xgboost 1, but macOS also supports xgboost 2
-          pip install "xgboost<2"
-      - name: If linux, install prophet on python < 3.9
-        if: matrix.os == 'ubuntu-latest' && matrix.python-version == '3.8'
+          pip install pyspark==4.0.1
+          pip list | grep "pyspark"
+      # # TODO: support ray
+      # - name: If linux and python<3.11, install ray 2
+      #   if: matrix.os == 'ubuntu-latest' && matrix.python-version < '3.11'
+      #   run: |
+      #     pip install "ray[tune]<2.5.0"
+      - name: Install prophet when on linux
+        if: matrix.os == 'ubuntu-latest'
        run: |
          pip install -e .[forecast]
-      - name: Install vw on python < 3.10
-        if: matrix.python-version == '3.8' || matrix.python-version == '3.9'
+      # TODO: support vw for python 3.10+
+      - name: If linux and python<3.10, install vw
+        if: matrix.os == 'ubuntu-latest' && matrix.python-version < '3.10'
        run: |
          pip install -e .[vw]
+      - name: Pip freeze
+        run: |
+          pip freeze
+      - name: Check dependencies
+        run: |
+          python test/check_dependency.py
+      - name: Clear pip cache
+        run: |
+          pip cache purge
       - name: Test with pytest
         if: matrix.python-version != '3.10'
        run: |
-          pytest test/ --ignore=test/autogen
+          pytest test/ --ignore=test/autogen --reruns 2 --reruns-delay 10
       - name: Coverage
         if: matrix.python-version == '3.10'
        run: |
          pip install coverage
-          coverage run -a -m pytest test --ignore=test/autogen
+          coverage run -a -m pytest test --ignore=test/autogen --reruns 2 --reruns-delay 10
          coverage xml
       - name: Upload coverage to Codecov
         if: matrix.python-version == '3.10'
@@ -104,28 +121,25 @@
        with:
          file: ./coverage.xml
          flags: unittests
+      - name: Save dependencies
+        shell: bash
+        run: |
+          git config --global user.name 'github-actions[bot]'
+          git config --global user.email 'github-actions[bot]@users.noreply.github.com'
+          git config advice.addIgnoredFile false
-  # docs:
+          BRANCH=unit-tests-installed-dependencies
+          git fetch origin
+          git checkout -B "$BRANCH"
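+          # If the branch already exists on the remote, rebase onto it so the push below fast-forwards.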
+          if git show-ref --verify --quiet "refs/remotes/origin/$BRANCH"; then
+            git rebase "origin/$BRANCH"
+          fi
-  # runs-on: ubuntu-latest
-
-  # steps:
-  #  - uses: actions/checkout@v3
-  #  - name: Setup Python
-  #    uses: actions/setup-python@v4
-  #    with:
-  #      python-version: '3.8'
-  #  - name: Compile documentation
-  #    run: |
-  #      pip install -e .
-  #      python -m pip install sphinx sphinx_rtd_theme
-  #      cd docs
-  #      make html
-  #  - name: Deploy to GitHub pages
-  #    if: ${{ github.ref == 'refs/heads/main' }}
-  #    uses: JamesIves/github-pages-deploy-action@3.6.2
-  #    with:
-  #      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-  #      BRANCH: gh-pages
-  #      FOLDER: docs/_build/html
-  #      CLEAN: true
+          pip freeze > installed_all_dependencies_${{ matrix.python-version }}_${{ matrix.os }}.txt
+          python test/check_dependency.py > installed_first_tier_dependencies_${{ matrix.python-version }}_${{ matrix.os }}.txt
+          git add installed_*dependencies*.txt
+          mv coverage.xml ./coverage_${{ matrix.python-version }}_${{ matrix.os }}.xml || true
+          git add -f ./coverage_${{ matrix.python-version }}_${{ matrix.os }}.xml || true
+          git commit -m "Update installed dependencies for Python ${{ matrix.python-version }} on ${{ matrix.os }}" || exit 0
+          git push origin "$BRANCH"
diff --git a/.gitignore b/.gitignore
index 8a3365b20..18c858dad 100644
--- a/.gitignore
+++ b/.gitignore
@@ -172,7 +172,7 @@ test/default
 test/housing.json
 test/nlp/default/transformer_ms/seq-classification.json
 
-flaml/fabric/fanova/_fanova.c
+flaml/fabric/fanova/*fanova.c
 
 # local config files
 *.config.local
@@ -184,3 +184,7 @@ notebook/lightning_logs/
 lightning_logs/
 flaml/autogen/extensions/tmp/
 test/autogen/my_tmp/
+catboost_*
+
+# Internal configs
+.pypirc
diff --git a/README.md b/README.md
index fcd62bd42..30a2b1457 100644
--- a/README.md
+++ b/README.md
@@ -14,15 +14,9 @@

-:fire: FLAML supports AutoML and Hyperparameter Tuning in [Microsoft Fabric Data Science](https://learn.microsoft.com/en-us/fabric/data-science/automated-machine-learning-fabric). In addition, we've introduced Python 3.11 support, along with a range of new estimators, and comprehensive integration with MLflow—thanks to contributions from the Microsoft Fabric product team.
+:fire: FLAML supports AutoML and Hyperparameter Tuning in [Microsoft Fabric Data Science](https://learn.microsoft.com/en-us/fabric/data-science/automated-machine-learning-fabric). In addition, we've introduced Python 3.11 and 3.12 support, along with a range of new estimators, and comprehensive integration with MLflow—thanks to contributions from the Microsoft Fabric product team.
 
-:fire: Heads-up: We have migrated [AutoGen](https://microsoft.github.io/autogen/) into a dedicated [github repository](https://github.com/microsoft/autogen). Alongside this move, we have also launched a dedicated [Discord](https://discord.gg/pAbnFJrkgZ) server and a [website](https://microsoft.github.io/autogen/) for comprehensive documentation.
-
-:fire: The automated multi-agent chat framework in [AutoGen](https://microsoft.github.io/autogen/) is in preview from v2.0.0.
-
-:fire: FLAML is highlighted in OpenAI's [cookbook](https://github.com/openai/openai-cookbook#related-resources-from-around-the-web).
-
-:fire: [autogen](https://microsoft.github.io/autogen/) is released with support for ChatGPT and GPT-4, based on [Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference](https://arxiv.org/abs/2303.04673).
+:fire: Heads-up: [AutoGen](https://microsoft.github.io/autogen/) has moved to a dedicated [GitHub repository](https://github.com/microsoft/autogen). FLAML no longer includes the `autogen` module—please use AutoGen directly.
 
 ## What is FLAML
@@ -30,7 +24,7 @@
 FLAML is a lightweight Python library for efficient automation of machine learning and AI operations. It automates workflow based on large language models, machine learning models, etc. and optimizes their performance.
 
-- FLAML enables building next-gen GPT-X applications based on multi-agent conversations with minimal effort. It simplifies the orchestration, automation and optimization of a complex GPT-X workflow. It maximizes the performance of GPT-X models and augments their weakness.
+- FLAML enables economical automation and tuning for ML/AI workflows, including model selection and hyperparameter optimization under resource constraints.
 - For common machine learning tasks like classification and regression, it quickly finds quality models for user-provided data with low computational resources. It is easy to customize or extend. Users can find their desired customizability from a smooth range.
 - It supports fast and economical automatic tuning (e.g., inference hyperparameters for foundation models, configurations in MLOps/LMOps workflows, pipelines, mathematical/statistical models, algorithms, computing experiments, software configurations), capable of handling large search space with heterogeneous evaluation cost and complex constraints/guidance/early stopping.
@@ -46,10 +40,10 @@
-FLAML requires **Python version >= 3.9**. It can be installed from pip:
+FLAML requires **Python version >= 3.10**. It can be installed from pip:
 
 ```bash
 pip install flaml
 ```
 
-Minimal dependencies are installed without extra options. You can install extra options based on the feature you need. For example, use the following to install the dependencies needed by the [`autogen`](https://microsoft.github.io/autogen/) package.
+Minimal dependencies are installed without extra options. You can install extra options based on the feature you need. For example, use the following to install the dependencies needed by the [`automl`](https://microsoft.github.io/FLAML/docs/Use-Cases/Task-Oriented-AutoML) module. ```bash -pip install "flaml[autogen]" +pip install "flaml[automl]" ``` Find more options in [Installation](https://microsoft.github.io/FLAML/docs/Installation). @@ -57,39 +51,6 @@ Each of the [`notebook examples`](https://github.com/microsoft/FLAML/tree/main/n ## Quickstart -- (New) The [autogen](https://microsoft.github.io/autogen/) package enables the next-gen GPT-X applications with a generic multi-agent conversation framework. - It offers customizable and conversable agents which integrate LLMs, tools and human. - By automating chat among multiple capable agents, one can easily make them collectively perform tasks autonomously or with human feedback, including tasks that require using tools via code. For example, - -```python -from flaml import autogen - -assistant = autogen.AssistantAgent("assistant") -user_proxy = autogen.UserProxyAgent("user_proxy") -user_proxy.initiate_chat( - assistant, - message="Show me the YTD gain of 10 largest technology companies as of today.", -) -# This initiates an automated chat between the two agents to solve the task -``` - -Autogen also helps maximize the utility out of the expensive LLMs such as ChatGPT and GPT-4. It offers a drop-in replacement of `openai.Completion` or `openai.ChatCompletion` with powerful functionalites like tuning, caching, templating, filtering. For example, you can optimize generations by LLM with your own tuning data, success metrics and budgets. - -```python -# perform tuning -config, analysis = autogen.Completion.tune( - data=tune_data, - metric="success", - mode="max", - eval_func=eval_func, - inference_budget=0.05, - optimization_budget=3, - num_samples=-1, -) -# perform inference for a test instance -response = autogen.Completion.create(context=test_instance, **config) -``` - - With three lines of code, you can start using this economical and fast AutoML engine as a [scikit-learn style estimator](https://microsoft.github.io/FLAML/docs/Use-Cases/Task-Oriented-AutoML). diff --git a/flaml/automl/automl.py b/flaml/automl/automl.py index 1f6fdadc5..c4c6e2dbb 100644 --- a/flaml/automl/automl.py +++ b/flaml/automl/automl.py @@ -401,6 +401,24 @@ class AutoML(BaseEstimator): self._estimator_type = "classifier" if settings["task"] in CLASSIFICATION else "regressor" self.best_run_id = None + def __getstate__(self): + """Customize pickling to avoid serializing runtime-only objects. + + MLflow's sklearn flavor serializes estimators via (cloud)pickle. During + AutoML fitting we may attach an internal mlflow integration instance + which holds `concurrent.futures.Future` objects and executors containing + thread locks, which are not picklable. + """ + + state = self.__dict__.copy() + state.pop("mlflow_integration", None) + return state + + def __setstate__(self, state): + self.__dict__.update(state) + # Ensure attribute exists post-unpickle. 
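+        # fit() may attach a fresh integration again when MLflow logging is enabled.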
+        self.mlflow_integration = None
+
     def get_params(self, deep: bool = False) -> dict:
         return self._settings.copy()
diff --git a/flaml/automl/data.py b/flaml/automl/data.py
index 4c473963f..096ba46d9 100644
--- a/flaml/automl/data.py
+++ b/flaml/automl/data.py
@@ -50,7 +50,10 @@ def load_openml_dataset(dataset_id, data_dir=None, random_state=0, dataset_forma
     """
     import pickle
 
-    import openml
+    try:
+        import openml
+    except ImportError:
+        openml = None
     from sklearn.model_selection import train_test_split
 
     filename = "openml_ds" + str(dataset_id) + ".pkl"
@@ -61,15 +64,16 @@
             dataset = pickle.load(f)
     else:
         print("download dataset from openml")
-        dataset = openml.datasets.get_dataset(dataset_id)
+        dataset = openml.datasets.get_dataset(dataset_id) if openml else None
         if not os.path.exists(data_dir):
             os.makedirs(data_dir)
         with open(filepath, "wb") as f:
             pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
-    print("Dataset name:", dataset.name)
+    if dataset:
+        print("Dataset name:", dataset.name)
     try:
         X, y, *__ = dataset.get_data(target=dataset.default_target_attribute, dataset_format=dataset_format)
-    except ValueError:
+    except (ValueError, AttributeError, TypeError):
         from sklearn.datasets import fetch_openml
 
         X, y = fetch_openml(data_id=dataset_id, return_X_y=True)
diff --git a/flaml/automl/ml.py b/flaml/automl/ml.py
index bd13d8259..8ba05120e 100644
--- a/flaml/automl/ml.py
+++ b/flaml/automl/ml.py
@@ -127,9 +127,17 @@
         import datasets
 
         datasets_metric_name = huggingface_submetric_to_metric.get(metric_name, metric_name.split(":")[0])
-        metric = datasets.load_metric(datasets_metric_name, trust_remote_code=True)
         metric_mode = huggingface_metric_to_mode[datasets_metric_name]
 
+        # datasets>=3 removed load_metric; prefer evaluate if available
+        try:
+            import evaluate
+
+            metric = evaluate.load(datasets_metric_name, trust_remote_code=True)
+        except Exception:
+            # older datasets (<3.0) still provide load_metric
+            metric = datasets.load_metric(datasets_metric_name, trust_remote_code=True)
+
         if metric_name.startswith("seqeval"):
             y_processed_true = [[labels[tr] for tr in each_list] for each_list in y_processed_true]
         elif metric in ("pearsonr", "spearmanr"):
diff --git a/flaml/automl/model.py b/flaml/automl/model.py
index 298f3cab8..53a92ece2 100644
--- a/flaml/automl/model.py
+++ b/flaml/automl/model.py
@@ -111,7 +111,7 @@ def limit_resource(memory_limit, time_limit):
         pass
 
 
-class BaseEstimator:
+class BaseEstimator(sklearn.base.ClassifierMixin, sklearn.base.BaseEstimator):
     """The abstract class for all learners.
 
     Typical examples:
diff --git a/flaml/automl/nlp/huggingface/training_args.py b/flaml/automl/nlp/huggingface/training_args.py
index 6a408b1a2..383fc9f39 100644
--- a/flaml/automl/nlp/huggingface/training_args.py
+++ b/flaml/automl/nlp/huggingface/training_args.py
@@ -77,6 +77,15 @@ class TrainingArgumentsForAuto(TrainingArguments):
 
     logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."})
 
+    # Newer versions of HuggingFace Transformers may access `TrainingArguments.generation_config`
+    # (e.g., in generation-aware trainers/callbacks). Keep this attribute to remain compatible
+    # while defaulting to None for non-generation tasks.
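+    # A plain Optional[object] annotation (rather than transformers.GenerationConfig) keeps this
+    # module importable on transformers versions that predate that class.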
+ generation_config: Optional[object] = field( + default=None, + metadata={"help": "Optional generation config (or path) used by generation-aware trainers."}, + ) + @staticmethod def load_args_from_console(): from dataclasses import fields diff --git a/flaml/automl/time_series/tft.py b/flaml/automl/time_series/tft.py index c9ab30be1..fb660ce43 100644 --- a/flaml/automl/time_series/tft.py +++ b/flaml/automl/time_series/tft.py @@ -1,3 +1,4 @@ +import inspect import time try: @@ -106,12 +107,17 @@ class TemporalFusionTransformerEstimator(TimeSeriesEstimator): def fit(self, X_train, y_train, budget=None, **kwargs): import warnings - import pytorch_lightning as pl + try: + import lightning.pytorch as pl + from lightning.pytorch.callbacks import EarlyStopping, LearningRateMonitor + from lightning.pytorch.loggers import TensorBoardLogger + except ImportError: + import pytorch_lightning as pl + from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor + from pytorch_lightning.loggers import TensorBoardLogger import torch from pytorch_forecasting import TemporalFusionTransformer from pytorch_forecasting.metrics import QuantileLoss - from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor - from pytorch_lightning.loggers import TensorBoardLogger # a bit of monkey patching to fix the MacOS test # all the log_prediction method appears to do is plot stuff, which ?breaks github tests @@ -132,12 +138,26 @@ class TemporalFusionTransformerEstimator(TimeSeriesEstimator): lr_logger = LearningRateMonitor() # log the learning rate logger = TensorBoardLogger(kwargs.get("log_dir", "lightning_logs")) # logging results to a tensorboard default_trainer_kwargs = dict( - gpus=self._kwargs.get("gpu_per_trial", [0]) if torch.cuda.is_available() else None, max_epochs=max_epochs, gradient_clip_val=gradient_clip_val, callbacks=[lr_logger, early_stop_callback], logger=logger, ) + + # PyTorch Lightning >=2.0 replaced `gpus` with `accelerator`/`devices`. + # Also, passing `gpus=None` is not accepted on newer versions. + trainer_sig_params = inspect.signature(pl.Trainer.__init__).parameters + if torch.cuda.is_available() and "gpus" in trainer_sig_params: + gpus = self._kwargs.get("gpu_per_trial", None) + if gpus is not None: + default_trainer_kwargs["gpus"] = gpus + elif torch.cuda.is_available() and "devices" in trainer_sig_params: + devices = self._kwargs.get("gpu_per_trial", None) + if devices == -1: + devices = "auto" + if devices is not None: + default_trainer_kwargs["accelerator"] = "gpu" + default_trainer_kwargs["devices"] = devices trainer = pl.Trainer( **default_trainer_kwargs, ) @@ -157,7 +177,14 @@ class TemporalFusionTransformerEstimator(TimeSeriesEstimator): val_dataloaders=val_dataloader, ) best_model_path = trainer.checkpoint_callback.best_model_path - best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path) + # PyTorch 2.6 changed `torch.load` default `weights_only` from False -> True. + # Some Lightning checkpoints (including those produced here) can require full unpickling. + # This path is generated locally during training, so it's trusted. 
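+        # Only pass weights_only when the installed load_from_checkpoint signature accepts it.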
+ load_sig_params = inspect.signature(TemporalFusionTransformer.load_from_checkpoint).parameters + if "weights_only" in load_sig_params: + best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path, weights_only=False) + else: + best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path) train_time = time.time() - current_time self._model = best_tft return train_time diff --git a/flaml/automl/time_series/ts_data.py b/flaml/automl/time_series/ts_data.py index 2587a70e7..0c7d25558 100644 --- a/flaml/automl/time_series/ts_data.py +++ b/flaml/automl/time_series/ts_data.py @@ -9,6 +9,7 @@ import numpy as np try: import pandas as pd from pandas import DataFrame, Series, to_datetime + from pandas.api.types import is_datetime64_any_dtype from scipy.sparse import issparse from sklearn.compose import ColumnTransformer from sklearn.impute import SimpleImputer @@ -392,6 +393,15 @@ class DataTransformerTS: assert len(self.num_columns) == 0, "Trying to call fit() twice, something is wrong" for column in X.columns: + # Never treat the time column as a feature for sklearn preprocessing + if column == self.time_col: + continue + + # Robust datetime detection (covers datetime64[ms/us/ns], tz-aware, etc.) + if is_datetime64_any_dtype(X[column]): + self.datetime_columns.append(column) + continue + # sklearn/utils/validation.py needs int/float values if X[column].dtype.name in ("object", "category", "string"): if ( diff --git a/flaml/tune/searcher/blendsearch.py b/flaml/tune/searcher/blendsearch.py index 8cd8f52e3..c76a9a162 100644 --- a/flaml/tune/searcher/blendsearch.py +++ b/flaml/tune/searcher/blendsearch.py @@ -244,13 +244,32 @@ class BlendSearch(Searcher): evaluated_rewards=evaluated_rewards, ) except (AssertionError, ValueError): - self._gs = GlobalSearch( - space=gs_space, - metric=metric, - mode=mode, - seed=gs_seed, - sampler=sampler, - ) + try: + self._gs = GlobalSearch( + space=gs_space, + metric=metric, + mode=mode, + seed=gs_seed, + sampler=sampler, + ) + except ValueError: + # Ray Tune's OptunaSearch converts Tune domains into Optuna + # distributions. Optuna disallows integer log distributions + # with step != 1 (e.g., qlograndint with q>1), which can + # raise here. Fall back to FLAML's OptunaSearch wrapper, + # which handles these spaces more permissively. + if getattr(GlobalSearch, "__module__", "").startswith("ray.tune"): + from .suggestion import OptunaSearch as _FallbackOptunaSearch + + self._gs = _FallbackOptunaSearch( + space=gs_space, + metric=metric, + mode=mode, + seed=gs_seed, + sampler=sampler, + ) + else: + raise self._gs.space = space else: self._gs = None diff --git a/flaml/tune/searcher/suggestion.py b/flaml/tune/searcher/suggestion.py index 614266892..552f65a66 100644 --- a/flaml/tune/searcher/suggestion.py +++ b/flaml/tune/searcher/suggestion.py @@ -35,6 +35,73 @@ from ..sample import ( Quantized, Uniform, ) + +# If Ray is installed, flaml.tune may re-export Ray Tune sampling functions. +# In that case, the search space contains Ray Tune Domain/Sampler objects, +# which should be accepted by our Optuna search-space conversion. 
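+# The *_TYPES tuples below let isinstance() checks accept either implementation;
+# when Ray is not installed, the except branch falls back to the FLAML-only types.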
+try:
+    from ray import __version__ as _ray_version  # type: ignore
+
+    if str(_ray_version).startswith("1."):
+        from ray.tune.sample import (  # type: ignore
+            Categorical as _RayCategorical,
+            Domain as _RayDomain,
+            Float as _RayFloat,
+            Integer as _RayInteger,
+            LogUniform as _RayLogUniform,
+            Quantized as _RayQuantized,
+            Uniform as _RayUniform,
+        )
+    else:
+        from ray.tune.search.sample import (  # type: ignore
+            Categorical as _RayCategorical,
+            Domain as _RayDomain,
+            Float as _RayFloat,
+            Integer as _RayInteger,
+            LogUniform as _RayLogUniform,
+            Quantized as _RayQuantized,
+            Uniform as _RayUniform,
+        )
+
+    _FLOAT_TYPES = (Float, _RayFloat)
+    _INTEGER_TYPES = (Integer, _RayInteger)
+    _CATEGORICAL_TYPES = (Categorical, _RayCategorical)
+    _DOMAIN_TYPES = (Domain, _RayDomain)
+    _QUANTIZED_TYPES = (Quantized, _RayQuantized)
+    _UNIFORM_TYPES = (Uniform, _RayUniform)
+    _LOGUNIFORM_TYPES = (LogUniform, _RayLogUniform)
+except Exception:  # pragma: no cover
+    _FLOAT_TYPES = (Float,)
+    _INTEGER_TYPES = (Integer,)
+    _CATEGORICAL_TYPES = (Categorical,)
+    _DOMAIN_TYPES = (Domain,)
+    _QUANTIZED_TYPES = (Quantized,)
+    _UNIFORM_TYPES = (Uniform,)
+    _LOGUNIFORM_TYPES = (LogUniform,)
 from ..trial import flatten_dict, unflatten_dict
 from .variant_generator import parse_spec_vars
@@ -850,19 +917,22 @@ class OptunaSearch(Searcher):
     def resolve_value(domain: Domain) -> ot.distributions.BaseDistribution:
         quantize = None
-        sampler = domain.get_sampler()
-        if isinstance(sampler, Quantized):
+        # Ray Tune Domains and FLAML Domains both provide get_sampler(), but
+        # fall back to the .sampler attribute for robustness.
+        sampler = domain.get_sampler() if hasattr(domain, "get_sampler") else getattr(domain, "sampler", None)
+
+        if isinstance(sampler, _QUANTIZED_TYPES) or type(sampler).__name__ == "Quantized":
             quantize = sampler.q
-            sampler = sampler.sampler
-            if isinstance(sampler, LogUniform):
+            sampler = getattr(sampler, "sampler", None) or sampler.get_sampler()
+            if isinstance(sampler, _LOGUNIFORM_TYPES) or type(sampler).__name__ == "LogUniform":
                 logger.warning(
                     "Optuna does not handle quantization in loguniform "
                     "sampling. The parameter will be passed but it will "
                     "probably be ignored."
) - if isinstance(domain, Float): - if isinstance(sampler, LogUniform): + if isinstance(domain, _FLOAT_TYPES) or type(domain).__name__ == "Float": + if isinstance(sampler, _LOGUNIFORM_TYPES) or type(sampler).__name__ == "LogUniform": if quantize: logger.warning( "Optuna does not support both quantization and " @@ -870,17 +940,17 @@ class OptunaSearch(Searcher): ) return ot.distributions.LogUniformDistribution(domain.lower, domain.upper) - elif isinstance(sampler, Uniform): + elif isinstance(sampler, _UNIFORM_TYPES) or type(sampler).__name__ == "Uniform": if quantize: return ot.distributions.DiscreteUniformDistribution(domain.lower, domain.upper, quantize) return ot.distributions.UniformDistribution(domain.lower, domain.upper) - elif isinstance(domain, Integer): - if isinstance(sampler, LogUniform): + elif isinstance(domain, _INTEGER_TYPES) or type(domain).__name__ == "Integer": + if isinstance(sampler, _LOGUNIFORM_TYPES) or type(sampler).__name__ == "LogUniform": # ``step`` argument Deprecated in v2.0.0. ``step`` argument should be 1 in Log Distribution # The removal of this feature is currently scheduled for v4.0.0, return ot.distributions.IntLogUniformDistribution(domain.lower, domain.upper - 1, step=1) - elif isinstance(sampler, Uniform): + elif isinstance(sampler, _UNIFORM_TYPES) or type(sampler).__name__ == "Uniform": # Upper bound should be inclusive for quantization and # exclusive otherwise return ot.distributions.IntUniformDistribution( @@ -888,16 +958,16 @@ class OptunaSearch(Searcher): domain.upper - int(bool(not quantize)), step=quantize or 1, ) - elif isinstance(domain, Categorical): - if isinstance(sampler, Uniform): + elif isinstance(domain, _CATEGORICAL_TYPES) or type(domain).__name__ == "Categorical": + if isinstance(sampler, _UNIFORM_TYPES) or type(sampler).__name__ == "Uniform": return ot.distributions.CategoricalDistribution(domain.categories) raise ValueError( "Optuna search does not support parameters of type " - "`{}` with samplers of type `{}`".format(type(domain).__name__, type(domain.sampler).__name__) + "`{}` with samplers of type `{}`".format(type(domain).__name__, type(sampler).__name__) ) # Parameter name is e.g. 
"a/b/c" for nested dicts values = {"/".join(path): resolve_value(domain) for path, domain in domain_vars} - return values + return values diff --git a/flaml/version.py b/flaml/version.py index a0b06b867..3d67cd6bb 100644 --- a/flaml/version.py +++ b/flaml/version.py @@ -1 +1 @@ -__version__ = "2.3.7" +__version__ = "2.4.0" diff --git a/setup.py b/setup.py index 31cc56372..9b44b7814 100644 --- a/setup.py +++ b/setup.py @@ -51,60 +51,59 @@ setuptools.setup( "joblib<=1.3.2", ], "test": [ - "jupyter", + "numpy>=1.17,<2.0.0; python_version<'3.13'", + "numpy>2.0.0; python_version>='3.13'", + "jupyter; python_version<'3.13'", "lightgbm>=2.3.1", - "xgboost>=0.90,<2.0.0", + "xgboost>=0.90,<2.0.0; python_version<'3.11'", + "xgboost>=2.0.0; python_version>='3.11'", "scipy>=1.4.1", "pandas>=1.1.4,<2.0.0; python_version<'3.10'", "pandas>=1.1.4; python_version>='3.10'", - "scikit-learn>=1.0.0", + "scikit-learn>=1.2.0", "thop", "pytest>=6.1.1", + "pytest-rerunfailures>=13.0", "coverage>=5.3", "pre-commit", "torch", "torchvision", - "catboost>=0.26,<1.2; python_version<'3.11'", - "catboost>=0.26; python_version>='3.11'", + "catboost>=0.26; python_version<'3.13'", "rgf-python", "optuna>=2.8.0,<=3.6.1", - "openml", + "openml; python_version<'3.13'", "statsmodels>=0.12.2", - "psutil==5.8.0", + "psutil", "dataclasses", - "transformers[torch]==4.26", - "datasets<=3.5.0", - "nltk<=3.8.1", # 3.8.2 doesn't work with mlflow + "transformers[torch]", + "datasets", + "evaluate", + "nltk!=3.8.2", # 3.8.2 doesn't work with mlflow "rouge_score", - "hcrystalball==0.1.10", + "hcrystalball", "seqeval", - "pytorch-forecasting>=0.9.0,<=0.10.1; python_version<'3.11'", - # "pytorch-forecasting==0.10.1; python_version=='3.11'", - "mlflow==2.15.1", + "pytorch-forecasting; python_version<'3.13'", + "mlflow-skinny<=2.22.1", # Refer to https://mvnrepository.com/artifact/org.mlflow/mlflow-spark "joblibspark>=0.5.0", "joblib<=1.3.2", "nbconvert", "nbformat", "ipykernel", - "pytorch-lightning<1.9.1", # test_forecast_panel - "tensorboardX==2.6", # test_forecast_panel - "requests<2.29.0", # https://github.com/docker/docker-py/issues/3113 + "pytorch-lightning", # test_forecast_panel + "tensorboardX", # test_forecast_panel + "requests", # https://github.com/docker/docker-py/issues/3113 "packaging", - "pydantic==1.10.9", - "sympy", - "wolframalpha", "dill", # a drop in replacement of pickle ], "catboost": [ - "catboost>=0.26,<1.2; python_version<'3.11'", - "catboost>=0.26,<=1.2.5; python_version>='3.11'", + "catboost>=0.26", ], "blendsearch": [ "optuna>=2.8.0,<=3.6.1", "packaging", ], "ray": [ - "ray[tune]~=1.13", + "ray[tune]>=1.13,<2.5.0", ], "azureml": [ "azureml-mlflow", @@ -131,33 +130,21 @@ setuptools.setup( "seqeval", ], "ts_forecast": [ - "holidays<0.14", # to prevent installation error for prophet - "prophet>=1.0.1", + "holidays", + "prophet>=1.1.5", "statsmodels>=0.12.2", - "hcrystalball==0.1.10", + "hcrystalball>=0.1.10", ], "forecast": [ - "holidays<0.14", # to prevent installation error for prophet - "prophet>=1.0.1", + "holidays", + "prophet>=1.1.5", "statsmodels>=0.12.2", - "hcrystalball==0.1.10", - "pytorch-forecasting>=0.9.0; python_version<'3.11'", - # "pytorch-forecasting==0.10.1; python_version=='3.11'", - "pytorch-lightning==1.9.0", - "tensorboardX==2.6", + "hcrystalball>=0.1.10", + "pytorch-forecasting>=0.10.4; python_version<'3.13'", + "pytorch-lightning>=1.9.0", + "tensorboardX>=2.6", ], "benchmark": ["catboost>=0.26", "psutil==5.8.0", "xgboost==1.3.3", "pandas==1.1.4"], - "openai": ["openai==0.27.8", "diskcache"], 
- "autogen": ["openai==0.27.8", "diskcache", "termcolor"], - "mathchat": ["openai==0.27.8", "diskcache", "termcolor", "sympy", "pydantic==1.10.9", "wolframalpha"], - "retrievechat": [ - "openai==0.27.8", - "diskcache", - "termcolor", - "chromadb", - "tiktoken", - "sentence_transformers", - ], "synapse": [ "joblibspark>=0.5.0", "optuna>=2.8.0,<=3.6.1", @@ -170,9 +157,9 @@ setuptools.setup( "Operating System :: OS Independent", # Specify the Python versions you support here. "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", ], - python_requires=">=3.9", + python_requires=">=3.10", ) diff --git a/test/automl/test_notebook_example.py b/test/automl/test_notebook_example.py index 536ef0484..b4558f109 100644 --- a/test/automl/test_notebook_example.py +++ b/test/automl/test_notebook_example.py @@ -1,8 +1,23 @@ import sys import pytest -from minio.error import ServerError -from openml.exceptions import OpenMLServerException + +try: + from minio.error import ServerError +except ImportError: + + class ServerError(Exception): + pass + + +try: + from openml.exceptions import OpenMLServerException +except ImportError: + + class OpenMLServerException(Exception): + pass + + from requests.exceptions import ChunkedEncodingError, SSLError diff --git a/test/automl/test_python_log.py b/test/automl/test_python_log.py index e18e33634..b3f141780 100644 --- a/test/automl/test_python_log.py +++ b/test/automl/test_python_log.py @@ -38,7 +38,7 @@ class TestLogging(unittest.TestCase): "keep_search_state": True, "learner_selector": "roundrobin", } - X_train, y_train = fetch_california_housing(return_X_y=True) + X_train, y_train = fetch_california_housing(return_X_y=True, data_home="test") n = len(y_train) >> 1 print(automl.model, automl.classes_, automl.predict(X_train)) automl.fit( diff --git a/test/automl/test_regression.py b/test/automl/test_regression.py index 892ad1ece..daa5f3830 100644 --- a/test/automl/test_regression.py +++ b/test/automl/test_regression.py @@ -47,7 +47,7 @@ class TestRegression(unittest.TestCase): "n_jobs": 1, "model_history": True, } - X_train, y_train = fetch_california_housing(return_X_y=True) + X_train, y_train = fetch_california_housing(return_X_y=True, data_home="test") n = int(len(y_train) * 9 // 10) automl.fit(X_train=X_train[:n], y_train=y_train[:n], X_val=X_train[n:], y_val=y_train[n:], **automl_settings) assert automl._state.eval_method == "holdout" @@ -141,7 +141,7 @@ class TestRegression(unittest.TestCase): "n_concurrent_trials": 10, "hpo_method": hpo_method, } - X_train, y_train = fetch_california_housing(return_X_y=True) + X_train, y_train = fetch_california_housing(return_X_y=True, data_home="test") try: automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) print(automl_experiment.predict(X_train)) @@ -268,7 +268,7 @@ def test_reproducibility_of_regression_models(estimator: str): "skip_transform": True, "retrain_full": True, } - X, y = fetch_california_housing(return_X_y=True, as_frame=True) + X, y = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test") automl.fit(X_train=X, y_train=y, **automl_settings) best_model = automl.model assert best_model is not None @@ -314,7 +314,7 @@ def test_reproducibility_of_catboost_regression_model(): "skip_transform": True, "retrain_full": True, } - X, y = fetch_california_housing(return_X_y=True, as_frame=True) + X, y = 
fetch_california_housing(return_X_y=True, as_frame=True, data_home="test") automl.fit(X_train=X, y_train=y, **automl_settings) best_model = automl.model assert best_model is not None @@ -360,7 +360,7 @@ def test_reproducibility_of_lgbm_regression_model(): "skip_transform": True, "retrain_full": True, } - X, y = fetch_california_housing(return_X_y=True, as_frame=True) + X, y = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test") automl.fit(X_train=X, y_train=y, **automl_settings) best_model = automl.model assert best_model is not None @@ -424,7 +424,7 @@ def test_reproducibility_of_underlying_regression_models(estimator: str): "skip_transform": True, "retrain_full": False, } - X, y = fetch_california_housing(return_X_y=True, as_frame=True) + X, y = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test") automl.fit(X_train=X, y_train=y, **automl_settings) best_model = automl.model assert best_model is not None diff --git a/test/automl/test_score.py b/test/automl/test_score.py index 2976daade..7e9abc864 100644 --- a/test/automl/test_score.py +++ b/test/automl/test_score.py @@ -142,7 +142,7 @@ class TestScore: def test_regression(self): automl_experiment = AutoML() - X_train, y_train = fetch_california_housing(return_X_y=True) + X_train, y_train = fetch_california_housing(return_X_y=True, data_home="test") n = int(len(y_train) * 9 // 10) for each_estimator in [ diff --git a/test/automl/test_training_log.py b/test/automl/test_training_log.py index cd8db8118..0d9628473 100644 --- a/test/automl/test_training_log.py +++ b/test/automl/test_training_log.py @@ -30,7 +30,7 @@ class TestTrainingLog(unittest.TestCase): "keep_search_state": True, "estimator_list": estimator_list, } - X_train, y_train = fetch_california_housing(return_X_y=True) + X_train, y_train = fetch_california_housing(return_X_y=True, data_home="test") automl.fit(X_train=X_train, y_train=y_train, **automl_settings) # Check if the training log file is populated. 
self.assertTrue(os.path.exists(filename)) diff --git a/test/automl/test_warmstart.py b/test/automl/test_warmstart.py index 677883741..ba390bf7d 100644 --- a/test/automl/test_warmstart.py +++ b/test/automl/test_warmstart.py @@ -108,7 +108,14 @@ class TestWarmStart(unittest.TestCase): def test_FLAML_sample_size_in_starting_points(self): from minio.error import ServerError - from openml.exceptions import OpenMLServerException + + try: + from openml.exceptions import OpenMLServerException + except ImportError: + + class OpenMLServerException(Exception): + pass + from requests.exceptions import ChunkedEncodingError, SSLError from flaml import AutoML diff --git a/test/cal_housing_py3.pkz b/test/cal_housing_py3.pkz new file mode 100644 index 000000000..201c864b2 Binary files /dev/null and b/test/cal_housing_py3.pkz differ diff --git a/test/check_dependency.py b/test/check_dependency.py new file mode 100644 index 000000000..526f25b18 --- /dev/null +++ b/test/check_dependency.py @@ -0,0 +1,60 @@ +import subprocess +from importlib.metadata import distributions + +installed_libs = sorted(f"{dist.metadata['Name']}=={dist.version}" for dist in distributions()) + +first_tier_dependencies = [ + "numpy", + "jupyter", + "lightgbm", + "xgboost", + "scipy", + "pandas", + "scikit-learn", + "thop", + "pytest", + "pytest-rerunfailures", + "coverage", + "pre-commit", + "torch", + "torchvision", + "catboost", + "rgf-python", + "optuna", + "openml", + "statsmodels", + "psutil", + "dataclasses", + "transformers[torch]", + "transformers", + "datasets", + "evaluate", + "nltk", + "rouge_score", + "hcrystalball", + "seqeval", + "pytorch-forecasting", + "mlflow-skinny", + "joblibspark", + "joblib", + "nbconvert", + "nbformat", + "ipykernel", + "pytorch-lightning", + "tensorboardX", + "requests", + "packaging", + "dill", + "ray", + "prophet", +] + + +for lib in installed_libs: + lib_name = lib.split("==")[0] + if lib_name in first_tier_dependencies: + print(lib) + +# print current commit hash +commit_hash = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip() +print(f"Current commit hash: {commit_hash}") diff --git a/test/conftest.py b/test/conftest.py index 47a74b289..4b4620775 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -2,11 +2,24 @@ from typing import Any, Dict, List, Union import numpy as np import pandas as pd -from catboost import CatBoostClassifier, CatBoostRegressor, Pool +import pytest from sklearn.metrics import f1_score, r2_score +try: + from catboost import CatBoostClassifier, CatBoostRegressor, Pool +except ImportError: # pragma: no cover + CatBoostClassifier = None + CatBoostRegressor = None + Pool = None -def evaluate_cv_folds_with_underlying_model(X_train_all, y_train_all, kf, model: Any, task: str) -> pd.DataFrame: + +def _is_catboost_model_type(model_type: type) -> bool: + if CatBoostClassifier is not None and CatBoostRegressor is not None: + return model_type is CatBoostClassifier or model_type is CatBoostRegressor + return getattr(model_type, "__module__", "").startswith("catboost") + + +def evaluate_cv_folds_with_underlying_model(X_train_all, y_train_all, kf, model: Any, task: str) -> List[float]: """Mimic the FLAML CV process to calculate the metrics across each fold. 
:param X_train_all: X training data @@ -17,7 +30,7 @@ def evaluate_cv_folds_with_underlying_model(X_train_all, y_train_all, kf, model: :return: An array containing the metrics """ rng = np.random.RandomState(2020) - all_fold_metrics: List[Dict[str, Union[int, float]]] = [] + all_fold_metrics: List[float] = [] for train_index, val_index in kf.split(X_train_all, y_train_all): X_train_split, y_train_split = X_train_all, y_train_all train_index = rng.permutation(train_index) @@ -25,9 +38,11 @@ def evaluate_cv_folds_with_underlying_model(X_train_all, y_train_all, kf, model: X_val = X_train_split.iloc[val_index] y_train, y_val = y_train_split[train_index], y_train_split[val_index] model_type = type(model) - if model_type is not CatBoostClassifier and model_type is not CatBoostRegressor: + if not _is_catboost_model_type(model_type): model.fit(X_train, y_train) else: + if Pool is None: + pytest.skip("catboost is not installed") use_best_model = True n = max(int(len(y_train) * 0.9), len(y_train) - 1000) if use_best_model else len(y_train) X_tr, y_tr = (X_train)[:n], y_train[:n] @@ -38,5 +53,5 @@ def evaluate_cv_folds_with_underlying_model(X_train_all, y_train_all, kf, model: reproduced_metric = 1 - f1_score(y_val, y_pred_classes) else: reproduced_metric = 1 - r2_score(y_val, y_pred_classes) - all_fold_metrics.append(reproduced_metric) + all_fold_metrics.append(float(reproduced_metric)) return all_fold_metrics diff --git a/test/default/test_defaults.py b/test/default/test_defaults.py index d8be7b61b..acf50e4ea 100644 --- a/test/default/test_defaults.py +++ b/test/default/test_defaults.py @@ -60,7 +60,7 @@ def test_housing(as_frame=True): "starting_points": "data", "max_iter": 0, } - X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=as_frame) + X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=as_frame, data_home="test") automl.fit(X_train, y_train, **automl_settings) @@ -115,7 +115,7 @@ def test_suggest_classification(): def test_suggest_regression(): location = "test/default" - X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True) + X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test") suggested = suggest_hyperparams("regression", X_train, y_train, "lgbm", location=location) print(suggested) suggested = preprocess_and_suggest_hyperparams("regression", X_train, y_train, "xgboost", location=location) @@ -137,7 +137,7 @@ def test_rf(): print(rf) location = "test/default" - X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True) + X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test") rf = RandomForestRegressor(default_location=location) rf.fit(X_train[:100], y_train[:100]) rf.predict(X_train) @@ -155,7 +155,7 @@ def test_extratrees(): print(classifier) location = "test/default" - X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True) + X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test") regressor = ExtraTreesRegressor(default_location=location) regressor.fit(X_train[:100], y_train[:100]) regressor.predict(X_train) @@ -175,7 +175,7 @@ def test_lgbm(): print(classifier.classes_) location = "test/default" - X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True) + X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test") regressor = LGBMRegressor(default_location=location) regressor.fit(X_train, y_train) 
regressor.predict(X_train) @@ -194,7 +194,7 @@ def test_xgboost(): print(classifier.classes_) location = "test/default" - X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True) + X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test") regressor = XGBRegressor(default_location=location) regressor.fit(X_train[:100], y_train[:100]) regressor.predict(X_train) diff --git a/test/nlp/test_default.py b/test/nlp/test_default.py index 239fce227..5f7622a1c 100644 --- a/test/nlp/test_default.py +++ b/test/nlp/test_default.py @@ -30,21 +30,33 @@ def test_build_portfolio(path="./test/nlp/default", strategy="greedy"): @pytest.mark.skipif(sys.platform == "win32", reason="do not run on windows") def test_starting_point_not_in_search_space(): - from flaml import AutoML + """Regression test for invalid starting points and custom_hp. + + This test must not require network access to Hugging Face. + """ """ test starting_points located outside of the search space, and custom_hp is not set """ + from flaml.automl.state import SearchState + from flaml.automl.task.factory import task_factory + this_estimator_name = "transformer" - X_train, y_train, X_val, y_val, _ = get_toy_data_seqclassification() + X_train, y_train, _, _, _ = get_toy_data_seqclassification() + task = task_factory("seq-classification", X_train, y_train) + estimator_class = task.estimator_class_from_str(this_estimator_name) + estimator_class.init() - automl = AutoML() - automl_settings = get_automl_settings(estimator_name=this_estimator_name) - - automl_settings["starting_points"] = {this_estimator_name: [{"learning_rate": 2e-3}]} - - automl.fit(X_train, y_train, **automl_settings) - assert automl._search_states[this_estimator_name].init_config[0]["learning_rate"] != 2e-3 + # SearchState is where invalid starting points are filtered out when max_iter > 1. + search_state = SearchState( + learner_class=estimator_class, + data=X_train, + task=task, + starting_point={"learning_rate": 2e-3}, + max_iter=3, + budget=10, + ) + assert search_state.init_config and search_state.init_config[0].get("learning_rate") != 2e-3 """ test starting_points located outside of the search space, and custom_hp is set @@ -52,39 +64,60 @@ def test_starting_point_not_in_search_space(): from flaml import tune - X_train, y_train, X_val, y_val, _ = get_toy_data_seqclassification() + X_train, y_train, _, _, _ = get_toy_data_seqclassification() this_estimator_name = "transformer_ms" - automl = AutoML() - automl_settings = get_automl_settings(estimator_name=this_estimator_name) + task = task_factory("seq-classification", X_train, y_train) + estimator_class = task.estimator_class_from_str(this_estimator_name) + estimator_class.init() - automl_settings["custom_hp"] = { - this_estimator_name: { - "model_path": { - "domain": "albert-base-v2", - }, - "learning_rate": { - "domain": tune.choice([1e-4, 1e-5]), - }, - "per_device_train_batch_size": { - "domain": 2, - }, - } + custom_hp = { + "model_path": { + "domain": "albert-base-v2", + }, + "learning_rate": { + "domain": tune.choice([1e-4, 1e-5]), + }, + "per_device_train_batch_size": { + "domain": 2, + }, } - automl_settings["starting_points"] = "data:test/nlp/default/" - automl.fit(X_train, y_train, **automl_settings) - assert len(automl._search_states[this_estimator_name].init_config[0]) == len( - automl._search_states[this_estimator_name]._search_space_domain - ) - len(automl_settings["custom_hp"][this_estimator_name]), ( + # Simulate a suggested starting point (e.g. 
from portfolio) which becomes invalid + # after custom_hp constrains the space. + invalid_starting_points = [ + { + "learning_rate": 1e-5, + "num_train_epochs": 1.0, + "per_device_train_batch_size": 8, + "seed": 43, + "global_max_steps": 100, + "model_path": "google/electra-base-discriminator", + } + ] + + search_state = SearchState( + learner_class=estimator_class, + data=X_train, + task=task, + starting_point=invalid_starting_points, + custom_hp=custom_hp, + max_iter=3, + budget=10, + ) + + assert search_state.init_config, "Expected a non-empty init_config list" + init_config0 = search_state.init_config[0] + assert init_config0 is not None + assert len(init_config0) == len(search_state._search_space_domain) - len(custom_hp), ( "The search space is updated with the custom_hp on {} hyperparameters of " "the specified estimator without an initial value. Thus a valid init config " "should only contain the cardinality of the search space minus {}".format( - len(automl_settings["custom_hp"][this_estimator_name]), - len(automl_settings["custom_hp"][this_estimator_name]), + len(custom_hp), + len(custom_hp), ) ) - assert automl._search_states[this_estimator_name].search_space["model_path"] == "albert-base-v2" + assert search_state.search_space["model_path"] == "albert-base-v2" if os.path.exists("test/data/output/"): try: @@ -106,7 +139,13 @@ def test_points_to_evaluate(): automl_settings["custom_hp"] = {"transformer_ms": {"model_path": {"domain": "google/electra-small-discriminator"}}} - automl.fit(X_train, y_train, **automl_settings) + try: + automl.fit(X_train, y_train, **automl_settings) + except OSError as e: + message = str(e) + if "Too Many Requests" in message or "rate limit" in message.lower(): + pytest.skip(f"Skipping HF model load/training: {message}") + raise if os.path.exists("test/data/output/"): try: @@ -141,7 +180,14 @@ def test_zero_shot_nomodel(): fit_kwargs = automl_settings.pop("fit_kwargs_by_estimator", {}).get(estimator_name) fit_kwargs.update(automl_settings) pop_args(fit_kwargs) - model.fit(X_train, y_train, **fit_kwargs) + + try: + model.fit(X_train, y_train, **fit_kwargs) + except OSError as e: + message = str(e) + if "Too Many Requests" in message or "rate limit" in message.lower(): + pytest.skip(f"Skipping HF model load/training: {message}") + raise if os.path.exists("test/data/output/"): try: diff --git a/test/object_store.py b/test/object_store.py index 6d32237b4..1b3f98502 100644 --- a/test/object_store.py +++ b/test/object_store.py @@ -7,7 +7,7 @@ from sklearn.model_selection import train_test_split from flaml import tune from flaml.automl.model import LGBMEstimator -data = fetch_california_housing(return_X_y=False, as_frame=True) +data = fetch_california_housing(return_X_y=False, as_frame=True, data_home="test") X, y = data.data, data.target X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) X_train_ref = ray.put(X_train) diff --git a/test/reg.py b/test/reg.py index 795ab1c5c..1b758e2ca 100644 --- a/test/reg.py +++ b/test/reg.py @@ -11,7 +11,7 @@ automl_settings = { "task": "regression", "log_file_name": "test/california.log", } -X_train, y_train = fetch_california_housing(return_X_y=True) +X_train, y_train = fetch_california_housing(return_X_y=True, data_home="test") # Train with labeled input data automl.fit(X_train=X_train, y_train=y_train, **automl_settings) print(automl.model) diff --git a/test/spark/test_exceptions.py b/test/spark/test_exceptions.py index 13c265d37..63a22e625 100644 --- a/test/spark/test_exceptions.py 
+++ b/test/spark/test_exceptions.py
@@ -22,7 +22,7 @@ def base_automl(n_concurrent_trials=1, use_ray=False, use_spark=False, verbose=0
     except (ServerError, Exception):
         from sklearn.datasets import fetch_california_housing
 
-        X_train, y_train = fetch_california_housing(return_X_y=True)
+        X_train, y_train = fetch_california_housing(return_X_y=True, data_home="test")
     automl = AutoML()
     settings = {
         "time_budget": 3,  # total running time in seconds
diff --git a/test/spark/test_performance.py b/test/spark/test_performance.py
index febd1a4b8..030fcae49 100644
--- a/test/spark/test_performance.py
+++ b/test/spark/test_performance.py
@@ -2,8 +2,23 @@ import os
 import sys
 
 import pytest
-from minio.error import ServerError
-from openml.exceptions import OpenMLServerException
+
+try:
+    from minio.error import ServerError
+except ImportError:
+
+    class ServerError(Exception):
+        pass
+
+
+try:
+    from openml.exceptions import OpenMLServerException
+except ImportError:
+
+    class OpenMLServerException(Exception):
+        pass
+
+
 from requests.exceptions import ChunkedEncodingError, SSLError
 
 from flaml.tune.spark.utils import check_spark
diff --git a/test/test_autovw.py b/test/test_autovw.py
index 06140f435..100021d56 100644
--- a/test/test_autovw.py
+++ b/test/test_autovw.py
@@ -5,17 +5,39 @@ import sys
 import unittest
 
 import numpy as np
-import openml
+
+try:
+    import openml
+except ImportError:
+    openml = None
 import pandas as pd
 import pytest
 import scipy.sparse
-from minio.error import ServerError
+
+try:
+    from minio.error import ServerError
+except ImportError:
+
+    class ServerError(Exception):
+        pass
+
+
 from requests.exceptions import SSLError
 from sklearn.metrics import mean_absolute_error, mean_squared_error
 
 from flaml import AutoVW
 from flaml.tune import loguniform, polynomial_expansion_set
 
+try:
+    from vowpalwabbit import pyvw
+except ImportError:
+    skip_vw_test = True
+else:
+    skip_vw_test = False
+
+if openml is None:
+    pytest.skip("openml is not installed", allow_module_level=True)
+
 VW_DS_DIR = "test/data/"
 NS_LIST = list(string.ascii_lowercase) + list(string.ascii_uppercase)
 logger = logging.getLogger(__name__)
@@ -351,14 +373,9 @@ def get_vw_tuning_problem(tuning_hp="NamesapceInteraction"):
     return vw_oml_problem_args, vw_online_aml_problem
 
 
-@pytest.mark.skipif(
-    "3.10" in sys.version or "3.11" in sys.version,
-    reason="do not run on py >= 3.10",
-)
+@pytest.mark.skipif(skip_vw_test, reason="vowpalwabbit not installed")
 class TestAutoVW(unittest.TestCase):
     def test_vw_oml_problem_and_vanilla_vw(self):
-        from vowpalwabbit import pyvw
-
         try:
             vw_oml_problem_args, vw_online_aml_problem = get_vw_tuning_problem()
         except (SSLError, ServerError, Exception) as e:
diff --git a/test/tune_example.py b/test/tune_example.py
index 9b4ba68fe..592f94762 100644
--- a/test/tune_example.py
+++ b/test/tune_example.py
@@ -6,12 +6,12 @@ from sklearn.model_selection import train_test_split
 from flaml import tune
 from flaml.automl.model import LGBMEstimator
 
-data = fetch_california_housing(return_X_y=False, as_frame=True)
+data = fetch_california_housing(return_X_y=False, as_frame=True, data_home="test")
 df, X, y = data.frame, data.data, data.target
 df_train, _, X_train, X_test, _, y_test = train_test_split(df, X, y, test_size=0.33, random_state=42)
 csv_file_name = "test/housing.csv"
 df_train.to_csv(csv_file_name, index=False)
-# X, y = fetch_california_housing(return_X_y=True, as_frame=True)
+# X, y = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
 # X_train, X_test, y_train, y_test = train_test_split(
 #     X, y, test_size=0.33, random_state=42
 # )