Update readme, bump version to 2.4.0, fix CI errors (#1466)

* Update gitignore

* Bump version to 2.4.0

* Update readme

* Pre-download california housing data

* Use pre-downloaded california housing data

* Pin lightning<=2.5.6

* Fix typo in find and replace

* Fix estimators has no attribute __sklearn_tags__

* Pin torch to 2.2.2 in tests

* Fix conflict

* Update pytorch-forecasting

* Update pytorch-forecasting

* Update pytorch-forecasting

* Use numpy<2 for testing

* Update scikit-learn

* Run Build and UT every other day

* Pin pip<24.1

* Pin pip<24.1 in pipeline

* Loosen pip, install pytorch_forecasting only in py311

* Add support for new versions of nlp dependencies

* Fix formats

* Remove redefinition

* Update mlflow versions

* Fix mlflow version syntax

* Update gitignore

* Clean up cache to free space

* Remove clean up action cache

* Fix blendsearch

* Update test workflow

* Update setup.py

* Fix catboost version

* Update workflow

* Prepare for python 3.14

* Support no catboost

* Fix tests

* Fix python_requires

* Update test workflow

* Fix vw tests

* Remove python 3.9

* Fix nlp tests

* Fix prophet

* Print pip freeze for better debugging

* Fix Optuna search does not support parameters of type Float with samplers of type Quantized

* Save dependencies for later inspection

* Fix coverage.xml not exists

* Fix github action permission

* Handle python 3.13

* Address openml is not installed

* Check dependencies before run tests

* Update dependencies

* Fix syntax error

* Use bash

* Update dependencies

* Fix git error

* Loose mlflow constraints

* Add rerun, use mlflow-skinny

* Fix git error

* Remove ray tests

* Update xgboost versions

* Fix automl pickle error

* Don't test python 3.10 on macos as it's stuck

* Rebase before push

* Reduce number of branches
This commit is contained in:
Li Jiang
2026-01-09 13:40:52 +08:00
committed by GitHub
parent 7f42bece89
commit 1285700d7a
31 changed files with 543 additions and 237 deletions

View File

@@ -22,8 +22,12 @@ on:
- 'setup.py'
merge_group:
types: [checks_requested]
schedule:
# Every other day at 02:00 UTC
- cron: '0 2 */2 * *'
permissions: {}
permissions:
contents: write
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
@@ -36,7 +40,10 @@ jobs:
fail-fast: false
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
python-version: ["3.9", "3.10", "3.11"]
python-version: ["3.10", "3.11"]
exclude:
- os: macos-latest
python-version: "3.10"
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
@@ -44,7 +51,7 @@ jobs:
with:
python-version: ${{ matrix.python-version }}
- name: On mac, install libomp to facilitate lgbm and xgboost install
if: matrix.os == 'macOS-latest'
if: matrix.os == 'macos-latest'
run: |
brew update
brew install libomp
@@ -70,33 +77,43 @@ jobs:
run: |
pip install pyspark==3.5.1
pip list | grep "pyspark"
- name: If linux and python<3.11, install ray 2
if: matrix.os == 'ubuntu-latest' && matrix.python-version != '3.11'
- name: On Ubuntu python 3.12, install pyspark 4.0.1
if: matrix.python-version == '3.12' && matrix.os == 'ubuntu-latest'
run: |
pip install "ray[tune]<2.5.0"
- name: If mac and python 3.10, install ray and xgboost 1
if: matrix.os == 'macOS-latest' && matrix.python-version == '3.10'
run: |
pip install -e .[ray]
# use macOS to test xgboost 1, but macOS also supports xgboost 2
pip install "xgboost<2"
- name: If linux, install prophet on python < 3.9
if: matrix.os == 'ubuntu-latest' && matrix.python-version == '3.8'
pip install pyspark==4.0.1
pip list | grep "pyspark"
# # TODO: support ray
# - name: If linux and python<3.11, install ray 2
# if: matrix.os == 'ubuntu-latest' && matrix.python-version < '3.11'
# run: |
# pip install "ray[tune]<2.5.0"
- name: Install prophet when on linux
if: matrix.os == 'ubuntu-latest'
run: |
pip install -e .[forecast]
- name: Install vw on python < 3.10
if: matrix.python-version == '3.8' || matrix.python-version == '3.9'
# TODO: support vw for python 3.10+
- name: If linux and python<3.10, install vw
if: matrix.os == 'ubuntu-latest' && matrix.python-version < '3.10'
run: |
pip install -e .[vw]
- name: Pip freeze
run: |
pip freeze
- name: Check dependencies
run: |
python test/check_dependency.py
- name: Clear pip cache
run: |
pip cache purge
- name: Test with pytest
if: matrix.python-version != '3.10'
run: |
pytest test/ --ignore=test/autogen
pytest test/ --ignore=test/autogen --reruns 2 --reruns-delay 10
- name: Coverage
if: matrix.python-version == '3.10'
run: |
pip install coverage
coverage run -a -m pytest test --ignore=test/autogen
coverage run -a -m pytest test --ignore=test/autogen --reruns 2 --reruns-delay 10
coverage xml
- name: Upload coverage to Codecov
if: matrix.python-version == '3.10'
@@ -104,28 +121,24 @@ jobs:
with:
file: ./coverage.xml
flags: unittests
- name: Save dependencies
shell: bash
run: |
git config --global user.name 'github-actions[bot]'
git config --global user.email 'github-actions[bot]@users.noreply.github.com'
git config advice.addIgnoredFile false
# docs:
BRANCH=unit-tests-installed-dependencies
git fetch origin
git checkout -B "$BRANCH"
if git show-ref --verify --quiet "refs/remotes/origin/$BRANCH"; then
git rebase "origin/$BRANCH"
fi
# runs-on: ubuntu-latest
# steps:
# - uses: actions/checkout@v3
# - name: Setup Python
# uses: actions/setup-python@v4
# with:
# python-version: '3.8'
# - name: Compile documentation
# run: |
# pip install -e .
# python -m pip install sphinx sphinx_rtd_theme
# cd docs
# make html
# - name: Deploy to GitHub pages
# if: ${{ github.ref == 'refs/heads/main' }}
# uses: JamesIves/github-pages-deploy-action@3.6.2
# with:
# GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# BRANCH: gh-pages
# FOLDER: docs/_build/html
# CLEAN: true
pip freeze > installed_all_dependencies_${{ matrix.python-version }}_${{ matrix.os }}.txt
python test/check_dependency.py > installed_first_tier_dependencies_${{ matrix.python-version }}_${{ matrix.os }}.txt
git add installed_*dependencies*.txt
mv coverage.xml ./coverage_${{ matrix.python-version }}_${{ matrix.os }}.xml || true
git add -f ./coverage_${{ matrix.python-version }}_${{ matrix.os }}.xml || true
git commit -m "Update installed dependencies for Python ${{ matrix.python-version }} on ${{ matrix.os }}" || exit 0
git push origin "$BRANCH"

6
.gitignore vendored
View File

@@ -172,7 +172,7 @@ test/default
test/housing.json
test/nlp/default/transformer_ms/seq-classification.json
flaml/fabric/fanova/_fanova.c
flaml/fabric/fanova/*fanova.c
# local config files
*.config.local
@@ -184,3 +184,7 @@ notebook/lightning_logs/
lightning_logs/
flaml/autogen/extensions/tmp/
test/autogen/my_tmp/
catboost_*
# Internal configs
.pypirc

View File

@@ -14,15 +14,9 @@
<br>
</p>
:fire: FLAML supports AutoML and Hyperparameter Tuning in [Microsoft Fabric Data Science](https://learn.microsoft.com/en-us/fabric/data-science/automated-machine-learning-fabric). In addition, we've introduced Python 3.11 support, along with a range of new estimators, and comprehensive integration with MLflow—thanks to contributions from the Microsoft Fabric product team.
:fire: FLAML supports AutoML and Hyperparameter Tuning in [Microsoft Fabric Data Science](https://learn.microsoft.com/en-us/fabric/data-science/automated-machine-learning-fabric). In addition, we've introduced Python 3.11 and 3.12 support, along with a range of new estimators, and comprehensive integration with MLflow—thanks to contributions from the Microsoft Fabric product team.
:fire: Heads-up: We have migrated [AutoGen](https://microsoft.github.io/autogen/) into a dedicated [github repository](https://github.com/microsoft/autogen). Alongside this move, we have also launched a dedicated [Discord](https://discord.gg/pAbnFJrkgZ) server and a [website](https://microsoft.github.io/autogen/) for comprehensive documentation.
:fire: The automated multi-agent chat framework in [AutoGen](https://microsoft.github.io/autogen/) is in preview from v2.0.0.
:fire: FLAML is highlighted in OpenAI's [cookbook](https://github.com/openai/openai-cookbook#related-resources-from-around-the-web).
:fire: [autogen](https://microsoft.github.io/autogen/) is released with support for ChatGPT and GPT-4, based on [Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference](https://arxiv.org/abs/2303.04673).
:fire: Heads-up: [AutoGen](https://microsoft.github.io/autogen/) has moved to a dedicated [GitHub repository](https://github.com/microsoft/autogen). FLAML no longer includes the `autogen` module—please use AutoGen directly.
## What is FLAML
@@ -30,7 +24,7 @@ FLAML is a lightweight Python library for efficient automation of machine
learning and AI operations. It automates workflow based on large language models, machine learning models, etc.
and optimizes their performance.
- FLAML enables building next-gen GPT-X applications based on multi-agent conversations with minimal effort. It simplifies the orchestration, automation and optimization of a complex GPT-X workflow. It maximizes the performance of GPT-X models and augments their weakness.
- FLAML enables economical automation and tuning for ML/AI workflows, including model selection and hyperparameter optimization under resource constraints.
- For common machine learning tasks like classification and regression, it quickly finds quality models for user-provided data with low computational resources. It is easy to customize or extend. Users can find their desired customizability from a smooth range.
- It supports fast and economical automatic tuning (e.g., inference hyperparameters for foundation models, configurations in MLOps/LMOps workflows, pipelines, mathematical/statistical models, algorithms, computing experiments, software configurations), capable of handling large search space with heterogeneous evaluation cost and complex constraints/guidance/early stopping.
@@ -46,10 +40,10 @@ FLAML requires **Python version >= 3.9**. It can be installed from pip:
pip install flaml
```
Minimal dependencies are installed without extra options. You can install extra options based on the feature you need. For example, use the following to install the dependencies needed by the [`autogen`](https://microsoft.github.io/autogen/) package.
Minimal dependencies are installed without extra options. You can install extra options based on the feature you need. For example, use the following to install the dependencies needed by the [`automl`](https://microsoft.github.io/FLAML/docs/Use-Cases/Task-Oriented-AutoML) module.
```bash
pip install "flaml[autogen]"
pip install "flaml[automl]"
```
Find more options in [Installation](https://microsoft.github.io/FLAML/docs/Installation).
@@ -57,39 +51,6 @@ Each of the [`notebook examples`](https://github.com/microsoft/FLAML/tree/main/n
## Quickstart
- (New) The [autogen](https://microsoft.github.io/autogen/) package enables the next-gen GPT-X applications with a generic multi-agent conversation framework.
It offers customizable and conversable agents which integrate LLMs, tools and human.
By automating chat among multiple capable agents, one can easily make them collectively perform tasks autonomously or with human feedback, including tasks that require using tools via code. For example,
```python
from flaml import autogen
assistant = autogen.AssistantAgent("assistant")
user_proxy = autogen.UserProxyAgent("user_proxy")
user_proxy.initiate_chat(
assistant,
message="Show me the YTD gain of 10 largest technology companies as of today.",
)
# This initiates an automated chat between the two agents to solve the task
```
Autogen also helps maximize the utility out of the expensive LLMs such as ChatGPT and GPT-4. It offers a drop-in replacement of `openai.Completion` or `openai.ChatCompletion` with powerful functionalites like tuning, caching, templating, filtering. For example, you can optimize generations by LLM with your own tuning data, success metrics and budgets.
```python
# perform tuning
config, analysis = autogen.Completion.tune(
data=tune_data,
metric="success",
mode="max",
eval_func=eval_func,
inference_budget=0.05,
optimization_budget=3,
num_samples=-1,
)
# perform inference for a test instance
response = autogen.Completion.create(context=test_instance, **config)
```
- With three lines of code, you can start using this economical and fast
AutoML engine as a [scikit-learn style estimator](https://microsoft.github.io/FLAML/docs/Use-Cases/Task-Oriented-AutoML).

View File

@@ -401,6 +401,24 @@ class AutoML(BaseEstimator):
self._estimator_type = "classifier" if settings["task"] in CLASSIFICATION else "regressor"
self.best_run_id = None
def __getstate__(self):
"""Customize pickling to avoid serializing runtime-only objects.
MLflow's sklearn flavor serializes estimators via (cloud)pickle. During
AutoML fitting we may attach an internal mlflow integration instance
which holds `concurrent.futures.Future` objects and executors containing
thread locks, which are not picklable.
"""
state = self.__dict__.copy()
state.pop("mlflow_integration", None)
return state
def __setstate__(self, state):
self.__dict__.update(state)
# Ensure attribute exists post-unpickle.
self.mlflow_integration = None
def get_params(self, deep: bool = False) -> dict:
return self._settings.copy()

View File

@@ -50,7 +50,10 @@ def load_openml_dataset(dataset_id, data_dir=None, random_state=0, dataset_forma
"""
import pickle
try:
import openml
except ImportError:
openml = None
from sklearn.model_selection import train_test_split
filename = "openml_ds" + str(dataset_id) + ".pkl"
@@ -61,15 +64,15 @@ def load_openml_dataset(dataset_id, data_dir=None, random_state=0, dataset_forma
dataset = pickle.load(f)
else:
print("download dataset from openml")
dataset = openml.datasets.get_dataset(dataset_id)
dataset = openml.datasets.get_dataset(dataset_id) if openml else None
if not os.path.exists(data_dir):
os.makedirs(data_dir)
with open(filepath, "wb") as f:
pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
print("Dataset name:", dataset.name)
print("Dataset name:", dataset.name) if dataset else None
try:
X, y, *__ = dataset.get_data(target=dataset.default_target_attribute, dataset_format=dataset_format)
except ValueError:
except (ValueError, AttributeError, TypeError):
from sklearn.datasets import fetch_openml
X, y = fetch_openml(data_id=dataset_id, return_X_y=True)

View File

@@ -127,9 +127,21 @@ def metric_loss_score(
import datasets
datasets_metric_name = huggingface_submetric_to_metric.get(metric_name, metric_name.split(":")[0])
metric = datasets.load_metric(datasets_metric_name, trust_remote_code=True)
metric_mode = huggingface_metric_to_mode[datasets_metric_name]
# datasets>=3 removed load_metric; prefer evaluate if available
try:
import evaluate
metric = evaluate.load(datasets_metric_name, trust_remote_code=True)
except Exception:
if hasattr(datasets, "load_metric"):
metric = datasets.load_metric(datasets_metric_name, trust_remote_code=True)
else:
from datasets import load_metric as _load_metric # older datasets
metric = _load_metric(datasets_metric_name, trust_remote_code=True)
if metric_name.startswith("seqeval"):
y_processed_true = [[labels[tr] for tr in each_list] for each_list in y_processed_true]
elif metric in ("pearsonr", "spearmanr"):

View File

@@ -111,7 +111,7 @@ def limit_resource(memory_limit, time_limit):
pass
class BaseEstimator:
class BaseEstimator(sklearn.base.ClassifierMixin, sklearn.base.BaseEstimator):
"""The abstract class for all learners.
Typical examples:

View File

@@ -77,6 +77,14 @@ class TrainingArgumentsForAuto(TrainingArguments):
logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."})
# Newer versions of HuggingFace Transformers may access `TrainingArguments.generation_config`
# (e.g., in generation-aware trainers/callbacks). Keep this attribute to remain compatible
# while defaulting to None for non-generation tasks.
generation_config: Optional[object] = field(
default=None,
metadata={"help": "Optional generation config (or path) used by generation-aware trainers."},
)
@staticmethod
def load_args_from_console():
from dataclasses import fields

View File

@@ -1,3 +1,4 @@
import inspect
import time
try:
@@ -106,12 +107,17 @@ class TemporalFusionTransformerEstimator(TimeSeriesEstimator):
def fit(self, X_train, y_train, budget=None, **kwargs):
import warnings
try:
import lightning.pytorch as pl
from lightning.pytorch.callbacks import EarlyStopping, LearningRateMonitor
from lightning.pytorch.loggers import TensorBoardLogger
except ImportError:
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
import torch
from pytorch_forecasting import TemporalFusionTransformer
from pytorch_forecasting.metrics import QuantileLoss
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
# a bit of monkey patching to fix the MacOS test
# all the log_prediction method appears to do is plot stuff, which ?breaks github tests
@@ -132,12 +138,26 @@ class TemporalFusionTransformerEstimator(TimeSeriesEstimator):
lr_logger = LearningRateMonitor() # log the learning rate
logger = TensorBoardLogger(kwargs.get("log_dir", "lightning_logs")) # logging results to a tensorboard
default_trainer_kwargs = dict(
gpus=self._kwargs.get("gpu_per_trial", [0]) if torch.cuda.is_available() else None,
max_epochs=max_epochs,
gradient_clip_val=gradient_clip_val,
callbacks=[lr_logger, early_stop_callback],
logger=logger,
)
# PyTorch Lightning >=2.0 replaced `gpus` with `accelerator`/`devices`.
# Also, passing `gpus=None` is not accepted on newer versions.
trainer_sig_params = inspect.signature(pl.Trainer.__init__).parameters
if torch.cuda.is_available() and "gpus" in trainer_sig_params:
gpus = self._kwargs.get("gpu_per_trial", None)
if gpus is not None:
default_trainer_kwargs["gpus"] = gpus
elif torch.cuda.is_available() and "devices" in trainer_sig_params:
devices = self._kwargs.get("gpu_per_trial", None)
if devices == -1:
devices = "auto"
if devices is not None:
default_trainer_kwargs["accelerator"] = "gpu"
default_trainer_kwargs["devices"] = devices
trainer = pl.Trainer(
**default_trainer_kwargs,
)
@@ -157,6 +177,13 @@ class TemporalFusionTransformerEstimator(TimeSeriesEstimator):
val_dataloaders=val_dataloader,
)
best_model_path = trainer.checkpoint_callback.best_model_path
# PyTorch 2.6 changed `torch.load` default `weights_only` from False -> True.
# Some Lightning checkpoints (including those produced here) can require full unpickling.
# This path is generated locally during training, so it's trusted.
load_sig_params = inspect.signature(TemporalFusionTransformer.load_from_checkpoint).parameters
if "weights_only" in load_sig_params:
best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path, weights_only=False)
else:
best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path)
train_time = time.time() - current_time
self._model = best_tft

View File

@@ -9,6 +9,7 @@ import numpy as np
try:
import pandas as pd
from pandas import DataFrame, Series, to_datetime
from pandas.api.types import is_datetime64_any_dtype
from scipy.sparse import issparse
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
@@ -392,6 +393,15 @@ class DataTransformerTS:
assert len(self.num_columns) == 0, "Trying to call fit() twice, something is wrong"
for column in X.columns:
# Never treat the time column as a feature for sklearn preprocessing
if column == self.time_col:
continue
# Robust datetime detection (covers datetime64[ms/us/ns], tz-aware, etc.)
if is_datetime64_any_dtype(X[column]):
self.datetime_columns.append(column)
continue
# sklearn/utils/validation.py needs int/float values
if X[column].dtype.name in ("object", "category", "string"):
if (

View File

@@ -244,6 +244,7 @@ class BlendSearch(Searcher):
evaluated_rewards=evaluated_rewards,
)
except (AssertionError, ValueError):
try:
self._gs = GlobalSearch(
space=gs_space,
metric=metric,
@@ -251,6 +252,24 @@ class BlendSearch(Searcher):
seed=gs_seed,
sampler=sampler,
)
except ValueError:
# Ray Tune's OptunaSearch converts Tune domains into Optuna
# distributions. Optuna disallows integer log distributions
# with step != 1 (e.g., qlograndint with q>1), which can
# raise here. Fall back to FLAML's OptunaSearch wrapper,
# which handles these spaces more permissively.
if getattr(GlobalSearch, "__module__", "").startswith("ray.tune"):
from .suggestion import OptunaSearch as _FallbackOptunaSearch
self._gs = _FallbackOptunaSearch(
space=gs_space,
metric=metric,
mode=mode,
seed=gs_seed,
sampler=sampler,
)
else:
raise
self._gs.space = space
else:
self._gs = None

View File

@@ -35,6 +35,73 @@ from ..sample import (
Quantized,
Uniform,
)
# If Ray is installed, flaml.tune may re-export Ray Tune sampling functions.
# In that case, the search space contains Ray Tune Domain/Sampler objects,
# which should be accepted by our Optuna search-space conversion.
try:
from ray import __version__ as _ray_version # type: ignore
if str(_ray_version).startswith("1."):
from ray.tune.sample import ( # type: ignore
Categorical as _RayCategorical,
)
from ray.tune.sample import (
Domain as _RayDomain,
)
from ray.tune.sample import (
Float as _RayFloat,
)
from ray.tune.sample import (
Integer as _RayInteger,
)
from ray.tune.sample import (
LogUniform as _RayLogUniform,
)
from ray.tune.sample import (
Quantized as _RayQuantized,
)
from ray.tune.sample import (
Uniform as _RayUniform,
)
else:
from ray.tune.search.sample import ( # type: ignore
Categorical as _RayCategorical,
)
from ray.tune.search.sample import (
Domain as _RayDomain,
)
from ray.tune.search.sample import (
Float as _RayFloat,
)
from ray.tune.search.sample import (
Integer as _RayInteger,
)
from ray.tune.search.sample import (
LogUniform as _RayLogUniform,
)
from ray.tune.search.sample import (
Quantized as _RayQuantized,
)
from ray.tune.search.sample import (
Uniform as _RayUniform,
)
_FLOAT_TYPES = (Float, _RayFloat)
_INTEGER_TYPES = (Integer, _RayInteger)
_CATEGORICAL_TYPES = (Categorical, _RayCategorical)
_DOMAIN_TYPES = (Domain, _RayDomain)
_QUANTIZED_TYPES = (Quantized, _RayQuantized)
_UNIFORM_TYPES = (Uniform, _RayUniform)
_LOGUNIFORM_TYPES = (LogUniform, _RayLogUniform)
except Exception: # pragma: no cover
_FLOAT_TYPES = (Float,)
_INTEGER_TYPES = (Integer,)
_CATEGORICAL_TYPES = (Categorical,)
_DOMAIN_TYPES = (Domain,)
_QUANTIZED_TYPES = (Quantized,)
_UNIFORM_TYPES = (Uniform,)
_LOGUNIFORM_TYPES = (LogUniform,)
from ..trial import flatten_dict, unflatten_dict
from .variant_generator import parse_spec_vars
@@ -850,19 +917,22 @@ class OptunaSearch(Searcher):
def resolve_value(domain: Domain) -> ot.distributions.BaseDistribution:
quantize = None
sampler = domain.get_sampler()
if isinstance(sampler, Quantized):
# Ray Tune Domains and FLAML Domains both provide get_sampler(), but
# fall back to the .sampler attribute for robustness.
sampler = domain.get_sampler() if hasattr(domain, "get_sampler") else getattr(domain, "sampler", None)
if isinstance(sampler, _QUANTIZED_TYPES) or type(sampler).__name__ == "Quantized":
quantize = sampler.q
sampler = sampler.sampler
if isinstance(sampler, LogUniform):
sampler = getattr(sampler, "sampler", None) or sampler.get_sampler()
if isinstance(sampler, _LOGUNIFORM_TYPES) or type(sampler).__name__ == "LogUniform":
logger.warning(
"Optuna does not handle quantization in loguniform "
"sampling. The parameter will be passed but it will "
"probably be ignored."
)
if isinstance(domain, Float):
if isinstance(sampler, LogUniform):
if isinstance(domain, _FLOAT_TYPES) or type(domain).__name__ == "Float":
if isinstance(sampler, _LOGUNIFORM_TYPES) or type(sampler).__name__ == "LogUniform":
if quantize:
logger.warning(
"Optuna does not support both quantization and "
@@ -870,17 +940,17 @@ class OptunaSearch(Searcher):
)
return ot.distributions.LogUniformDistribution(domain.lower, domain.upper)
elif isinstance(sampler, Uniform):
elif isinstance(sampler, _UNIFORM_TYPES) or type(sampler).__name__ == "Uniform":
if quantize:
return ot.distributions.DiscreteUniformDistribution(domain.lower, domain.upper, quantize)
return ot.distributions.UniformDistribution(domain.lower, domain.upper)
elif isinstance(domain, Integer):
if isinstance(sampler, LogUniform):
elif isinstance(domain, _INTEGER_TYPES) or type(domain).__name__ == "Integer":
if isinstance(sampler, _LOGUNIFORM_TYPES) or type(sampler).__name__ == "LogUniform":
# ``step`` argument Deprecated in v2.0.0. ``step`` argument should be 1 in Log Distribution
# The removal of this feature is currently scheduled for v4.0.0,
return ot.distributions.IntLogUniformDistribution(domain.lower, domain.upper - 1, step=1)
elif isinstance(sampler, Uniform):
elif isinstance(sampler, _UNIFORM_TYPES) or type(sampler).__name__ == "Uniform":
# Upper bound should be inclusive for quantization and
# exclusive otherwise
return ot.distributions.IntUniformDistribution(
@@ -888,13 +958,13 @@ class OptunaSearch(Searcher):
domain.upper - int(bool(not quantize)),
step=quantize or 1,
)
elif isinstance(domain, Categorical):
if isinstance(sampler, Uniform):
elif isinstance(domain, _CATEGORICAL_TYPES) or type(domain).__name__ == "Categorical":
if isinstance(sampler, _UNIFORM_TYPES) or type(sampler).__name__ == "Uniform":
return ot.distributions.CategoricalDistribution(domain.categories)
raise ValueError(
"Optuna search does not support parameters of type "
"`{}` with samplers of type `{}`".format(type(domain).__name__, type(domain.sampler).__name__)
"`{}` with samplers of type `{}`".format(type(domain).__name__, type(sampler).__name__)
)
# Parameter name is e.g. "a/b/c" for nested dicts

View File

@@ -1 +1 @@
__version__ = "2.3.7"
__version__ = "2.4.0"

View File

@@ -51,60 +51,59 @@ setuptools.setup(
"joblib<=1.3.2",
],
"test": [
"jupyter",
"numpy>=1.17,<2.0.0; python_version<'3.13'",
"numpy>2.0.0; python_version>='3.13'",
"jupyter; python_version<'3.13'",
"lightgbm>=2.3.1",
"xgboost>=0.90,<2.0.0",
"xgboost>=0.90,<2.0.0; python_version<'3.11'",
"xgboost>=2.0.0; python_version>='3.11'",
"scipy>=1.4.1",
"pandas>=1.1.4,<2.0.0; python_version<'3.10'",
"pandas>=1.1.4; python_version>='3.10'",
"scikit-learn>=1.0.0",
"scikit-learn>=1.2.0",
"thop",
"pytest>=6.1.1",
"pytest-rerunfailures>=13.0",
"coverage>=5.3",
"pre-commit",
"torch",
"torchvision",
"catboost>=0.26,<1.2; python_version<'3.11'",
"catboost>=0.26; python_version>='3.11'",
"catboost>=0.26; python_version<'3.13'",
"rgf-python",
"optuna>=2.8.0,<=3.6.1",
"openml",
"openml; python_version<'3.13'",
"statsmodels>=0.12.2",
"psutil==5.8.0",
"psutil",
"dataclasses",
"transformers[torch]==4.26",
"datasets<=3.5.0",
"nltk<=3.8.1", # 3.8.2 doesn't work with mlflow
"transformers[torch]",
"datasets",
"evaluate",
"nltk!=3.8.2", # 3.8.2 doesn't work with mlflow
"rouge_score",
"hcrystalball==0.1.10",
"hcrystalball",
"seqeval",
"pytorch-forecasting>=0.9.0,<=0.10.1; python_version<'3.11'",
# "pytorch-forecasting==0.10.1; python_version=='3.11'",
"mlflow==2.15.1",
"pytorch-forecasting; python_version<'3.13'",
"mlflow-skinny<=2.22.1", # Refer to https://mvnrepository.com/artifact/org.mlflow/mlflow-spark
"joblibspark>=0.5.0",
"joblib<=1.3.2",
"nbconvert",
"nbformat",
"ipykernel",
"pytorch-lightning<1.9.1", # test_forecast_panel
"tensorboardX==2.6", # test_forecast_panel
"requests<2.29.0", # https://github.com/docker/docker-py/issues/3113
"pytorch-lightning", # test_forecast_panel
"tensorboardX", # test_forecast_panel
"requests", # https://github.com/docker/docker-py/issues/3113
"packaging",
"pydantic==1.10.9",
"sympy",
"wolframalpha",
"dill", # a drop in replacement of pickle
],
"catboost": [
"catboost>=0.26,<1.2; python_version<'3.11'",
"catboost>=0.26,<=1.2.5; python_version>='3.11'",
"catboost>=0.26",
],
"blendsearch": [
"optuna>=2.8.0,<=3.6.1",
"packaging",
],
"ray": [
"ray[tune]~=1.13",
"ray[tune]>=1.13,<2.5.0",
],
"azureml": [
"azureml-mlflow",
@@ -131,33 +130,21 @@ setuptools.setup(
"seqeval",
],
"ts_forecast": [
"holidays<0.14", # to prevent installation error for prophet
"prophet>=1.0.1",
"holidays",
"prophet>=1.1.5",
"statsmodels>=0.12.2",
"hcrystalball==0.1.10",
"hcrystalball>=0.1.10",
],
"forecast": [
"holidays<0.14", # to prevent installation error for prophet
"prophet>=1.0.1",
"holidays",
"prophet>=1.1.5",
"statsmodels>=0.12.2",
"hcrystalball==0.1.10",
"pytorch-forecasting>=0.9.0; python_version<'3.11'",
# "pytorch-forecasting==0.10.1; python_version=='3.11'",
"pytorch-lightning==1.9.0",
"tensorboardX==2.6",
"hcrystalball>=0.1.10",
"pytorch-forecasting>=0.10.4; python_version<'3.13'",
"pytorch-lightning>=1.9.0",
"tensorboardX>=2.6",
],
"benchmark": ["catboost>=0.26", "psutil==5.8.0", "xgboost==1.3.3", "pandas==1.1.4"],
"openai": ["openai==0.27.8", "diskcache"],
"autogen": ["openai==0.27.8", "diskcache", "termcolor"],
"mathchat": ["openai==0.27.8", "diskcache", "termcolor", "sympy", "pydantic==1.10.9", "wolframalpha"],
"retrievechat": [
"openai==0.27.8",
"diskcache",
"termcolor",
"chromadb",
"tiktoken",
"sentence_transformers",
],
"synapse": [
"joblibspark>=0.5.0",
"optuna>=2.8.0,<=3.6.1",
@@ -170,9 +157,9 @@ setuptools.setup(
"Operating System :: OS Independent",
# Specify the Python versions you support here.
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
],
python_requires=">=3.9",
python_requires=">=3.10",
)

View File

@@ -1,8 +1,23 @@
import sys
import pytest
try:
from minio.error import ServerError
except ImportError:
class ServerError(Exception):
pass
try:
from openml.exceptions import OpenMLServerException
except ImportError:
class OpenMLServerException(Exception):
pass
from requests.exceptions import ChunkedEncodingError, SSLError

View File

@@ -38,7 +38,7 @@ class TestLogging(unittest.TestCase):
"keep_search_state": True,
"learner_selector": "roundrobin",
}
X_train, y_train = fetch_california_housing(return_X_y=True)
X_train, y_train = fetch_california_housing(return_X_y=True, data_home="test")
n = len(y_train) >> 1
print(automl.model, automl.classes_, automl.predict(X_train))
automl.fit(

View File

@@ -47,7 +47,7 @@ class TestRegression(unittest.TestCase):
"n_jobs": 1,
"model_history": True,
}
X_train, y_train = fetch_california_housing(return_X_y=True)
X_train, y_train = fetch_california_housing(return_X_y=True, data_home="test")
n = int(len(y_train) * 9 // 10)
automl.fit(X_train=X_train[:n], y_train=y_train[:n], X_val=X_train[n:], y_val=y_train[n:], **automl_settings)
assert automl._state.eval_method == "holdout"
@@ -141,7 +141,7 @@ class TestRegression(unittest.TestCase):
"n_concurrent_trials": 10,
"hpo_method": hpo_method,
}
X_train, y_train = fetch_california_housing(return_X_y=True)
X_train, y_train = fetch_california_housing(return_X_y=True, data_home="test")
try:
automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
print(automl_experiment.predict(X_train))
@@ -268,7 +268,7 @@ def test_reproducibility_of_regression_models(estimator: str):
"skip_transform": True,
"retrain_full": True,
}
X, y = fetch_california_housing(return_X_y=True, as_frame=True)
X, y = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
automl.fit(X_train=X, y_train=y, **automl_settings)
best_model = automl.model
assert best_model is not None
@@ -314,7 +314,7 @@ def test_reproducibility_of_catboost_regression_model():
"skip_transform": True,
"retrain_full": True,
}
X, y = fetch_california_housing(return_X_y=True, as_frame=True)
X, y = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
automl.fit(X_train=X, y_train=y, **automl_settings)
best_model = automl.model
assert best_model is not None
@@ -360,7 +360,7 @@ def test_reproducibility_of_lgbm_regression_model():
"skip_transform": True,
"retrain_full": True,
}
X, y = fetch_california_housing(return_X_y=True, as_frame=True)
X, y = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
automl.fit(X_train=X, y_train=y, **automl_settings)
best_model = automl.model
assert best_model is not None
@@ -424,7 +424,7 @@ def test_reproducibility_of_underlying_regression_models(estimator: str):
"skip_transform": True,
"retrain_full": False,
}
X, y = fetch_california_housing(return_X_y=True, as_frame=True)
X, y = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
automl.fit(X_train=X, y_train=y, **automl_settings)
best_model = automl.model
assert best_model is not None

View File

@@ -142,7 +142,7 @@ class TestScore:
def test_regression(self):
automl_experiment = AutoML()
X_train, y_train = fetch_california_housing(return_X_y=True)
X_train, y_train = fetch_california_housing(return_X_y=True, data_home="test")
n = int(len(y_train) * 9 // 10)
for each_estimator in [

View File

@@ -30,7 +30,7 @@ class TestTrainingLog(unittest.TestCase):
"keep_search_state": True,
"estimator_list": estimator_list,
}
X_train, y_train = fetch_california_housing(return_X_y=True)
X_train, y_train = fetch_california_housing(return_X_y=True, data_home="test")
automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
# Check if the training log file is populated.
self.assertTrue(os.path.exists(filename))

View File

@@ -108,7 +108,14 @@ class TestWarmStart(unittest.TestCase):
def test_FLAML_sample_size_in_starting_points(self):
from minio.error import ServerError
try:
from openml.exceptions import OpenMLServerException
except ImportError:
class OpenMLServerException(Exception):
pass
from requests.exceptions import ChunkedEncodingError, SSLError
from flaml import AutoML

BIN
test/cal_housing_py3.pkz Normal file

Binary file not shown.

60
test/check_dependency.py Normal file
View File

@@ -0,0 +1,60 @@
"""CI helper: log the installed versions of FLAML's direct dependencies.

Prints one ``name==version`` line per installed first-tier dependency, then
the current git commit hash, so a failing build can be diagnosed from the
job log alone.
"""
import subprocess
from importlib.metadata import distributions

# Direct ("first tier") dependencies whose installed versions we want logged.
# A frozenset gives O(1) membership tests instead of scanning a list per
# installed distribution.
# NOTE(review): "transformers[torch]" can never match — extras are not part
# of a distribution's name; the bare "transformers" entry already covers it.
FIRST_TIER_DEPENDENCIES = frozenset(
    [
        "numpy",
        "jupyter",
        "lightgbm",
        "xgboost",
        "scipy",
        "pandas",
        "scikit-learn",
        "thop",
        "pytest",
        "pytest-rerunfailures",
        "coverage",
        "pre-commit",
        "torch",
        "torchvision",
        "catboost",
        "rgf-python",
        "optuna",
        "openml",
        "statsmodels",
        "psutil",
        "dataclasses",
        "transformers[torch]",
        "transformers",
        "datasets",
        "evaluate",
        "nltk",
        "rouge_score",
        "hcrystalball",
        "seqeval",
        "pytorch-forecasting",
        "mlflow-skinny",
        "joblibspark",
        "joblib",
        "nbconvert",
        "nbformat",
        "ipykernel",
        "pytorch-lightning",
        "tensorboardX",
        "requests",
        "packaging",
        "dill",
        "ray",
        "prophet",
    ]
)


def select_first_tier(installed_libs, first_tier=FIRST_TIER_DEPENDENCIES):
    """Return the ``name==version`` entries whose name is a first-tier dependency.

    :param installed_libs: iterable of ``"name==version"`` strings
    :param first_tier: collection of dependency names to keep
    :return: matching entries, in input order
    """
    # split with maxsplit=1 so a (pathological) "==" in the version part
    # cannot affect the extracted name.
    return [lib for lib in installed_libs if lib.split("==", 1)[0] in first_tier]


def main():
    """Print first-tier dependency versions and the current commit hash."""
    installed_libs = sorted(f"{dist.metadata['Name']}=={dist.version}" for dist in distributions())
    for lib in select_first_tier(installed_libs):
        print(lib)
    # Print the current commit hash so the log records the exact tested revision.
    commit_hash = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
    print(f"Current commit hash: {commit_hash}")


if __name__ == "__main__":
    # Guard keeps importing this module side-effect free; invoking it as a
    # script (as CI does) behaves exactly as before.
    main()

View File

@@ -2,11 +2,24 @@ from typing import Any, Dict, List, Union
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
import pytest
from sklearn.metrics import f1_score, r2_score
try:
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
except ImportError: # pragma: no cover
CatBoostClassifier = None
CatBoostRegressor = None
Pool = None
def evaluate_cv_folds_with_underlying_model(X_train_all, y_train_all, kf, model: Any, task: str) -> pd.DataFrame:
def _is_catboost_model_type(model_type: type) -> bool:
    """Return True when *model_type* is a CatBoost estimator class.

    Works both with and without catboost installed: when the import failed,
    fall back to inspecting the class's module name.
    """
    if CatBoostClassifier is None or CatBoostRegressor is None:
        # catboost is not importable here — recognize catboost models by the
        # module their class was defined in instead of by identity.
        return getattr(model_type, "__module__", "").startswith("catboost")
    return model_type is CatBoostClassifier or model_type is CatBoostRegressor
def evaluate_cv_folds_with_underlying_model(X_train_all, y_train_all, kf, model: Any, task: str) -> List[float]:
"""Mimic the FLAML CV process to calculate the metrics across each fold.
:param X_train_all: X training data
@@ -17,7 +30,7 @@ def evaluate_cv_folds_with_underlying_model(X_train_all, y_train_all, kf, model:
:return: An array containing the metrics
"""
rng = np.random.RandomState(2020)
all_fold_metrics: List[Dict[str, Union[int, float]]] = []
all_fold_metrics: List[float] = []
for train_index, val_index in kf.split(X_train_all, y_train_all):
X_train_split, y_train_split = X_train_all, y_train_all
train_index = rng.permutation(train_index)
@@ -25,9 +38,11 @@ def evaluate_cv_folds_with_underlying_model(X_train_all, y_train_all, kf, model:
X_val = X_train_split.iloc[val_index]
y_train, y_val = y_train_split[train_index], y_train_split[val_index]
model_type = type(model)
if model_type is not CatBoostClassifier and model_type is not CatBoostRegressor:
if not _is_catboost_model_type(model_type):
model.fit(X_train, y_train)
else:
if Pool is None:
pytest.skip("catboost is not installed")
use_best_model = True
n = max(int(len(y_train) * 0.9), len(y_train) - 1000) if use_best_model else len(y_train)
X_tr, y_tr = (X_train)[:n], y_train[:n]
@@ -38,5 +53,5 @@ def evaluate_cv_folds_with_underlying_model(X_train_all, y_train_all, kf, model:
reproduced_metric = 1 - f1_score(y_val, y_pred_classes)
else:
reproduced_metric = 1 - r2_score(y_val, y_pred_classes)
all_fold_metrics.append(reproduced_metric)
all_fold_metrics.append(float(reproduced_metric))
return all_fold_metrics

View File

@@ -60,7 +60,7 @@ def test_housing(as_frame=True):
"starting_points": "data",
"max_iter": 0,
}
X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=as_frame)
X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=as_frame, data_home="test")
automl.fit(X_train, y_train, **automl_settings)
@@ -115,7 +115,7 @@ def test_suggest_classification():
def test_suggest_regression():
location = "test/default"
X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True)
X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
suggested = suggest_hyperparams("regression", X_train, y_train, "lgbm", location=location)
print(suggested)
suggested = preprocess_and_suggest_hyperparams("regression", X_train, y_train, "xgboost", location=location)
@@ -137,7 +137,7 @@ def test_rf():
print(rf)
location = "test/default"
X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True)
X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
rf = RandomForestRegressor(default_location=location)
rf.fit(X_train[:100], y_train[:100])
rf.predict(X_train)
@@ -155,7 +155,7 @@ def test_extratrees():
print(classifier)
location = "test/default"
X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True)
X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
regressor = ExtraTreesRegressor(default_location=location)
regressor.fit(X_train[:100], y_train[:100])
regressor.predict(X_train)
@@ -175,7 +175,7 @@ def test_lgbm():
print(classifier.classes_)
location = "test/default"
X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True)
X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
regressor = LGBMRegressor(default_location=location)
regressor.fit(X_train, y_train)
regressor.predict(X_train)
@@ -194,7 +194,7 @@ def test_xgboost():
print(classifier.classes_)
location = "test/default"
X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True)
X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
regressor = XGBRegressor(default_location=location)
regressor.fit(X_train[:100], y_train[:100])
regressor.predict(X_train)

View File

@@ -30,21 +30,33 @@ def test_build_portfolio(path="./test/nlp/default", strategy="greedy"):
@pytest.mark.skipif(sys.platform == "win32", reason="do not run on windows")
def test_starting_point_not_in_search_space():
from flaml import AutoML
"""Regression test for invalid starting points and custom_hp.
This test must not require network access to Hugging Face.
"""
"""
test starting_points located outside of the search space, and custom_hp is not set
"""
from flaml.automl.state import SearchState
from flaml.automl.task.factory import task_factory
this_estimator_name = "transformer"
X_train, y_train, X_val, y_val, _ = get_toy_data_seqclassification()
X_train, y_train, _, _, _ = get_toy_data_seqclassification()
task = task_factory("seq-classification", X_train, y_train)
estimator_class = task.estimator_class_from_str(this_estimator_name)
estimator_class.init()
automl = AutoML()
automl_settings = get_automl_settings(estimator_name=this_estimator_name)
automl_settings["starting_points"] = {this_estimator_name: [{"learning_rate": 2e-3}]}
automl.fit(X_train, y_train, **automl_settings)
assert automl._search_states[this_estimator_name].init_config[0]["learning_rate"] != 2e-3
# SearchState is where invalid starting points are filtered out when max_iter > 1.
search_state = SearchState(
learner_class=estimator_class,
data=X_train,
task=task,
starting_point={"learning_rate": 2e-3},
max_iter=3,
budget=10,
)
assert search_state.init_config and search_state.init_config[0].get("learning_rate") != 2e-3
"""
test starting_points located outside of the search space, and custom_hp is set
@@ -52,14 +64,14 @@ def test_starting_point_not_in_search_space():
from flaml import tune
X_train, y_train, X_val, y_val, _ = get_toy_data_seqclassification()
X_train, y_train, _, _, _ = get_toy_data_seqclassification()
this_estimator_name = "transformer_ms"
automl = AutoML()
automl_settings = get_automl_settings(estimator_name=this_estimator_name)
task = task_factory("seq-classification", X_train, y_train)
estimator_class = task.estimator_class_from_str(this_estimator_name)
estimator_class.init()
automl_settings["custom_hp"] = {
this_estimator_name: {
custom_hp = {
"model_path": {
"domain": "albert-base-v2",
},
@@ -70,21 +82,42 @@ def test_starting_point_not_in_search_space():
"domain": 2,
},
}
}
automl_settings["starting_points"] = "data:test/nlp/default/"
automl.fit(X_train, y_train, **automl_settings)
assert len(automl._search_states[this_estimator_name].init_config[0]) == len(
automl._search_states[this_estimator_name]._search_space_domain
) - len(automl_settings["custom_hp"][this_estimator_name]), (
# Simulate a suggested starting point (e.g. from portfolio) which becomes invalid
# after custom_hp constrains the space.
invalid_starting_points = [
{
"learning_rate": 1e-5,
"num_train_epochs": 1.0,
"per_device_train_batch_size": 8,
"seed": 43,
"global_max_steps": 100,
"model_path": "google/electra-base-discriminator",
}
]
search_state = SearchState(
learner_class=estimator_class,
data=X_train,
task=task,
starting_point=invalid_starting_points,
custom_hp=custom_hp,
max_iter=3,
budget=10,
)
assert search_state.init_config, "Expected a non-empty init_config list"
init_config0 = search_state.init_config[0]
assert init_config0 is not None
assert len(init_config0) == len(search_state._search_space_domain) - len(custom_hp), (
"The search space is updated with the custom_hp on {} hyperparameters of "
"the specified estimator without an initial value. Thus a valid init config "
"should only contain the cardinality of the search space minus {}".format(
len(automl_settings["custom_hp"][this_estimator_name]),
len(automl_settings["custom_hp"][this_estimator_name]),
len(custom_hp),
len(custom_hp),
)
)
assert automl._search_states[this_estimator_name].search_space["model_path"] == "albert-base-v2"
assert search_state.search_space["model_path"] == "albert-base-v2"
if os.path.exists("test/data/output/"):
try:
@@ -106,7 +139,13 @@ def test_points_to_evaluate():
automl_settings["custom_hp"] = {"transformer_ms": {"model_path": {"domain": "google/electra-small-discriminator"}}}
try:
automl.fit(X_train, y_train, **automl_settings)
except OSError as e:
message = str(e)
if "Too Many Requests" in message or "rate limit" in message.lower():
pytest.skip(f"Skipping HF model load/training: {message}")
raise
if os.path.exists("test/data/output/"):
try:
@@ -141,7 +180,14 @@ def test_zero_shot_nomodel():
fit_kwargs = automl_settings.pop("fit_kwargs_by_estimator", {}).get(estimator_name)
fit_kwargs.update(automl_settings)
pop_args(fit_kwargs)
try:
model.fit(X_train, y_train, **fit_kwargs)
except OSError as e:
message = str(e)
if "Too Many Requests" in message or "rate limit" in message.lower():
pytest.skip(f"Skipping HF model load/training: {message}")
raise
if os.path.exists("test/data/output/"):
try:

View File

@@ -7,7 +7,7 @@ from sklearn.model_selection import train_test_split
from flaml import tune
from flaml.automl.model import LGBMEstimator
data = fetch_california_housing(return_X_y=False, as_frame=True)
data = fetch_california_housing(return_X_y=False, as_frame=True, data_home="test")
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
X_train_ref = ray.put(X_train)

View File

@@ -11,7 +11,7 @@ automl_settings = {
"task": "regression",
"log_file_name": "test/california.log",
}
X_train, y_train = fetch_california_housing(return_X_y=True)
X_train, y_train = fetch_california_housing(return_X_y=True, data_home="test")
# Train with labeled input data
automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
print(automl.model)

View File

@@ -22,7 +22,7 @@ def base_automl(n_concurrent_trials=1, use_ray=False, use_spark=False, verbose=0
except (ServerError, Exception):
from sklearn.datasets import fetch_california_housing
X_train, y_train = fetch_california_housing(return_X_y=True)
X_train, y_train = fetch_california_housing(return_X_y=True, data_home="test")
automl = AutoML()
settings = {
"time_budget": 3, # total running time in seconds

View File

@@ -2,8 +2,23 @@ import os
import sys
import pytest
try:
from minio.error import ServerError
except ImportError:
class ServerError(Exception):
pass
try:
from openml.exceptions import OpenMLServerException
except ImportError:
class OpenMLServerException(Exception):
pass
from requests.exceptions import ChunkedEncodingError, SSLError
from flaml.tune.spark.utils import check_spark

View File

@@ -5,17 +5,38 @@ import sys
import unittest
import numpy as np
try:
import openml
except ImportError:
openml = None
import pandas as pd
import pytest
import scipy.sparse
try:
from minio.error import ServerError
except ImportError:
class ServerError(Exception):
pass
from requests.exceptions import SSLError
from sklearn.metrics import mean_absolute_error, mean_squared_error
from flaml import AutoVW
from flaml.tune import loguniform, polynomial_expansion_set
try:
from vowpalwabbit import pyvw
except ImportError:
skip_vw_test = True
else:
skip_vw_test = False
pytest.skip("skipping if no openml", allow_module_level=True) if openml is None else None
VW_DS_DIR = "test/data/"
NS_LIST = list(string.ascii_lowercase) + list(string.ascii_uppercase)
logger = logging.getLogger(__name__)
@@ -351,14 +372,9 @@ def get_vw_tuning_problem(tuning_hp="NamesapceInteraction"):
return vw_oml_problem_args, vw_online_aml_problem
@pytest.mark.skipif(
"3.10" in sys.version or "3.11" in sys.version,
reason="do not run on py >= 3.10",
)
@pytest.mark.skipif(skip_vw_test, reason="vowpalwabbit not installed")
class TestAutoVW(unittest.TestCase):
def test_vw_oml_problem_and_vanilla_vw(self):
from vowpalwabbit import pyvw
try:
vw_oml_problem_args, vw_online_aml_problem = get_vw_tuning_problem()
except (SSLError, ServerError, Exception) as e:

View File

@@ -6,12 +6,12 @@ from sklearn.model_selection import train_test_split
from flaml import tune
from flaml.automl.model import LGBMEstimator
data = fetch_california_housing(return_X_y=False, as_frame=True)
data = fetch_california_housing(return_X_y=False, as_frame=True, data_home="test")
df, X, y = data.frame, data.data, data.target
df_train, _, X_train, X_test, _, y_test = train_test_split(df, X, y, test_size=0.33, random_state=42)
csv_file_name = "test/housing.csv"
df_train.to_csv(csv_file_name, index=False)
# X, y = fetch_california_housing(return_X_y=True, as_frame=True)
# X, y = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
# X_train, X_test, y_train, y_test = train_test_split(
# X, y, test_size=0.33, random_state=42
# )