Mirror of https://github.com/microsoft/FLAML.git (synced 2026-02-15 21:29:16 +08:00)

Compare commits

48 Commits
| SHA1 |
| --- |
| a74354f7a9 |
| ced1d6f331 |
| bb213e7ebd |
| d241e8de90 |
| 0b138d9193 |
| 1c9835dc0a |
| 1285700d7a |
| 7f42bece89 |
| e19107407b |
| f5d6693253 |
| d4e43c50a2 |
| 13aec414ea |
| bb16dcde93 |
| be81a76da9 |
| 2d16089529 |
| 01c3c83653 |
| 9b66103f7c |
| 48dfd72e64 |
| dec92e5b02 |
| 22911ea1ef |
| 12183e5f73 |
| c2b25310fc |
| 0f9420590d |
| 5107c506b4 |
| 9e219ef8dc |
| 6e4083743b |
| 17e95edd9e |
| 468bc62d27 |
| 437c239c11 |
| 8e753f1092 |
| a3b57e11d4 |
| a80dcf9925 |
| 7157af44e0 |
| 1798c4591e |
| dd26263330 |
| 2ba5f8bed1 |
| d0a11958a5 |
| 0ef9b00a75 |
| 840f76e5e5 |
| d8b7d25b80 |
| 6d53929803 |
| c038fbca07 |
| 6a99202492 |
| 42d1dcfa0e |
| b83c8a7d3b |
| b9194cdcf2 |
| 9a1f6b0291 |
| 07f4413aae |
```diff
@@ -1,5 +1,7 @@
 [run]
 branch = True
-source = flaml
+source =
+    flaml
 omit =
     *test*
+    */test/*
+    */flaml/autogen/*
```
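The hunk above is coverage.py's `[run]` configuration: `branch = True` enables branch coverage, `source` scopes measurement to the `flaml` package, and `omit` excludes test and autogen files. A minimal sketch of how it is consumed locally, assuming coverage.py is installed:

```python
# Hypothetical local run mirroring what CI does with this config; coverage.py
# picks up the [run] section automatically from the working directory.
import coverage

cov = coverage.Coverage()  # reads the [run] settings shown above
cov.start()
import flaml  # noqa: F401  # code under measurement

cov.stop()
cov.save()
cov.xml_report(outfile="coverage.xml")  # same artifact the CI workflow uploads
```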
21 .github/workflows/CD.yml vendored

```diff
@@ -12,26 +12,17 @@ jobs:
   deploy:
     strategy:
       matrix:
-        os: ['ubuntu-latest']
-        python-version: [3.8]
+        os: ["ubuntu-latest"]
+        python-version: ["3.12"]
     runs-on: ${{ matrix.os }}
     environment: package
     steps:
       - name: Checkout
-        uses: actions/checkout@v3
-      - name: Cache conda
-        uses: actions/cache@v3
+        uses: actions/checkout@v4
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
         with:
-          path: ~/conda_pkgs_dir
-          key: conda-${{ matrix.os }}-python-${{ matrix.python-version }}-${{ hashFiles('environment.yml') }}
-      - name: Setup Miniconda
-        uses: conda-incubator/setup-miniconda@v2
-        with:
-          auto-update-conda: true
-          auto-activate-base: false
-          activate-environment: hcrystalball
           python-version: ${{ matrix.python-version }}
-          use-only-tar-bz2: true
       - name: Install from source
         # This is required for the pre-commit tests
         shell: pwsh
@@ -42,7 +33,7 @@ jobs:
       - name: Build
         shell: pwsh
         run: |
-          pip install twine
+          pip install twine wheel setuptools
          python setup.py sdist bdist_wheel
       - name: Publish to PyPI
         env:
```
8 .github/workflows/deploy-website.yml vendored

```diff
@@ -37,11 +37,11 @@ jobs:
       - name: setup python
         uses: actions/setup-python@v4
         with:
-          python-version: "3.8"
+          python-version: "3.12"
       - name: pydoc-markdown install
         run: |
           python -m pip install --upgrade pip
-          pip install pydoc-markdown==4.5.0
+          pip install pydoc-markdown==4.7.0 setuptools
       - name: pydoc-markdown run
         run: |
           pydoc-markdown
@@ -73,11 +73,11 @@ jobs:
       - name: setup python
         uses: actions/setup-python@v4
         with:
-          python-version: "3.8"
+          python-version: "3.12"
       - name: pydoc-markdown install
         run: |
           python -m pip install --upgrade pip
-          pip install pydoc-markdown==4.5.0
+          pip install pydoc-markdown==4.7.0 setuptools
       - name: pydoc-markdown run
         run: |
           pydoc-markdown
```
17 .github/workflows/openai.yml vendored

```diff
@@ -4,14 +4,15 @@
 name: OpenAI

 on:
-  pull_request:
-    branches: ['main']
-    paths:
-      - 'flaml/autogen/**'
-      - 'test/autogen/**'
-      - 'notebook/autogen_openai_completion.ipynb'
-      - 'notebook/autogen_chatgpt_gpt4.ipynb'
-      - '.github/workflows/openai.yml'
   workflow_dispatch:
+  # pull_request:
+  #   branches: ['main']
+  #   paths:
+  #     - 'flaml/autogen/**'
+  #     - 'test/autogen/**'
+  #     - 'notebook/autogen_openai_completion.ipynb'
+  #     - 'notebook/autogen_chatgpt_gpt4.ipynb'
+  #     - '.github/workflows/openai.yml'

 permissions: {}
```
114 .github/workflows/python-package.yml vendored

```diff
@@ -14,10 +14,20 @@ on:
       - 'setup.py'
   pull_request:
     branches: ['main']
+    paths:
+      - 'flaml/**'
+      - 'test/**'
+      - 'notebook/**'
+      - '.github/workflows/python-package.yml'
+      - 'setup.py'
   merge_group:
     types: [checks_requested]
+  schedule:
+    # Every other day at 02:00 UTC
+    - cron: '0 2 */2 * *'

-permissions: {}
+permissions:
+  contents: write
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
   cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
@@ -29,8 +39,8 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [ubuntu-latest, macos-latest, windows-2019]
-        python-version: ["3.8", "3.9", "3.10", "3.11"]
+        os: [ubuntu-latest, windows-latest]
+        python-version: ["3.10", "3.11", "3.12"]
     steps:
       - uses: actions/checkout@v4
       - name: Set up Python ${{ matrix.python-version }}
@@ -38,7 +48,7 @@ jobs:
         with:
           python-version: ${{ matrix.python-version }}
       - name: On mac, install libomp to facilitate lgbm and xgboost install
-        if: matrix.os == 'macOS-latest'
+        if: matrix.os == 'macos-latest'
         run: |
           brew update
           brew install libomp
@@ -50,76 +60,76 @@ jobs:
           export LDFLAGS="$LDFLAGS -Wl,-rpath,/usr/local/opt/libomp/lib -L/usr/local/opt/libomp/lib -lomp"
       - name: Install packages and dependencies
         run: |
-          python -m pip install --upgrade pip wheel
+          python -m pip install --upgrade pip wheel setuptools
           pip install -e .
           python -c "import flaml"
           pip install -e .[test]
-      - name: If linux and python<3.11, install ray 2
-        if: matrix.os == 'ubuntu-latest' && matrix.python-version != '3.11'
-        run: |
-          pip install "ray[tune]<2.5.0"
-      - name: If mac and python 3.10, install ray and xgboost 1
-        if: matrix.os == 'macOS-latest' && matrix.python-version == '3.10'
-        run: |
-          pip install -e .[ray]
-          # use macOS to test xgboost 1, but macOS also supports xgboost 2
-          pip install "xgboost<2"
-      - name: If linux, install prophet on python < 3.9
-        if: matrix.os == 'ubuntu-latest' && matrix.python-version == '3.8'
+      - name: On Ubuntu python 3.10, install pyspark 3.4.1
+        if: matrix.python-version == '3.10' && matrix.os == 'ubuntu-latest'
+        run: |
+          pip install pyspark==3.4.1
+          pip list | grep "pyspark"
+      - name: On Ubuntu python 3.11, install pyspark 3.5.1
+        if: matrix.python-version == '3.11' && matrix.os == 'ubuntu-latest'
+        run: |
+          pip install pyspark==3.5.1
+          pip list | grep "pyspark"
+      - name: On Ubuntu python 3.12, install pyspark 4.0.1
+        if: matrix.python-version == '3.12' && matrix.os == 'ubuntu-latest'
+        run: |
+          pip install pyspark==4.0.1
+          pip list | grep "pyspark"
+      # # TODO: support ray
+      # - name: If linux and python<3.11, install ray 2
+      #   if: matrix.os == 'ubuntu-latest' && matrix.python-version < '3.11'
+      #   run: |
+      #     pip install "ray[tune]<2.5.0"
+      - name: Install prophet when on linux
+        if: matrix.os == 'ubuntu-latest'
         run: |
           pip install -e .[forecast]
-      - name: Install vw on python < 3.10
-        if: matrix.python-version == '3.8' || matrix.python-version == '3.9'
+      # TODO: support vw for python 3.10+
+      - name: If linux and python<3.10, install vw
+        if: matrix.os == 'ubuntu-latest' && matrix.python-version < '3.10'
         run: |
           pip install -e .[vw]
-      - name: Test with pytest
-        if: matrix.python-version != '3.10'
+      - name: Pip freeze
         run: |
-          pytest test
+          pip freeze
+      - name: Check dependencies
+        run: |
+          python test/check_dependency.py
+      - name: Clear pip cache
+        run: |
+          pip cache purge
+      - name: Test with pytest
+        timeout-minutes: 120
+        if: matrix.python-version != '3.11'
+        run: |
+          pytest test/ --ignore=test/autogen --reruns 2 --reruns-delay 10
       - name: Coverage
-        if: matrix.python-version == '3.10'
+        timeout-minutes: 120
+        if: matrix.python-version == '3.11'
         run: |
           pip install coverage
-          coverage run -a -m pytest test
+          coverage run -a -m pytest test --ignore=test/autogen --reruns 2 --reruns-delay 10
           coverage xml
       - name: Upload coverage to Codecov
-        if: matrix.python-version == '3.10'
+        if: matrix.python-version == '3.11'
         uses: codecov/codecov-action@v3
         with:
           file: ./coverage.xml
           flags: unittests
+      - name: Save dependencies
+        shell: bash
+        run: |
+          git config --global user.name 'github-actions[bot]'
+          git config --global user.email 'github-actions[bot]@users.noreply.github.com'
+          git config advice.addIgnoredFile false
+          BRANCH=unit-tests-installed-dependencies
+          git fetch origin
+          git checkout -B "$BRANCH" "origin/$BRANCH"
+          pip freeze > installed_all_dependencies_${{ matrix.python-version }}_${{ matrix.os }}.txt
+          python test/check_dependency.py > installed_first_tier_dependencies_${{ matrix.python-version }}_${{ matrix.os }}.txt
+          git add installed_*dependencies*.txt
+          mv coverage.xml ./coverage_${{ matrix.python-version }}_${{ matrix.os }}.xml || true
+          git add -f ./coverage_${{ matrix.python-version }}_${{ matrix.os }}.xml || true
+          git commit -m "Update installed dependencies for Python ${{ matrix.python-version }} on ${{ matrix.os }}" || exit 0
+          git push origin "$BRANCH" --force

-# docs:
-#   runs-on: ubuntu-latest
-#   steps:
-#     - uses: actions/checkout@v3
-#     - name: Setup Python
-#       uses: actions/setup-python@v4
-#       with:
-#         python-version: '3.8'
-#     - name: Compile documentation
-#       run: |
-#         pip install -e .
-#         python -m pip install sphinx sphinx_rtd_theme
-#         cd docs
-#         make html
-#     - name: Deploy to GitHub pages
-#       if: ${{ github.ref == 'refs/heads/main' }}
-#       uses: JamesIves/github-pages-deploy-action@3.6.2
-#       with:
-#         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-#         BRANCH: gh-pages
-#         FOLDER: docs/_build/html
-#         CLEAN: true
```
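To reproduce the workflow's test step outside CI, the same flags can be run locally. A hedged sketch, assuming `pip install -e .[test]` was run (which provides pytest and the rerun plugin behind `--reruns`):

```python
# Mirrors the "Test with pytest" step above; --reruns/--reruns-delay come from
# pytest-rerunfailures, installed via the [test] extra.
import subprocess

subprocess.run(
    ["pytest", "test/", "--ignore=test/autogen", "--reruns", "2", "--reruns-delay", "10"],
    check=True,
)
```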
7 .gitignore vendored

```diff
@@ -60,6 +60,7 @@ coverage.xml
 .hypothesis/
 .pytest_cache/
 cover/
+junit

 # Translations
 *.mo
@@ -172,7 +173,7 @@ test/default
 test/housing.json
 test/nlp/default/transformer_ms/seq-classification.json

-flaml/fabric/fanova/_fanova.c
+flaml/fabric/fanova/*fanova.c
 # local config files
 *.config.local
@@ -184,3 +185,7 @@ notebook/lightning_logs/
 lightning_logs/
 flaml/autogen/extensions/tmp/
 test/autogen/my_tmp/
+catboost_*
+
+# Internal configs
+.pypirc
```
```diff
@@ -1,5 +1,5 @@
 # basic setup
-FROM mcr.microsoft.com/devcontainers/python:3.8
+FROM mcr.microsoft.com/devcontainers/python:3.10
 RUN apt-get update && apt-get -y update
 RUN apt-get install -y sudo git npm
```
56 README.md

````diff
@@ -14,15 +14,9 @@
 <br>
 </p>

-:fire: FLAML supports AutoML and Hyperparameter Tuning in [Microsoft Fabric Data Science](https://learn.microsoft.com/en-us/fabric/data-science/automated-machine-learning-fabric). In addition, we've introduced Python 3.11 support, along with a range of new estimators, and comprehensive integration with MLflow—thanks to contributions from the Microsoft Fabric product team.
+:fire: FLAML supports AutoML and Hyperparameter Tuning in [Microsoft Fabric Data Science](https://learn.microsoft.com/en-us/fabric/data-science/automated-machine-learning-fabric). In addition, we've introduced Python 3.11 and 3.12 support, along with a range of new estimators, and comprehensive integration with MLflow—thanks to contributions from the Microsoft Fabric product team.

-:fire: Heads-up: We have migrated [AutoGen](https://microsoft.github.io/autogen/) into a dedicated [github repository](https://github.com/microsoft/autogen). Alongside this move, we have also launched a dedicated [Discord](https://discord.gg/pAbnFJrkgZ) server and a [website](https://microsoft.github.io/autogen/) for comprehensive documentation.
-
-:fire: The automated multi-agent chat framework in [AutoGen](https://microsoft.github.io/autogen/) is in preview from v2.0.0.
-
-:fire: FLAML is highlighted in OpenAI's [cookbook](https://github.com/openai/openai-cookbook#related-resources-from-around-the-web).
-
-:fire: [autogen](https://microsoft.github.io/autogen/) is released with support for ChatGPT and GPT-4, based on [Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference](https://arxiv.org/abs/2303.04673).
+:fire: Heads-up: [AutoGen](https://microsoft.github.io/autogen/) has moved to a dedicated [GitHub repository](https://github.com/microsoft/autogen). FLAML no longer includes the `autogen` module—please use AutoGen directly.

 ## What is FLAML
@@ -30,7 +24,7 @@ FLAML is a lightweight Python library for efficient automation of machine
 learning and AI operations. It automates workflow based on large language models, machine learning models, etc.
 and optimizes their performance.

-- FLAML enables building next-gen GPT-X applications based on multi-agent conversations with minimal effort. It simplifies the orchestration, automation and optimization of a complex GPT-X workflow. It maximizes the performance of GPT-X models and augments their weakness.
+- FLAML enables economical automation and tuning for ML/AI workflows, including model selection and hyperparameter optimization under resource constraints.
 - For common machine learning tasks like classification and regression, it quickly finds quality models for user-provided data with low computational resources. It is easy to customize or extend. Users can find their desired customizability from a smooth range.
 - It supports fast and economical automatic tuning (e.g., inference hyperparameters for foundation models, configurations in MLOps/LMOps workflows, pipelines, mathematical/statistical models, algorithms, computing experiments, software configurations), capable of handling large search space with heterogeneous evaluation cost and complex constraints/guidance/early stopping.
@@ -40,16 +34,16 @@ FLAML has a .NET implementation in [ML.NET](http://dot.net/ml), an open-source,
 ## Installation

-FLAML requires **Python version >= 3.8**. It can be installed from pip:
+The latest version of FLAML requires **Python >= 3.10 and \< 3.13**. While other Python versions may work for core components, full model support is not guaranteed. FLAML can be installed via `pip`:

 ```bash
 pip install flaml
 ```

-Minimal dependencies are installed without extra options. You can install extra options based on the feature you need. For example, use the following to install the dependencies needed by the [`autogen`](https://microsoft.github.io/autogen/) package.
+Minimal dependencies are installed without extra options. You can install extra options based on the feature you need. For example, use the following to install the dependencies needed by the [`automl`](https://microsoft.github.io/FLAML/docs/Use-Cases/Task-Oriented-AutoML) module.

 ```bash
-pip install "flaml[autogen]"
+pip install "flaml[automl]"
 ```

 Find more options in [Installation](https://microsoft.github.io/FLAML/docs/Installation).
@@ -57,39 +51,6 @@ Each of the [`notebook examples`](https://github.com/microsoft/FLAML/tree/main/n
 ## Quickstart

-- (New) The [autogen](https://microsoft.github.io/autogen/) package enables the next-gen GPT-X applications with a generic multi-agent conversation framework.
-  It offers customizable and conversable agents which integrate LLMs, tools and human.
-  By automating chat among multiple capable agents, one can easily make them collectively perform tasks autonomously or with human feedback, including tasks that require using tools via code. For example,
-
-```python
-from flaml import autogen
-
-assistant = autogen.AssistantAgent("assistant")
-user_proxy = autogen.UserProxyAgent("user_proxy")
-user_proxy.initiate_chat(
-    assistant,
-    message="Show me the YTD gain of 10 largest technology companies as of today.",
-)
-# This initiates an automated chat between the two agents to solve the task
-```
-
-Autogen also helps maximize the utility out of the expensive LLMs such as ChatGPT and GPT-4. It offers a drop-in replacement of `openai.Completion` or `openai.ChatCompletion` with powerful functionalites like tuning, caching, templating, filtering. For example, you can optimize generations by LLM with your own tuning data, success metrics and budgets.
-
-```python
-# perform tuning
-config, analysis = autogen.Completion.tune(
-    data=tune_data,
-    metric="success",
-    mode="max",
-    eval_func=eval_func,
-    inference_budget=0.05,
-    optimization_budget=3,
-    num_samples=-1,
-)
-# perform inference for a test instance
-response = autogen.Completion.create(context=test_instance, **config)
-```
-
 - With three lines of code, you can start using this economical and fast
   AutoML engine as a [scikit-learn style estimator](https://microsoft.github.io/FLAML/docs/Use-Cases/Task-Oriented-AutoML).
@@ -111,7 +72,10 @@ automl.fit(X_train, y_train, task="classification", estimator_list=["lgbm"])
 ```python
 from flaml import tune
-tune.run(evaluation_function, config={…}, low_cost_partial_config={…}, time_budget_s=3600)
+
+tune.run(
+    evaluation_function, config={…}, low_cost_partial_config={…}, time_budget_s=3600
+)
 ```

 - [Zero-shot AutoML](https://microsoft.github.io/FLAML/docs/Use-Cases/Zero-Shot-AutoML) allows using the existing training API from lightgbm, xgboost etc. while getting the benefit of AutoML in choosing high-performance hyperparameter configurations per task.
````
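The Zero-shot AutoML bullet above refers to drop-in estimators from `flaml.default`. A minimal sketch, with names following the linked docs and `X_train`/`y_train`/`X_test` the same variables as in the quickstart:

```python
# Zero-shot AutoML: same API as the underlying lightgbm estimator, but the
# starting hyperparameters are chosen per task from meta-learned defaults.
from flaml.default import LGBMRegressor

model = LGBMRegressor()
model.fit(X_train, y_train)  # the data-dependent configuration is applied here
y_pred = model.predict(X_test)
```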
```diff
@@ -1,4 +1,5 @@
 import logging
+import warnings

 try:
     from flaml.automl import AutoML, logger_formatter
@@ -12,7 +13,8 @@ from flaml.version import __version__

 # Set the root logger.
 logger = logging.getLogger(__name__)
-logger.setLevel(logging.INFO)
+if logger.level == logging.NOTSET:
+    logger.setLevel(logging.INFO)

 if not has_automl:
-    logger.warning("flaml.automl is not available. Please install flaml[automl] to enable AutoML functionalities.")
+    warnings.warn("flaml.automl is not available. Please install flaml[automl] to enable AutoML functionalities.")
```
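The `NOTSET` guard above means flaml only applies its INFO default when the application has not already configured the logger. A small sketch of the behavior this enables:

```python
# An application can now set the flaml logger level before import and keep it;
# previously the import forced the level back to INFO.
import logging

logging.getLogger("flaml").setLevel(logging.WARNING)  # application's choice
import flaml  # noqa: F401  # the guarded setLevel leaves WARNING in place
```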
```diff
@@ -1,3 +1,12 @@
+import warnings
+
 from .agentchat import *
 from .code_utils import DEFAULT_MODEL, FAST_MODEL
 from .oai import *
+
+warnings.warn(
+    "The `flaml.autogen` module is deprecated and will be removed in a future release. "
+    "Please refer to `https://github.com/microsoft/autogen` for latest usage.",
+    DeprecationWarning,
+    stacklevel=2,
+)
```
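With this change, importing `flaml.autogen` emits a `DeprecationWarning` pointing to the new repository. A sketch of how a caller can observe it:

```python
# The warning is raised at import time (stacklevel=2 attributes it to the
# importing module rather than to flaml internals).
import warnings

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    import flaml.autogen  # noqa: F401

assert any(issubclass(w.category, DeprecationWarning) for w in caught)
```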
```diff
@@ -156,7 +156,7 @@ class MathUserProxyAgent(UserProxyAgent):
                 when the number of auto reply reaches the max_consecutive_auto_reply or when is_termination_msg is True.
             default_auto_reply (str or dict or None): the default auto reply message when no code execution or llm based reply is generated.
             max_invalid_q_per_step (int): (ADDED) the maximum number of invalid queries per step.
-            **kwargs (dict): other kwargs in [UserProxyAgent](user_proxy_agent#__init__).
+            **kwargs (dict): other kwargs in [UserProxyAgent](../user_proxy_agent#__init__).
         """
         super().__init__(
             name=name,
```
```diff
@@ -123,7 +123,7 @@ class RetrieveUserProxyAgent(UserProxyAgent):
                 can be found at `https://www.sbert.net/docs/pretrained_models.html`. The default model is a
                 fast model. If you want to use a high performance model, `all-mpnet-base-v2` is recommended.
             - customized_prompt (Optional, str): the customized prompt for the retrieve chat. Default is None.
-            **kwargs (dict): other kwargs in [UserProxyAgent](user_proxy_agent#__init__).
+            **kwargs (dict): other kwargs in [UserProxyAgent](../user_proxy_agent#__init__).
         """
         super().__init__(
             name=name,
```
```diff
@@ -4,12 +4,14 @@
 # * project root for license information.
 from __future__ import annotations

 import inspect
 import json
 import logging
 import os
+import random
+import sys
 import time
 from concurrent.futures import as_completed
 from functools import partial
 from typing import Callable, List, Optional, Union
```
```diff
@@ -176,10 +178,11 @@ class AutoML(BaseEstimator):
                 ['auto', 'cv', 'holdout'].
             split_ratio: A float of the valiation data percentage for holdout.
             n_splits: An integer of the number of folds for cross - validation.
-            log_type: A string of the log type, one of
-                ['better', 'all'].
-                'better' only logs configs with better loss than previos iters
-                'all' logs all the tried configs.
+            log_type: Specifies which logs to save. One of ['better', 'all']. Default is 'better'.
+                - 'better': Logs configs and models (if `model_history` is True) only when the loss improves,
+                    to `log_file_name` and MLflow, respectively.
+                - 'all': Logs all configs and models (if `model_history` is True), regardless of performance.
+                Note: Configs are always logged to MLflow if MLflow logging is enabled.
             model_history: A boolean of whether to keep the best
                 model per estimator. Make sure memory is large enough if setting to True. Default False.
             log_training_metric: A boolean of whether to log the training
@@ -187,9 +190,16 @@ class AutoML(BaseEstimator):
             mem_thres: A float of the memory size constraint in bytes.
             pred_time_limit: A float of the prediction latency constraint in seconds.
                 It refers to the average prediction time per row in validation data.
-            train_time_limit: A float of the training time constraint in seconds.
+            train_time_limit: None or a float of the training time constraint in seconds for each trial.
+                Only valid for sequential search.
             verbose: int, default=3 | Controls the verbosity, higher means more
                 messages.
+                verbose=0: logger level = CRITICAL
+                verbose=1: logger level = ERROR
+                verbose=2: logger level = WARNING
+                verbose=3: logger level = INFO
+                verbose=4: logger level = DEBUG
+                verbose>5: logger level = NOTSET
             retrain_full: bool or str, default=True | whether to retrain the
                 selected model on the full training data when using holdout.
                 True - retrain only after search finishes; False - no retraining;
@@ -203,7 +213,7 @@ class AutoML(BaseEstimator):
                 * Valid str options depend on different tasks.
                 For classification tasks, valid choices are
                 ["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
-                For regression tasks, valid choices are ["auto", 'uniform', 'time'].
+                For regression tasks, valid choices are ["auto", 'uniform', 'time', 'group'].
                 "auto" -> uniform.
                 For time series forecast tasks, must be "auto" or 'time'.
                 For ranking task, must be "auto" or 'group'.
```
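A hedged usage sketch of the documented `log_type` semantics above, with training data as in the README quickstart and parameter names taken from this docstring:

```python
from flaml import AutoML

automl = AutoML()
automl.fit(
    X_train,
    y_train,
    task="classification",
    time_budget=60,
    log_type="all",              # log every tried config, not only improvements
    log_file_name="automl.log",  # destination of the config log
    model_history=False,         # keep only the best model per estimator
)
```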
```diff
@@ -393,6 +403,58 @@ class AutoML(BaseEstimator):
         self._estimator_type = "classifier" if settings["task"] in CLASSIFICATION else "regressor"
         self.best_run_id = None

+    def __getstate__(self):
+        """Customize pickling to avoid serializing runtime-only objects.
+
+        MLflow's sklearn flavor serializes estimators via (cloud)pickle. During
+        AutoML fitting we may attach an internal mlflow integration instance
+        which holds `concurrent.futures.Future` objects and executors containing
+        thread locks, which are not picklable.
+        """
+        state = self.__dict__.copy()
+        # Keep mlflow_integration for post-load visualization (e.g., infos), but
+        # strip non-picklable runtime-only members (thread futures, clients).
+        mlflow_integration = state.get("mlflow_integration", None)
+        if mlflow_integration is not None:
+            import copy
+
+            mi = copy.copy(mlflow_integration)
+            # These are runtime-only and often contain locks / threads.
+            if hasattr(mi, "futures"):
+                mi.futures = {}
+            if hasattr(mi, "futures_log_model"):
+                mi.futures_log_model = {}
+            if hasattr(mi, "train_func"):
+                mi.train_func = None
+            if hasattr(mi, "mlflow_client"):
+                mi.mlflow_client = None
+            state["mlflow_integration"] = mi
+        # MLflow signature objects may hold references to Spark/pandas-on-Spark
+        # inputs and can indirectly capture SparkContext, which is not picklable.
+        state.pop("estimator_signature", None)
+        state.pop("pipeline_signature", None)
+        return state
+
+    def __setstate__(self, state):
+        self.__dict__.update(state)
+        # Ensure mlflow_integration runtime members exist post-unpickle.
+        mi = getattr(self, "mlflow_integration", None)
+        if mi is not None:
+            if not hasattr(mi, "futures") or mi.futures is None:
+                mi.futures = {}
+            if not hasattr(mi, "futures_log_model") or mi.futures_log_model is None:
+                mi.futures_log_model = {}
+            if not hasattr(mi, "train_func"):
+                mi.train_func = None
+            if not hasattr(mi, "mlflow_client") or mi.mlflow_client is None:
+                try:
+                    import mlflow as _mlflow
+
+                    mi.mlflow_client = _mlflow.tracking.MlflowClient()
+                except Exception:
+                    mi.mlflow_client = None
+
     def get_params(self, deep: bool = False) -> dict:
         return self._settings.copy()
```
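A minimal sketch of what the new `__getstate__`/`__setstate__` pair guarantees: a fitted `AutoML` object (here assumed to exist as `automl`) round-trips through pickle even while an mlflow integration with live futures is attached.

```python
import pickle

payload = pickle.dumps(automl)    # __getstate__ strips futures, clients, signatures
restored = pickle.loads(payload)  # __setstate__ re-creates empty runtime members
predictions = restored.predict(X_test)  # the trained model itself survives
```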
```diff
@@ -424,6 +486,8 @@ class AutoML(BaseEstimator):
         If `model_history` was set to True, then the returned model is trained.
         """
         state = self._search_states.get(estimator_name)
+        if state and estimator_name == self._best_estimator:
+            return self.model
         return state and getattr(state, "trained_estimator", None)

     @property
```
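With the two added lines, `best_model_for_estimator` now returns the final model for the winning estimator instead of that estimator's last `trained_estimator`. Usage sketch, assuming a fitted `automl`:

```python
best = automl.best_model_for_estimator(automl.best_estimator)  # same as automl.model
other = automl.best_model_for_estimator("xgboost")  # None if never trained or not kept
```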
```diff
@@ -739,7 +803,7 @@ class AutoML(BaseEstimator):
                 * Valid str options depend on different tasks.
                 For classification tasks, valid choices are
                 ["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
-                For regression tasks, valid choices are ["auto", 'uniform', 'time'].
+                For regression tasks, valid choices are ["auto", 'uniform', 'time', 'group'].
                 "auto" -> uniform.
                 For time series forecast tasks, must be "auto" or 'time'.
                 For ranking task, must be "auto" or 'group'.
```
```diff
@@ -1084,17 +1148,344 @@ class AutoML(BaseEstimator):
         return self._state.data_size[0] if self._sample else None

     def pickle(self, output_file_name):
-        import pickle
-
-        estimator_to_training_function = {}
-        for estimator in self.estimator_list:
-            search_state = self._search_states[estimator]
-            if hasattr(search_state, "training_function"):
-                estimator_to_training_function[estimator] = search_state.training_function
-                del search_state.training_function
-        with open(output_file_name, "wb") as f:
-            pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)
+        """Serialize the AutoML instance to a pickle file.
+
+        Notes:
+            When the trained estimator(s) are Spark-based, they may hold references
+            to SparkContext/SparkSession via Spark ML objects. Such objects are not
+            safely picklable and can cause pickling/broadcast errors.
+
+            This method externalizes Spark ML models into an adjacent artifact
+            directory and stores only lightweight metadata in the pickle.
+        """
+        import os
+        import pickle
+        import re
+
+        def _safe_name(name: str) -> str:
+            return re.sub(r"[^A-Za-z0-9_.-]+", "_", name)
+
+        def _iter_trained_estimators():
+            trained = getattr(self, "_trained_estimator", None)
+            if trained is not None:
+                yield "_trained_estimator", trained
+            for est_name in getattr(self, "estimator_list", []) or []:
+                ss = getattr(self, "_search_states", {}).get(est_name)
+                te = ss and getattr(ss, "trained_estimator", None)
+                if te is not None:
+                    yield f"_search_states.{est_name}.trained_estimator", te
+
+        def _scrub_pyspark_refs(root_obj):
+            """Best-effort removal of pyspark objects prior to pickling.
+
+            SparkContext/SparkSession and Spark DataFrame objects are not picklable.
+            This function finds such objects within common containers and instance
+            attributes and replaces them with None, returning a restore mapping.
+            """
+            try:
+                import pyspark
+                from pyspark.broadcast import Broadcast
+                from pyspark.sql import DataFrame as SparkDataFrame
+                from pyspark.sql import SparkSession
+
+                try:
+                    import pyspark.pandas as ps
+
+                    psDataFrameType = getattr(ps, "DataFrame", None)
+                    psSeriesType = getattr(ps, "Series", None)
+                except Exception:
+                    psDataFrameType = None
+                    psSeriesType = None
+
+                bad_types = [
+                    pyspark.SparkContext,
+                    SparkSession,
+                    SparkDataFrame,
+                    Broadcast,
+                ]
+                if psDataFrameType is not None:
+                    bad_types.append(psDataFrameType)
+                if psSeriesType is not None:
+                    bad_types.append(psSeriesType)
+                bad_types = tuple(t for t in bad_types if t is not None)
+            except Exception:
+                return {}
+
+            restore = {}
+            visited = set()
+
+            def _mark(parent, key, value, path):
+                restore[(id(parent), key)] = (parent, key, value)
+                try:
+                    if isinstance(parent, dict):
+                        parent[key] = None
+                    elif isinstance(parent, list):
+                        parent[key] = None
+                    elif isinstance(parent, tuple):
+                        # tuples are immutable; we can't modify in-place
+                        pass
+                    else:
+                        setattr(parent, key, None)
+                except Exception:
+                    # Best-effort.
+                    pass
+
+            def _walk(obj, depth, parent=None, key=None, path="self"):
+                if obj is None:
+                    return
+                oid = id(obj)
+                if oid in visited:
+                    return
+                visited.add(oid)
+
+                if isinstance(obj, bad_types):
+                    if parent is not None:
+                        _mark(parent, key, obj, path)
+                    return
+                if depth <= 0:
+                    return
+
+                if isinstance(obj, dict):
+                    for k, v in list(obj.items()):
+                        _walk(v, depth - 1, parent=obj, key=k, path=f"{path}[{k!r}]")
+                    return
+                if isinstance(obj, list):
+                    for i, v in enumerate(list(obj)):
+                        _walk(v, depth - 1, parent=obj, key=i, path=f"{path}[{i}]")
+                    return
+                if isinstance(obj, tuple):
+                    # Can't scrub inside tuples safely; but still inspect for diagnostics.
+                    for i, v in enumerate(obj):
+                        _walk(v, depth - 1, parent=None, key=None, path=f"{path}[{i}]")
+                    return
+                if isinstance(obj, set):
+                    for v in list(obj):
+                        _walk(v, depth - 1, parent=None, key=None, path=f"{path}{{...}}")
+                    return
+
+                d = getattr(obj, "__dict__", None)
+                if isinstance(d, dict):
+                    for attr, v in list(d.items()):
+                        _walk(v, depth - 1, parent=obj, key=attr, path=f"{path}.{attr}")
+
+            _walk(root_obj, depth=6)
+            return restore
+
+        # Temporarily remove non-picklable pieces (e.g., SparkContext-backed objects)
+        # and externalize spark models.
+        estimator_to_training_function = {}
+        spark_restore = []
+        artifact_dir = None
+        state_restore = {}
+        automl_restore = {}
+        scrub_restore = {}
+
+        try:
+            # Signatures are only used for MLflow logging; they are not required
+            # for inference and can capture SparkContext via pyspark objects.
+            for attr in ("estimator_signature", "pipeline_signature"):
+                if hasattr(self, attr):
+                    automl_restore[attr] = getattr(self, attr)
+                    setattr(self, attr, None)
+
+            for estimator in self.estimator_list:
+                search_state = self._search_states[estimator]
+                if hasattr(search_state, "training_function"):
+                    estimator_to_training_function[estimator] = search_state.training_function
+                    del search_state.training_function
+
+            # AutoMLState may keep Spark / pandas-on-Spark dataframes which are not picklable.
+            # They are not required for inference, so strip them for serialization.
+            state = getattr(self, "_state", None)
+            if state is not None:
+                for attr in (
+                    "X_train",
+                    "y_train",
+                    "X_train_all",
+                    "y_train_all",
+                    "X_val",
+                    "y_val",
+                    "weight_val",
+                    "groups_val",
+                    "sample_weight_all",
+                    "groups",
+                    "groups_all",
+                    "kf",
+                ):
+                    if hasattr(state, attr):
+                        state_restore[attr] = getattr(state, attr)
+                        setattr(state, attr, None)
+
+            for key, est in _iter_trained_estimators():
+                if getattr(est, "estimator_baseclass", None) != "spark":
+                    continue
+
+                # Drop training data reference (Spark DataFrame / pandas-on-Spark).
+                old_df_train = getattr(est, "df_train", None)
+                old_model = getattr(est, "_model", None)
+
+                model_meta = None
+                if old_model is not None:
+                    if artifact_dir is None:
+                        artifact_dir = output_file_name + ".flaml_artifacts"
+                        os.makedirs(artifact_dir, exist_ok=True)
+                        # store relative dirname so the pickle+folder can be moved together
+                        self._flaml_pickle_artifacts_dirname = os.path.basename(artifact_dir)
+
+                    model_dir = os.path.join(artifact_dir, _safe_name(key))
+                    # Spark ML models are saved as directories.
+                    try:
+                        writer = old_model.write()
+                        writer.overwrite().save(model_dir)
+                    except Exception as e:
+                        raise RuntimeError(
+                            "Failed to externalize Spark model for pickling. "
+                            "Please ensure the Spark ML model supports write().overwrite().save(path)."
+                        ) from e
+
+                    model_meta = {
+                        "path": os.path.relpath(model_dir, os.path.dirname(output_file_name) or "."),
+                        "class": old_model.__class__.__module__ + "." + old_model.__class__.__name__,
+                    }
+                    # Replace in-memory Spark model with metadata only.
+                    est._model = None
+                    est._flaml_spark_model_meta = model_meta
+
+                est.df_train = None
+                spark_restore.append((est, old_model, old_df_train, model_meta))
+
+            with open(output_file_name, "wb") as f:
+                try:
+                    pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)
+                except Exception:
+                    # Some pyspark objects can still be captured indirectly.
+                    scrub_restore = _scrub_pyspark_refs(self)
+                    if scrub_restore:
+                        f.seek(0)
+                        f.truncate()
+                        pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)
+                    else:
+                        raise
+        finally:
+            # Restore training_function and Spark models so current object remains usable.
+            for estimator, tf in estimator_to_training_function.items():
+                self._search_states[estimator].training_function = tf
+
+            for attr, val in automl_restore.items():
+                setattr(self, attr, val)
+
+            state = getattr(self, "_state", None)
+            if state is not None and state_restore:
+                for attr, val in state_restore.items():
+                    setattr(state, attr, val)
+
+            for est, old_model, old_df_train, model_meta in spark_restore:
+                est._model = old_model
+                est.df_train = old_df_train
+                if model_meta is not None and hasattr(est, "_flaml_spark_model_meta"):
+                    delattr(est, "_flaml_spark_model_meta")
+
+            if scrub_restore:
+                for _, (parent, key, value) in scrub_restore.items():
+                    try:
+                        if isinstance(parent, dict):
+                            parent[key] = value
+                        elif isinstance(parent, list):
+                            parent[key] = value
+                        else:
+                            setattr(parent, key, value)
+                    except Exception:
+                        pass
+
+    @classmethod
+    def load_pickle(cls, input_file_name: str, load_spark_models: bool = True):
+        """Load an AutoML instance saved by :meth:`pickle`.
+
+        Args:
+            input_file_name: Path to the pickle file created by :meth:`pickle`.
+            load_spark_models: Whether to load externalized Spark ML models back
+                into the estimator objects. If False, Spark estimators will remain
+                without their underlying Spark model and cannot be used for predict.
+
+        Returns:
+            The deserialized AutoML instance.
+        """
+        import importlib
+        import os
+        import pickle
+
+        with open(input_file_name, "rb") as f:
+            automl = pickle.load(f)
+
+        # Recreate per-estimator training_function if it was removed for pickling.
+        try:
+            for est_name, ss in getattr(automl, "_search_states", {}).items():
+                if not hasattr(ss, "training_function"):
+                    ss.training_function = partial(
+                        AutoMLState._compute_with_config_base,
+                        state=automl._state,
+                        estimator=est_name,
+                    )
+        except Exception:
+            # Best-effort; training_function is only needed for re-searching.
+            pass
+
+        if not load_spark_models:
+            return automl
+
+        base_dir = os.path.dirname(input_file_name) or "."
+
+        def _iter_trained_estimators_loaded():
+            trained = getattr(automl, "_trained_estimator", None)
+            if trained is not None:
+                yield trained
+            for ss in getattr(automl, "_search_states", {}).values():
+                te = ss and getattr(ss, "trained_estimator", None)
+                if te is not None:
+                    yield te
+
+        for est in _iter_trained_estimators_loaded():
+            meta = getattr(est, "_flaml_spark_model_meta", None)
+            if not meta:
+                continue
+            model_path = meta.get("path")
+            model_class = meta.get("class")
+            if not model_path or not model_class:
+                continue
+
+            abs_model_path = os.path.join(base_dir, model_path)
+
+            module_name, _, class_name = model_class.rpartition(".")
+            try:
+                module = importlib.import_module(module_name)
+                model_cls = getattr(module, class_name)
+            except Exception as e:
+                raise RuntimeError(f"Failed to import Spark model class '{model_class}'") from e
+
+            # Most Spark ML models support either Class.load(path) or Class.read().load(path).
+            if hasattr(model_cls, "load"):
+                est._model = model_cls.load(abs_model_path)
+            elif hasattr(model_cls, "read"):
+                est._model = model_cls.read().load(abs_model_path)
+            else:
+                try:
+                    from pyspark.ml.pipeline import PipelineModel
+
+                    loaded_model = PipelineModel.load(abs_model_path)
+                    if not isinstance(loaded_model, model_cls):
+                        raise RuntimeError(
+                            f"Loaded model type '{type(loaded_model).__name__}' does not match expected type '{model_class}'."
+                        )
+                    est._model = loaded_model
+                except Exception as e:
+                    raise RuntimeError(
+                        f"Spark model class '{model_class}' does not support load/read(). "
+                        "Unable to restore Spark model from artifacts."
+                    ) from e
+
+        return automl

     @property
     def trainable(self) -> Callable[[dict], float | None]:
```
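An end-to-end sketch of the `pickle()`/`load_pickle()` pair added above: Spark ML models are externalized to an adjacent `<file>.flaml_artifacts` directory and restored on load, so the pickle itself stays free of SparkContext references (assumes a fitted `automl`).

```python
from flaml import AutoML

automl.pickle("automl.pkl")  # also writes automl.pkl.flaml_artifacts/ for Spark models

restored = AutoML.load_pickle("automl.pkl", load_spark_models=True)
predictions = restored.predict(X_test)
```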
```diff
@@ -1332,7 +1723,8 @@ class AutoML(BaseEstimator):
             mem_thres: A float of the memory size constraint in bytes.
             pred_time_limit: A float of the prediction latency constraint in seconds.
                 It refers to the average prediction time per row in validation data.
-            train_time_limit: None or a float of the training time constraint in seconds.
+            train_time_limit: None or a float of the training time constraint in seconds for each trial.
+                Only valid for sequential search.
             X_val: None or a numpy array or a pandas dataframe of validation data.
             y_val: None or a numpy array or a pandas series of validation labels.
             sample_weight_val: None or a numpy array of the sample weight of
@@ -1345,6 +1737,12 @@ class AutoML(BaseEstimator):
                 for training data.
             verbose: int, default=3 | Controls the verbosity, higher means more
                 messages.
+                verbose=0: logger level = CRITICAL
+                verbose=1: logger level = ERROR
+                verbose=2: logger level = WARNING
+                verbose=3: logger level = INFO
+                verbose=4: logger level = DEBUG
+                verbose>5: logger level = NOTSET
             retrain_full: bool or str, default=True | whether to retrain the
                 selected model on the full training data when using holdout.
                 True - retrain only after search finishes; False - no retraining;
@@ -1358,7 +1756,7 @@ class AutoML(BaseEstimator):
                 * Valid str options depend on different tasks.
                 For classification tasks, valid choices are
                 ["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
-                For regression tasks, valid choices are ["auto", 'uniform', 'time'].
+                For regression tasks, valid choices are ["auto", 'uniform', 'time', 'group'].
                 "auto" -> uniform.
                 For time series forecast tasks, must be "auto" or 'time'.
                 For ranking task, must be "auto" or 'group'.
```
```diff
@@ -1623,6 +2021,13 @@ class AutoML(BaseEstimator):
             _ch.setFormatter(logger_formatter)
             logger.addHandler(_ch)

+        if model_history:
+            logger.warning(
+                "With `model_history` set to `True` by default, all intermediate models are retained in memory, "
+                "which may significantly increase memory usage and slow down training. "
+                "Consider setting `model_history=False` to optimize memory and accelerate the training process."
+            )
+
         if not use_ray and not use_spark and n_concurrent_trials > 1:
             if ray_available:
                 logger.warning(
@@ -1708,7 +2113,7 @@ class AutoML(BaseEstimator):
             if not (mlflow.active_run() is not None or is_autolog_enabled()):
                 self.mlflow_integration.only_history = True
         except KeyError:
-            print("Not in Fabric, Skipped")
+            logger.info("Not in Fabric, Skipped")
         task.validate_data(
             self,
             self._state,
@@ -2132,7 +2537,7 @@ class AutoML(BaseEstimator):
                 use_spark=True,
                 force_cancel=self._force_cancel,
                 mlflow_exp_name=self._mlflow_exp_name,
-                automl_info=(mlflow_log_latency,),  # pass automl info to tune.run
+                automl_info=(mlflow_log_latency, self._log_type),  # pass automl info to tune.run
                 extra_tag=self.autolog_extra_tag,
                 # raise_on_failed_trial=False,
                 # keep_checkpoints_num=1,
```
```diff
@@ -2195,7 +2600,9 @@ class AutoML(BaseEstimator):
             if better or self._log_type == "all":
                 self._log_trial(search_state, estimator)
             if self.mlflow_integration:
-                self.mlflow_integration.record_state(self, search_state, estimator)
+                self.mlflow_integration.record_state(
+                    self, search_state, estimator, better or self._log_type == "all"
+                )

     def _log_trial(self, search_state, estimator):
         if self._training_log:
@@ -2437,7 +2844,9 @@ class AutoML(BaseEstimator):
             if better or self._log_type == "all":
                 self._log_trial(search_state, estimator)
             if self.mlflow_integration:
-                self.mlflow_integration.record_state(self, search_state, estimator)
+                self.mlflow_integration.record_state(
+                    self, search_state, estimator, better or self._log_type == "all"
+                )

         logger.info(
             " at {:.1f}s,\testimator {}'s best error={:.4f},\tbest estimator {}'s best error={:.4f}".format(
```
```diff
@@ -2529,6 +2938,21 @@ class AutoML(BaseEstimator):
             self._selected = state = self._search_states[estimator]
             state.best_config_sample_size = self._state.data_size[0]
             state.best_config = state.init_config[0] if state.init_config else {}
+            self._track_iter = 0
+            self._config_history[self._track_iter] = (estimator, state.best_config, self._state.time_from_start)
+            self._best_iteration = self._track_iter
+            state.val_loss = getattr(state, "val_loss", float("inf"))
+            state.best_loss = getattr(state, "best_loss", float("inf"))
+            state.config = getattr(state, "config", state.best_config.copy())
+            state.metric_for_logging = getattr(state, "metric_for_logging", None)
+            state.sample_size = getattr(state, "sample_size", self._state.data_size[0])
+            state.learner_class = getattr(state, "learner_class", self._state.learner_classes.get(estimator))
+            if hasattr(self, "mlflow_integration") and self.mlflow_integration:
+                self.mlflow_integration.record_state(
+                    automl=self,
+                    search_state=state,
+                    estimator=estimator,
+                )
         elif self._use_ray is False and self._use_spark is False:
             self._search_sequential()
         else:
```
```diff
@@ -2700,16 +3124,47 @@ class AutoML(BaseEstimator):
         ):
             if mlflow.active_run() is None:
                 mlflow.start_run(run_id=self.mlflow_integration.parent_run_id)
-            self.mlflow_integration.log_model(
-                self._trained_estimator.model,
-                self.best_estimator,
-                signature=self.estimator_signature,
-            )
-            self.mlflow_integration.pickle_and_log_automl_artifacts(
-                self, self.model, self.best_estimator, signature=self.pipeline_signature
-            )
+            if self.best_estimator.endswith("_spark"):
+                self.mlflow_integration.log_model(
+                    self._trained_estimator.model,
+                    self.best_estimator,
+                    signature=self.estimator_signature,
+                    run_id=self.mlflow_integration.parent_run_id,
+                )
+            else:
+                self.mlflow_integration.pickle_and_log_automl_artifacts(
+                    self,
+                    self.model,
+                    self.best_estimator,
+                    signature=self.pipeline_signature,
+                    run_id=self.mlflow_integration.parent_run_id,
+                )
         else:
-            logger.info("not retraining because the time budget is too small.")
+            logger.warning("not retraining because the time budget is too small.")
+        self.wait_futures()
+
+    def wait_futures(self):
+        if self.mlflow_integration is not None:
+            logger.debug("Collecting results from submitted record_state tasks")
+            t1 = time.perf_counter()
+            for future in as_completed(self.mlflow_integration.futures):
+                _task = self.mlflow_integration.futures[future]
+                try:
+                    result = future.result()
+                    logger.debug(f"Result for record_state task {_task}: {result}")
+                except Exception as e:
+                    logger.warning(f"Exception for record_state task {_task}: {e}")
+            for future in as_completed(self.mlflow_integration.futures_log_model):
+                _task = self.mlflow_integration.futures_log_model[future]
+                try:
+                    result = future.result()
+                    logger.debug(f"Result for log_model task {_task}: {result}")
+                except Exception as e:
+                    logger.warning(f"Exception for log_model task {_task}: {e}")
+            t2 = time.perf_counter()
+            logger.debug(f"Collecting results from tasks submitted to executors costs {t2-t1} seconds.")
+        else:
+            logger.debug("No futures to wait for.")

     def __del__(self):
         if (
```
```diff
@@ -1,7 +1,7 @@
 try:
     from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
-except ImportError:
-    pass
+except ImportError as e:
+    print(f"scikit-learn is required for HistGradientBoostingEstimator. Please install it; error: {e}")

 from flaml import tune
 from flaml.automl.model import SKLearnEstimator
```
```diff
@@ -2,13 +2,18 @@
 # * Copyright (c) Microsoft Corporation. All rights reserved.
 # * Licensed under the MIT License. See LICENSE file in the
 # * project root for license information.
 import json
 import os
-from datetime import datetime
+import random
+import re
+import uuid
+from datetime import datetime, timedelta
+from decimal import ROUND_HALF_UP, Decimal
 from typing import TYPE_CHECKING, Union

 import numpy as np

-from flaml.automl.spark import DataFrame, Series, pd, ps, psDataFrame, psSeries
+from flaml.automl.spark import DataFrame, F, Series, T, pd, ps, psDataFrame, psSeries
 from flaml.automl.training_log import training_log_reader

 try:
@@ -19,6 +24,7 @@ except ImportError:
 if TYPE_CHECKING:
     from flaml.automl.task import Task

+
 TS_TIMESTAMP_COL = "ds"
 TS_VALUE_COL = "y"
@@ -45,7 +51,10 @@ def load_openml_dataset(dataset_id, data_dir=None, random_state=0, dataset_forma
     """
     import pickle

-    import openml
+    try:
+        import openml
+    except ImportError:
+        openml = None
     from sklearn.model_selection import train_test_split

     filename = "openml_ds" + str(dataset_id) + ".pkl"
@@ -56,15 +65,15 @@ def load_openml_dataset(dataset_id, data_dir=None, random_state=0, dataset_forma
             dataset = pickle.load(f)
     else:
         print("download dataset from openml")
-        dataset = openml.datasets.get_dataset(dataset_id)
+        dataset = openml.datasets.get_dataset(dataset_id) if openml else None
         if not os.path.exists(data_dir):
             os.makedirs(data_dir)
         with open(filepath, "wb") as f:
             pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
-    print("Dataset name:", dataset.name)
+    print("Dataset name:", dataset.name) if dataset else None
     try:
         X, y, *__ = dataset.get_data(target=dataset.default_target_attribute, dataset_format=dataset_format)
-    except ValueError:
+    except (ValueError, AttributeError, TypeError):
         from sklearn.datasets import fetch_openml

         X, y = fetch_openml(data_id=dataset_id, return_X_y=True)
```
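A usage sketch of `load_openml_dataset` with the now-guarded `openml` import; the dataset id is illustrative, and the split return order follows this function's interface:

```python
from flaml.automl.data import load_openml_dataset

# Downloads (or loads the cached pickle of) the OpenML dataset, then splits it.
X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=1169, data_dir="./")
```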
@@ -445,3 +454,343 @@ class DataTransformer:
|
||||
def group_counts(groups):
|
||||
_, i, c = np.unique(groups, return_counts=True, return_index=True)
|
||||
return c[np.argsort(i)]
|
||||
|
||||
|
||||
def get_random_dataframe(n_rows: int = 200, ratio_none: float = 0.1, seed: int = 42) -> DataFrame:
|
||||
"""Generate a random pandas DataFrame with various data types for testing.
|
||||
This function creates a DataFrame with multiple column types including:
|
||||
- Timestamps
|
||||
- Integers
|
||||
- Floats
|
||||
- Categorical values
|
||||
- Booleans
|
||||
- Lists (tags)
|
||||
- Decimal strings
|
||||
- UUIDs
|
||||
- Binary data (as hex strings)
|
||||
- JSON blobs
|
||||
- Nullable text fields
|
||||
Parameters
|
||||
----------
|
||||
n_rows : int, default=200
|
||||
Number of rows in the generated DataFrame
|
||||
ratio_none : float, default=0.1
|
||||
Probability of generating None values in applicable columns
|
||||
seed : int, default=42
|
||||
Random seed for reproducibility
|
||||
Returns
|
||||
-------
|
||||
pd.DataFrame
|
||||
A DataFrame with 14 columns of various data types
|
||||
Examples
|
||||
--------
|
||||
>>> df = get_random_dataframe(100, 0.05, 123)
|
||||
>>> df.shape
|
||||
(100, 14)
|
||||
>>> df.dtypes
|
||||
timestamp datetime64[ns]
|
||||
id int64
|
||||
score float64
|
||||
status object
|
||||
flag object
|
||||
count object
|
||||
value object
|
||||
tags object
|
||||
rating object
|
||||
uuid object
|
||||
binary object
|
||||
json_blob object
|
||||
category category
|
||||
nullable_text object
|
||||
dtype: object
|
||||
"""
|
||||
|
||||
np.random.seed(seed)
|
||||
random.seed(seed)
|
||||
|
||||
def random_tags():
|
||||
tags = ["AI", "ML", "data", "robotics", "vision"]
|
||||
return random.sample(tags, k=random.randint(1, 3)) if random.random() > ratio_none else None
|
||||
|
||||
def random_decimal():
|
||||
return (
|
||||
str(Decimal(random.uniform(1, 5)).quantize(Decimal("0.01"), rounding=ROUND_HALF_UP))
|
||||
if random.random() > ratio_none
|
||||
else None
|
||||
)
|
||||
|
||||
def random_json_blob():
|
||||
blob = {"a": random.randint(1, 10), "b": random.random()}
|
||||
return json.dumps(blob) if random.random() > ratio_none else None
|
||||
|
||||
def random_binary():
|
||||
        return bytes(random.randint(0, 255) for _ in range(4)).hex() if random.random() > ratio_none else None

    data = {
        "timestamp": [
            datetime(2020, 1, 1) + timedelta(days=np.random.randint(0, 1000)) if np.random.rand() > ratio_none else None
            for _ in range(n_rows)
        ],
        "id": range(1, n_rows + 1),
        "score": np.random.uniform(0, 100, n_rows),
        "status": np.random.choice(
            ["active", "inactive", "pending", None],
            size=n_rows,
            p=[(1 - ratio_none) / 3, (1 - ratio_none) / 3, (1 - ratio_none) / 3, ratio_none],
        ),
        "flag": np.random.choice(
            [True, False, None], size=n_rows, p=[(1 - ratio_none) / 2, (1 - ratio_none) / 2, ratio_none]
        ),
        "count": [np.random.randint(0, 100) if np.random.rand() > ratio_none else None for _ in range(n_rows)],
        "value": [round(np.random.normal(50, 15), 2) if np.random.rand() > ratio_none else None for _ in range(n_rows)],
        "tags": [random_tags() for _ in range(n_rows)],
        "rating": [random_decimal() for _ in range(n_rows)],
        "uuid": [str(uuid.uuid4()) if np.random.rand() > ratio_none else None for _ in range(n_rows)],
        "binary": [random_binary() for _ in range(n_rows)],
        "json_blob": [random_json_blob() for _ in range(n_rows)],
        "category": pd.Categorical(
            np.random.choice(
                ["A", "B", "C", None],
                size=n_rows,
                p=[(1 - ratio_none) / 3, (1 - ratio_none) / 3, (1 - ratio_none) / 3, ratio_none],
            )
        ),
        "nullable_text": [random.choice(["Good", "Bad", "Average", None]) for _ in range(n_rows)],
    }

    return pd.DataFrame(data)


def auto_convert_dtypes_spark(
    df: psDataFrame,
    na_values: list = None,
    category_threshold: float = 0.3,
    convert_threshold: float = 0.6,
    sample_ratio: float = 0.1,
) -> tuple[psDataFrame, dict]:
    """Automatically convert data types in a PySpark DataFrame using heuristics.

    This function analyzes a sample of the DataFrame to infer appropriate data types
    and applies the conversions. It handles timestamps, numeric values, booleans,
    and categorical fields.

    Args:
        df: A PySpark DataFrame to convert.
        na_values: List of strings to be considered as NA/NaN. Defaults to
            ['NA', 'na', 'NULL', 'null', ''].
        category_threshold: Maximum ratio of unique values to total values
            to consider a column categorical. Defaults to 0.3.
        convert_threshold: Minimum ratio of successfully converted values required
            to apply a type conversion. Defaults to 0.6.
        sample_ratio: Fraction of data to sample for type inference. Defaults to 0.1.

    Returns:
        tuple: (The DataFrame with converted types, A dictionary mapping column names to
            their inferred types as strings)

    Note:
        - 'category' in the schema dict is conceptual as PySpark doesn't have a true
          category type like pandas
        - The function uses sampling for efficiency with large datasets
    """
    n_rows = df.count()
    if na_values is None:
        na_values = ["NA", "na", "NULL", "null", ""]

    # Normalize NA-like values
    for colname, coltype in df.dtypes:
        if coltype == "string":
            df = df.withColumn(
                colname,
                F.when(F.trim(F.lower(F.col(colname))).isin([v.lower() for v in na_values]), None).otherwise(
                    F.col(colname)
                ),
            )

    schema = {}
    for colname in df.columns:
        # Sample once at an appropriate ratio
        sample_ratio_to_use = min(1.0, sample_ratio if n_rows * sample_ratio > 100 else 100 / n_rows)
        col_sample = df.select(colname).sample(withReplacement=False, fraction=sample_ratio_to_use).dropna()
        sample_count = col_sample.count()

        inferred_type = "string"  # Default

        if col_sample.dtypes[0][1] != "string":
            schema[colname] = col_sample.dtypes[0][1]
            continue

        if sample_count == 0:
            schema[colname] = "string"
            continue

        # Pre-compute timestamp parsing for the timestamp check below
        ts_col = col_sample.withColumn("parsed", F.to_timestamp(F.col(colname)))

        # Check numeric
        if (
            col_sample.withColumn("n", F.col(colname).cast("double")).filter("n is not null").count()
            >= sample_count * convert_threshold
        ):
            # All whole numbers?
            all_whole = (
                col_sample.withColumn("n", F.col(colname).cast("double"))
                .filter("n is not null")
                .withColumn("frac", F.abs(F.col("n") % 1))
                .filter("frac > 0.000001")
                .count()
                == 0
            )
            inferred_type = "int" if all_whole else "double"

        # Check low-cardinality (category-like)
        elif (
            sample_count > 0
            and col_sample.select(F.countDistinct(F.col(colname))).collect()[0][0] / sample_count <= category_threshold
        ):
            inferred_type = "category"  # Will just be string, but marked as such

        # Check if timestamp
        elif ts_col.filter(F.col("parsed").isNotNull()).count() >= sample_count * convert_threshold:
            inferred_type = "timestamp"

        schema[colname] = inferred_type

    # Apply inferred schema
    for colname, inferred_type in schema.items():
        if inferred_type == "int":
            df = df.withColumn(colname, F.col(colname).cast(T.IntegerType()))
        elif inferred_type == "double":
            df = df.withColumn(colname, F.col(colname).cast(T.DoubleType()))
        elif inferred_type == "boolean":
            df = df.withColumn(
                colname,
                F.when(F.lower(F.col(colname)).isin("true", "yes", "1"), True)
                .when(F.lower(F.col(colname)).isin("false", "no", "0"), False)
                .otherwise(None),
            )
        elif inferred_type == "timestamp":
            df = df.withColumn(colname, F.to_timestamp(F.col(colname)))
        elif inferred_type == "category":
            df = df.withColumn(colname, F.col(colname).cast(T.StringType()))  # Marked conceptually

        # otherwise keep as string (or original type)

    return df, schema
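
A minimal usage sketch of the converter above (hedged: assumes an active SparkSession named `spark`; the CSV path and column name are illustrative only):

    # Load everything as strings, then let the heuristics infer the types.
    raw = spark.read.csv("data.csv", header=True, inferSchema=False)  # hypothetical path
    converted, schema = auto_convert_dtypes_spark(raw, convert_threshold=0.6)
    # schema maps column names to inferred type strings, e.g. {"score": "double"}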


def auto_convert_dtypes_pandas(
    df: DataFrame,
    na_values: list = None,
    category_threshold: float = 0.3,
    convert_threshold: float = 0.6,
    sample_ratio: float = 1.0,
) -> tuple[DataFrame, dict]:
    """Automatically convert data types in a pandas DataFrame using heuristics.

    This function analyzes the DataFrame to infer appropriate data types
    and applies the conversions. It handles timestamps, timedeltas, numeric values,
    and categorical fields.

    Args:
        df: A pandas DataFrame to convert.
        na_values: List of strings to be considered as NA/NaN. Defaults to
            ['NA', 'na', 'NULL', 'null', ''].
        category_threshold: Maximum ratio of unique values to total values
            to consider a column categorical. Defaults to 0.3.
        convert_threshold: Minimum ratio of successfully converted values required
            to apply a type conversion. Defaults to 0.6.
        sample_ratio: Fraction of data to sample for type inference. Defaults to 1.0
            (use all rows); kept for API compatibility with the Spark version.

    Returns:
        tuple: (The DataFrame with converted types, A dictionary mapping column names to
            their inferred types as strings)
    """
    if na_values is None:
        na_values = {"NA", "na", "NULL", "null", ""}
    # Remove the empty string separately (handled by the regex `^\s*$`)
    vals = [re.escape(v) for v in na_values if v != ""]
    # Build inner alternation group
    inner = "|".join(vals) if vals else ""
    if inner:
        pattern = re.compile(rf"^\s*(?:{inner})?\s*$")
    else:
        pattern = re.compile(r"^\s*$")

    df_converted = df.convert_dtypes()
    schema = {}

    # Sample if needed (for API compatibility)
    if sample_ratio < 1.0:
        df = df.sample(frac=sample_ratio)

    n_rows = len(df)

    for col in df.columns:
        series = df[col]
        # Replace NA-like values if string
        if series.dtype == object:
            mask = series.astype(str).str.match(pattern)
            series_cleaned = series.where(~mask, np.nan)
        else:
            series_cleaned = series

        # Skip conversion if already non-object data type, except bool which can potentially be categorical
        if (
            not isinstance(series_cleaned.dtype, pd.BooleanDtype)
            and not isinstance(series_cleaned.dtype, pd.StringDtype)
            and series_cleaned.dtype != "object"
        ):
            # Keep the original data type for non-object dtypes
            df_converted[col] = series
            schema[col] = str(series_cleaned.dtype)
            continue

        # print(f"type: {series_cleaned.dtype}, column: {series_cleaned.name}")

        if not isinstance(series_cleaned.dtype, pd.BooleanDtype):
            # Try numeric (int or float)
            numeric = pd.to_numeric(series_cleaned, errors="coerce")
            if numeric.notna().sum() >= n_rows * convert_threshold:
                if (numeric.dropna() % 1 == 0).all():
                    try:
                        df_converted[col] = numeric.astype("int")  # raises if NaNs remain; falls back to double
                        schema[col] = "int"
                        continue
                    except Exception:
                        pass
                df_converted[col] = numeric.astype("double")
                schema[col] = "double"
                continue

            # Try datetime
            datetime_converted = pd.to_datetime(series_cleaned, errors="coerce")
            if datetime_converted.notna().sum() >= n_rows * convert_threshold:
                df_converted[col] = datetime_converted
                schema[col] = "timestamp"
                continue

            # Try timedelta
            try:
                timedelta_converted = pd.to_timedelta(series_cleaned, errors="coerce")
                if timedelta_converted.notna().sum() >= n_rows * convert_threshold:
                    df_converted[col] = timedelta_converted
                    schema[col] = "timedelta"
                    continue
            except TypeError:
                pass

        # Try category
        try:
            unique_ratio = series_cleaned.nunique(dropna=True) / n_rows if n_rows > 0 else 1.0
            if unique_ratio <= category_threshold:
                df_converted[col] = series_cleaned.astype("category")
                schema[col] = "category"
                continue
        except Exception:
            pass
        df_converted[col] = series_cleaned.astype("string")
        schema[col] = "string"

    return df_converted, schema
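
The same call pattern works for the pandas variant (hedged sketch; the frame below is illustrative):

    import pandas as pd

    raw = pd.DataFrame({"when": ["2020-01-01", "2020-02-01", "NA"], "n": ["1", "2", "x"]})
    converted, schema = auto_convert_dtypes_pandas(raw, convert_threshold=0.6)
    # "when" becomes timestamp ("NA" is normalized away, 2 of 3 values parse >= 0.6);
    # "n" becomes double (2 of 3 values are numeric, and the NaN blocks the int cast).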

@@ -1,7 +1,37 @@
import logging
import os


class ColoredFormatter(logging.Formatter):
    # ANSI escape codes for colors
    COLORS = {
        # logging.DEBUG: "\033[36m",  # Cyan
        # logging.INFO: "\033[32m",  # Green
        logging.WARNING: "\033[33m",  # Yellow
        logging.ERROR: "\033[31m",  # Red
        logging.CRITICAL: "\033[1;31m",  # Bright Red
    }
    RESET = "\033[0m"  # Reset to default

    def __init__(self, fmt, datefmt, use_color=True):
        super().__init__(fmt, datefmt)
        self.use_color = use_color

    def format(self, record):
        formatted = super().format(record)
        if self.use_color:
            color = self.COLORS.get(record.levelno, "")
            if color:
                return f"{color}{formatted}{self.RESET}"
        return formatted


logger = logging.getLogger(__name__)
logger_formatter = logging.Formatter(
    "[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s", "%m-%d %H:%M:%S"
use_color = True
if os.getenv("FLAML_LOG_NO_COLOR"):
    use_color = False

logger_formatter = ColoredFormatter(
    "[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s", "%m-%d %H:%M:%S", use_color
)
logger.propagate = False
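
A quick sketch of how the formatter behaves (hedged: the handler wiring is illustrative, not part of the diff):

    import logging

    handler = logging.StreamHandler()
    handler.setFormatter(ColoredFormatter("%(levelname)s - %(message)s", "%H:%M:%S", use_color=True))
    demo = logging.getLogger("flaml.demo")
    demo.addHandler(handler)
    demo.warning("shown in yellow")  # WARNING maps to \033[33m
    demo.error("shown in red")       # ERROR maps to \033[31m

Setting the FLAML_LOG_NO_COLOR environment variable disables the ANSI codes, which is useful when logs are captured to files.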

@@ -127,9 +127,21 @@ def metric_loss_score(
        import datasets

        datasets_metric_name = huggingface_submetric_to_metric.get(metric_name, metric_name.split(":")[0])
        metric = datasets.load_metric(datasets_metric_name, trust_remote_code=True)
        metric_mode = huggingface_metric_to_mode[datasets_metric_name]

        # datasets>=3 removed load_metric; prefer evaluate if available
        try:
            import evaluate

            metric = evaluate.load(datasets_metric_name, trust_remote_code=True)
        except Exception:
            if hasattr(datasets, "load_metric"):
                metric = datasets.load_metric(datasets_metric_name, trust_remote_code=True)
            else:
                from datasets import load_metric as _load_metric  # older datasets

                metric = _load_metric(datasets_metric_name, trust_remote_code=True)

        if metric_name.startswith("seqeval"):
            y_processed_true = [[labels[tr] for tr in each_list] for each_list in y_processed_true]
        elif metric in ("pearsonr", "spearmanr"):
@@ -604,7 +616,12 @@ def _eval_estimator(
        logger.warning(f"ValueError {e} happened in `metric_loss_score`, set `val_loss` to `np.inf`")
    metric_for_logging = {"pred_time": pred_time}
    if log_training_metric:
        train_pred_y = get_y_pred(estimator, X_train, eval_metric, task)
        # For time series forecasting, X_train may be a sampled dataset whose
        # test partition can be empty. Use the training partition from X_val
        # (which is the dataset used to define y_train above) to keep shapes
        # aligned and avoid empty prediction inputs.
        X_train_for_metric = X_val.X_train if isinstance(X_val, TimeSeriesDataset) else X_train
        train_pred_y = get_y_pred(estimator, X_train_for_metric, eval_metric, task)
        metric_for_logging["train_loss"] = metric_loss_score(
            eval_metric,
            train_pred_y,

@@ -9,6 +9,7 @@ import os
import shutil
import signal
import sys
import threading
import time
import warnings
from contextlib import contextmanager
@@ -89,24 +90,28 @@ def limit_resource(memory_limit, time_limit):
        except ValueError:
            # According to https://bugs.python.org/issue40518, it's a mac-specific error.
            pass
    main_thread = False
    if time_limit is not None:
    alarm_set = False
    if time_limit is not None and threading.current_thread() is threading.main_thread():
        try:
            signal.signal(signal.SIGALRM, TimeoutHandler)
            signal.alarm(int(time_limit) or 1)
            main_thread = True
            alarm_set = True
        except ValueError:
            pass

    try:
        yield
    finally:
        if main_thread:
        if alarm_set:
            signal.alarm(0)
        if memory_limit > 0:
            resource.setrlimit(resource.RLIMIT_AS, (soft, hard))
            try:
                resource.setrlimit(resource.RLIMIT_AS, (soft, hard))
            except ValueError:
                pass
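
A hedged usage sketch of the context manager above (POSIX-only; the training call is hypothetical):

    with limit_resource(memory_limit=2 * 1024**3, time_limit=60):
        train_once()  # interrupted via SIGALRM if it runs past 60 seconds

Because SIGALRM can only be installed from the main thread, the alarm is now skipped on worker threads instead of raising ValueError.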


class BaseEstimator:
class BaseEstimator(sklearn.base.ClassifierMixin, sklearn.base.BaseEstimator):
    """The abstract class for all learners.

    Typical examples:
@@ -130,7 +135,8 @@ class BaseEstimator:
        self._task = task if isinstance(task, Task) else task_factory(task, None, None)
        self.params = self.config2params(config)
        self.estimator_class = self._model = None
        if "_estimator_type" in config:
        self.estimator_baseclass = "sklearn"
        if "_estimator_type" in self.params:
            self._estimator_type = self.params.pop("_estimator_type")
        else:
            self._estimator_type = "classifier" if self._task.is_classification() else "regressor"
@@ -434,6 +440,7 @@ class SparkEstimator(BaseEstimator):
            raise SPARK_ERROR
        super().__init__(task, **config)
        self.df_train = None
        self.estimator_baseclass = "spark"

    def _preprocess(
        self,
@@ -969,7 +976,7 @@ class TransformersEstimator(BaseEstimator):
        from .nlp.huggingface.utils import tokenize_text
        from .nlp.utils import is_a_list_of_str

        is_str = str(X.dtypes[0]) in ("string", "str")
        is_str = str(X.dtypes.iloc[0]) in ("string", "str")
        is_list_of_str = is_a_list_of_str(X[list(X.keys())[0]].to_list()[0])

        if is_str or is_list_of_str:
@@ -1691,7 +1698,7 @@ class XGBoostEstimator(SKLearnEstimator):
        # use_label_encoder is deprecated in 1.7.
        if xgboost_version < "1.7.0":
            params["use_label_encoder"] = params.get("use_label_encoder", False)
        if "n_jobs" in config:
        if "n_jobs" in params:
            params["nthread"] = params.pop("n_jobs")
        return params
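
The rename now keys off `params` (the output of `config2params`) rather than the raw `config`. A tiny illustration of the effect, with a made-up parameter dict:

    params = {"n_estimators": 100, "n_jobs": 4}
    if "n_jobs" in params:
        params["nthread"] = params.pop("n_jobs")
    # params == {"n_estimators": 100, "nthread": 4}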

@@ -1891,7 +1898,7 @@ class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
        params = super().config2params(config)
        if "max_leaves" in params:
            params["max_leaf_nodes"] = params.get("max_leaf_nodes", params.pop("max_leaves"))
        if not self._task.is_classification() and "criterion" in config:
        if not self._task.is_classification() and "criterion" in params:
            params.pop("criterion")
        if "random_state" not in params:
            params["random_state"] = 12032022
@@ -2066,8 +2073,8 @@ class CatBoostEstimator(BaseEstimator):
            self.estimator_class = CatBoostRegressor

    def fit(self, X_train, y_train, budget=None, free_mem_ratio=0, **kwargs):
        if "is_retrain" in kwargs:
            kwargs.pop("is_retrain")
        kwargs.pop("is_retrain", None)
        kwargs.pop("groups", None)
        start_time = time.time()
        deadline = start_time + budget if budget else np.inf
        train_dir = f"catboost_{str(start_time)}"
@@ -2342,9 +2349,12 @@ class SGDEstimator(SKLearnEstimator):
        params = super().config2params(config)
        params["tol"] = params.get("tol", 0.0001)
        params["loss"] = params.get("loss", None)
        if params["loss"] is None and self._task.is_classification():
            params["loss"] = "log_loss" if SKLEARN_VERSION >= "1.1" else "log"
        if not self._task.is_classification():
        if params["loss"] is None:
            if self._task.is_classification():
                params["loss"] = "log_loss" if SKLEARN_VERSION >= "1.1" else "log"
            else:
                params["loss"] = "squared_error"
        if not self._task.is_classification() and "n_jobs" in params:
            params.pop("n_jobs")

        if params.get("penalty") != "elasticnet":
@@ -2815,7 +2825,7 @@ class suppress_stdout_stderr:
        # Open a pair of null files
        self.null_fds = [os.open(os.devnull, os.O_RDWR) for x in range(2)]
        # Save the actual stdout (1) and stderr (2) file descriptors.
        self.save_fds = (os.dup(1), os.dup(2))
        self.save_fds = [os.dup(1), os.dup(2)]

    def __enter__(self):
        # Assign the null pointers to stdout and stderr.
@@ -2827,5 +2837,5 @@ class suppress_stdout_stderr:
        os.dup2(self.save_fds[0], 1)
        os.dup2(self.save_fds[1], 2)
        # Close the null files
        os.close(self.null_fds[0])
        os.close(self.null_fds[1])
        for fd in self.null_fds + self.save_fds:
            os.close(fd)
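
Minimal usage sketch (the noisy call is hypothetical). Because the class dups the OS-level file descriptors, it silences output from C extensions as well as Python prints; the exit path now also closes the saved descriptors, fixing a descriptor leak when the context manager is used repeatedly:

    with suppress_stdout_stderr():
        noisy_native_training_call()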

@@ -5,7 +5,7 @@ from typing import List, Optional
from flaml.automl.task.task import NLG_TASKS

try:
    from transformers import TrainingArguments
    from transformers import Seq2SeqTrainingArguments as TrainingArguments
except ImportError:
    TrainingArguments = object

@@ -77,6 +77,14 @@ class TrainingArgumentsForAuto(TrainingArguments):

    logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."})

    # Newer versions of HuggingFace Transformers may access `TrainingArguments.generation_config`
    # (e.g., in generation-aware trainers/callbacks). Keep this attribute to remain compatible
    # while defaulting to None for non-generation tasks.
    generation_config: Optional[object] = field(
        default=None,
        metadata={"help": "Optional generation config (or path) used by generation-aware trainers."},
    )

    @staticmethod
    def load_args_from_console():
        from dataclasses import fields

@@ -396,7 +396,7 @@ def load_model(checkpoint_path, task, num_labels=None):

    if task in (SEQCLASSIFICATION, SEQREGRESSION):
        return AutoModelForSequenceClassification.from_pretrained(
            checkpoint_path, config=model_config, ignore_mismatched_sizes=True
            checkpoint_path, config=model_config, ignore_mismatched_sizes=True, trust_remote_code=True
        )
    elif task == TOKENCLASSIFICATION:
        return AutoModelForTokenClassification.from_pretrained(checkpoint_path, config=model_config)

@@ -1,3 +1,5 @@
import atexit
import logging
import os

os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
@@ -10,13 +12,14 @@ try:
    from pyspark.pandas import Series as psSeries
    from pyspark.pandas import set_option
    from pyspark.sql import DataFrame as sparkDataFrame
    from pyspark.sql import SparkSession
    from pyspark.util import VersionUtils
except ImportError:

    class psDataFrame:
        pass

    F = T = ps = sparkDataFrame = psSeries = psDataFrame
    F = T = ps = sparkDataFrame = SparkSession = psSeries = psDataFrame
    _spark_major_minor_version = set_option = None
    ERROR = ImportError(
        """Please run pip install flaml[spark]
@@ -32,3 +35,60 @@ try:
    from pandas import DataFrame, Series
except ImportError:
    DataFrame = Series = pd = None


logger = logging.getLogger(__name__)


def disable_spark_ansi_mode():
    """Disable Spark ANSI mode if it is enabled."""
    spark = SparkSession.getActiveSession() if hasattr(SparkSession, "getActiveSession") else None
    adjusted = False
    try:
        ps_conf = ps.get_option("compute.fail_on_ansi_mode")
    except Exception:
        ps_conf = None
    ansi_conf = [None, ps_conf]  # ansi_conf and ps_conf original values
    # Spark may store the config as string 'true'/'false' (or boolean in some contexts)
    if spark is not None:
        ansi_conf[0] = spark.conf.get("spark.sql.ansi.enabled")
        ansi_enabled = (
            (isinstance(ansi_conf[0], str) and ansi_conf[0].lower() == "true")
            or (isinstance(ansi_conf[0], bool) and ansi_conf[0] is True)
            or ansi_conf[0] is None
        )
        try:
            if ansi_enabled:
                logger.debug("Adjusting spark.sql.ansi.enabled to false")
                spark.conf.set("spark.sql.ansi.enabled", "false")
                adjusted = True
        except Exception:
            # If reading/setting options fail for some reason, keep going and let
            # pandas-on-Spark raise a meaningful error later.
            logger.exception("Failed to set spark.sql.ansi.enabled")

    if ansi_conf[1]:
        logger.debug("Adjusting pandas-on-Spark compute.fail_on_ansi_mode to False")
        ps.set_option("compute.fail_on_ansi_mode", False)
        adjusted = True

    return spark, ansi_conf, adjusted


def restore_spark_ansi_mode(spark, ansi_conf, adjusted):
    """Restore Spark ANSI mode to its original setting."""
    # Restore the original spark.sql.ansi.enabled to avoid persistent side-effects.
    if adjusted and spark and ansi_conf[0] is not None:
        try:
            logger.debug(f"Restoring spark.sql.ansi.enabled to {ansi_conf[0]}")
            spark.conf.set("spark.sql.ansi.enabled", ansi_conf[0])
        except Exception:
            logger.exception("Failed to restore spark.sql.ansi.enabled")

    if adjusted and ansi_conf[1]:
        logger.debug(f"Restoring pandas-on-Spark compute.fail_on_ansi_mode to {ansi_conf[1]}")
        ps.set_option("compute.fail_on_ansi_mode", ansi_conf[1])


spark, ansi_conf, adjusted = disable_spark_ansi_mode()
atexit.register(restore_spark_ansi_mode, spark, ansi_conf, adjusted)
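
For operations outside module import time, the same pair can bracket a block manually (hedged sketch; the workload is a placeholder):

    spark_, conf_, adj_ = disable_spark_ansi_mode()
    try:
        run_pandas_on_spark_job()  # hypothetical workload that breaks under ANSI mode
    finally:
        restore_spark_ansi_mode(spark_, conf_, adj_)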

@@ -59,17 +59,29 @@ def to_pandas_on_spark(
    ```
    """
    set_option("compute.default_index_type", default_index_type)
    if isinstance(df, (DataFrame, Series)):
        return ps.from_pandas(df)
    elif isinstance(df, sparkDataFrame):
        if _spark_major_minor_version[0] == 3 and _spark_major_minor_version[1] < 3:
            return df.to_pandas_on_spark(index_col=index_col)
    try:
        orig_ps_conf = ps.get_option("compute.fail_on_ansi_mode")
    except Exception:
        orig_ps_conf = None
    if orig_ps_conf:
        ps.set_option("compute.fail_on_ansi_mode", False)

    try:
        if isinstance(df, (DataFrame, Series)):
            return ps.from_pandas(df)
        elif isinstance(df, sparkDataFrame):
            if _spark_major_minor_version[0] == 3 and _spark_major_minor_version[1] < 3:
                return df.to_pandas_on_spark(index_col=index_col)
            else:
                return df.pandas_api(index_col=index_col)
        elif isinstance(df, (psDataFrame, psSeries)):
            return df
        else:
            return df.pandas_api(index_col=index_col)
    elif isinstance(df, (psDataFrame, psSeries)):
        return df
    else:
        raise TypeError(f"{type(df)} is not one of pandas.DataFrame, pandas.Series and pyspark.sql.DataFrame")
            raise TypeError(f"{type(df)} is not one of pandas.DataFrame, pandas.Series and pyspark.sql.DataFrame")
    finally:
        # Restore original config
        if orig_ps_conf:
            ps.set_option("compute.fail_on_ansi_mode", orig_ps_conf)


def train_test_split_pyspark(

@@ -442,8 +442,8 @@ class GenericTask(Task):
                X_train_all, y_train_all = shuffle(X_train_all, y_train_all, random_state=RANDOM_SEED)
            if data_is_df:
                X_train_all.reset_index(drop=True, inplace=True)
                if isinstance(y_train_all, pd.Series):
                    y_train_all.reset_index(drop=True, inplace=True)
            if isinstance(y_train_all, pd.Series):
                y_train_all.reset_index(drop=True, inplace=True)

        X_train, y_train = X_train_all, y_train_all
        state.groups_all = state.groups
@@ -746,7 +746,10 @@ class GenericTask(Task):
            elif isinstance(kf, TimeSeriesSplit):
                kf = kf.split(X_train_split, y_train_split)
            else:
                kf = kf.split(X_train_split)
                try:
                    kf = kf.split(X_train_split)
                except TypeError:
                    kf = kf.split(X_train_split, y_train_split)

            for train_index, val_index in kf:
                if shuffle:
@@ -769,10 +772,10 @@ class GenericTask(Task):
                if not is_spark_dataframe:
                    y_train, y_val = y_train_split[train_index], y_train_split[val_index]
                    if weight is not None:
                        fit_kwargs["sample_weight"], weight_val = (
                            weight[train_index],
                            weight[val_index],
                        fit_kwargs["sample_weight"] = (
                            weight[train_index] if isinstance(weight, np.ndarray) else weight.iloc[train_index]
                        )
                        weight_val = weight[val_index] if isinstance(weight, np.ndarray) else weight.iloc[val_index]
                    if groups is not None:
                        fit_kwargs["groups"] = (
                            groups[train_index] if isinstance(groups, np.ndarray) else groups.iloc[train_index]

@@ -192,7 +192,7 @@ class Task(ABC):
                * Valid str options depend on different tasks.
                For classification tasks, valid choices are
                ["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
                For regression tasks, valid choices are ["auto", 'uniform', 'time'].
                For regression tasks, valid choices are ["auto", 'uniform', 'time', 'group'].
                "auto" -> uniform.
                For time series forecast tasks, must be "auto" or 'time'.
                For ranking task, must be "auto" or 'group'.

@@ -151,7 +151,7 @@ class TimeSeriesTask(Task):
                raise ValueError("Must supply either X_train_all and y_train_all, or dataframe and label")

            try:
                dataframe[self.time_col] = pd.to_datetime(dataframe[self.time_col])
                dataframe.loc[:, self.time_col] = pd.to_datetime(dataframe[self.time_col])
            except Exception:
                raise ValueError(
                    f"For '{TS_FORECAST}' task, time column {self.time_col} must contain timestamp values."
@@ -529,7 +529,7 @@ def remove_ts_duplicates(
    duplicates = X.duplicated()

    if any(duplicates):
        logger.warning("Duplicate timestamp values found in timestamp column. " f"\n{X.loc[duplicates, X][time_col]}")
        logger.warning("Duplicate timestamp values found in timestamp column. " f"\n{X.loc[duplicates, time_col]}")
        X = X.drop_duplicates()
        logger.warning("Removed duplicate rows based on all columns")
    assert (

@@ -76,6 +76,8 @@ class SklearnWrapper:
        self.pca = None

    def fit(self, X: pd.DataFrame, y: pd.Series, **kwargs):
        if "is_retrain" in kwargs:
            kwargs.pop("is_retrain")
        self._X = X
        self._y = y

@@ -92,7 +94,14 @@ class SklearnWrapper:

        for i, model in enumerate(self.models):
            offset = i + self.lags
            model.fit(X_trans[: len(X) - offset], y[offset:], **fit_params)
            if len(X) - offset > 2:
                # Series of length 2 trigger "All features are either constant or ignored."
                # TODO: see why the non-constant features are ignored. Selector?
                model.fit(X_trans[: len(X) - offset], y[offset:], **fit_params)
            elif len(X) > offset and "catboost" not in str(model).lower():
                model.fit(X_trans[: len(X) - offset], y[offset:], **fit_params)
            else:
                print("[INFO]: Length of data should be longer than period + lags.")
        return self

    def predict(self, X, X_train=None, y_train=None):

@@ -264,7 +264,8 @@ class TCNEstimator(TimeSeriesEstimator):
    def predict(self, X):
        X = self.enrich(X)
        if isinstance(X, TimeSeriesDataset):
            df = X.X_val
            # Use X_train if X_val is empty (e.g., when computing training metrics)
            df = X.X_val if len(X.test_data) > 0 else X.X_train
        else:
            df = X
        dataset = DataframeDataset(

@@ -1,3 +1,4 @@
import inspect
import time

try:
@@ -106,12 +107,17 @@ class TemporalFusionTransformerEstimator(TimeSeriesEstimator):
    def fit(self, X_train, y_train, budget=None, **kwargs):
        import warnings

        import pytorch_lightning as pl
        try:
            import lightning.pytorch as pl
            from lightning.pytorch.callbacks import EarlyStopping, LearningRateMonitor
            from lightning.pytorch.loggers import TensorBoardLogger
        except ImportError:
            import pytorch_lightning as pl
            from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
            from pytorch_lightning.loggers import TensorBoardLogger
        import torch
        from pytorch_forecasting import TemporalFusionTransformer
        from pytorch_forecasting.metrics import QuantileLoss
        from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
        from pytorch_lightning.loggers import TensorBoardLogger

        # a bit of monkey patching to fix the MacOS test
        # all the log_prediction method appears to do is plot stuff, which seems to break github tests
@@ -132,12 +138,26 @@ class TemporalFusionTransformerEstimator(TimeSeriesEstimator):
        lr_logger = LearningRateMonitor()  # log the learning rate
        logger = TensorBoardLogger(kwargs.get("log_dir", "lightning_logs"))  # logging results to a tensorboard
        default_trainer_kwargs = dict(
            gpus=self._kwargs.get("gpu_per_trial", [0]) if torch.cuda.is_available() else None,
            max_epochs=max_epochs,
            gradient_clip_val=gradient_clip_val,
            callbacks=[lr_logger, early_stop_callback],
            logger=logger,
        )

        # PyTorch Lightning >=2.0 replaced `gpus` with `accelerator`/`devices`.
        # Also, passing `gpus=None` is not accepted on newer versions.
        trainer_sig_params = inspect.signature(pl.Trainer.__init__).parameters
        if torch.cuda.is_available() and "gpus" in trainer_sig_params:
            gpus = self._kwargs.get("gpu_per_trial", None)
            if gpus is not None:
                default_trainer_kwargs["gpus"] = gpus
        elif torch.cuda.is_available() and "devices" in trainer_sig_params:
            devices = self._kwargs.get("gpu_per_trial", None)
            if devices == -1:
                devices = "auto"
            if devices is not None:
                default_trainer_kwargs["accelerator"] = "gpu"
                default_trainer_kwargs["devices"] = devices
        trainer = pl.Trainer(
            **default_trainer_kwargs,
        )
@@ -157,7 +177,14 @@ class TemporalFusionTransformerEstimator(TimeSeriesEstimator):
            val_dataloaders=val_dataloader,
        )
        best_model_path = trainer.checkpoint_callback.best_model_path
        best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path)
        # PyTorch 2.6 changed `torch.load` default `weights_only` from False -> True.
        # Some Lightning checkpoints (including those produced here) can require full unpickling.
        # This path is generated locally during training, so it's trusted.
        load_sig_params = inspect.signature(TemporalFusionTransformer.load_from_checkpoint).parameters
        if "weights_only" in load_sig_params:
            best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path, weights_only=False)
        else:
            best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path)
        train_time = time.time() - current_time
        self._model = best_tft
        return train_time
@@ -170,7 +197,11 @@ class TemporalFusionTransformerEstimator(TimeSeriesEstimator):
        last_data_cols = self.group_ids.copy()
        last_data_cols.append(self.target_names[0])
        last_data = self.data[lambda x: x.time_idx == x.time_idx.max()][last_data_cols]
        decoder_data = X.X_val if isinstance(X, TimeSeriesDataset) else X
        # Use X_train if test_data is empty (e.g., when computing training metrics)
        if isinstance(X, TimeSeriesDataset):
            decoder_data = X.X_val if len(X.test_data) > 0 else X.X_train
        else:
            decoder_data = X
        if "time_idx" not in decoder_data:
            decoder_data = add_time_idx_col(decoder_data)
            decoder_data["time_idx"] += encoder_data["time_idx"].max() + 1 - decoder_data["time_idx"].min()

@@ -9,6 +9,7 @@ import numpy as np
try:
    import pandas as pd
    from pandas import DataFrame, Series, to_datetime
    from pandas.api.types import is_datetime64_any_dtype
    from scipy.sparse import issparse
    from sklearn.compose import ColumnTransformer
    from sklearn.impute import SimpleImputer
@@ -120,7 +121,12 @@ class TimeSeriesDataset:

    @property
    def X_all(self) -> pd.DataFrame:
        return pd.concat([self.X_train, self.X_val], axis=0)
        # Remove empty or all-NA columns before concatenation
        X_train_filtered = self.X_train.dropna(axis=1, how="all")
        X_val_filtered = self.X_val.dropna(axis=1, how="all")

        # Concatenate the filtered DataFrames
        return pd.concat([X_train_filtered, X_val_filtered], axis=0)

    @property
    def y_train(self) -> pd.DataFrame:
@@ -392,6 +398,15 @@ class DataTransformerTS:
        assert len(self.num_columns) == 0, "Trying to call fit() twice, something is wrong"

        for column in X.columns:
            # Never treat the time column as a feature for sklearn preprocessing
            if column == self.time_col:
                continue

            # Robust datetime detection (covers datetime64[ms/us/ns], tz-aware, etc.)
            if is_datetime64_any_dtype(X[column]):
                self.datetime_columns.append(column)
                continue

            # sklearn/utils/validation.py needs int/float values
            if X[column].dtype.name in ("object", "category", "string"):
                if (
@@ -462,7 +477,7 @@ class DataTransformerTS:
                if "__NAN__" not in X[col].cat.categories:
                    X[col] = X[col].cat.add_categories("__NAN__").fillna("__NAN__")
            else:
                X[col] = X[col].fillna("__NAN__")
                X[col] = X[col].fillna("__NAN__").infer_objects(copy=False)
                X[col] = X[col].astype("category")

        for column in self.num_columns:

@@ -194,7 +194,13 @@ class Orbit(TimeSeriesEstimator):

        elif isinstance(X, TimeSeriesDataset):
            data = X
            X = data.test_data[[self.time_col] + X.regressors]
            # By default we predict on the dataset's test partition.
            # Some internal call paths (e.g., training-metric logging) may pass a
            # dataset whose test partition is empty; fall back to train partition.
            if data.test_data is not None and len(data.test_data):
                X = data.test_data[data.regressors + [data.time_col]]
            else:
                X = data.train_data[data.regressors + [data.time_col]]

        if self._model is not None:
            forecast = self._model.predict(X, **kwargs)
@@ -301,7 +307,13 @@ class Prophet(TimeSeriesEstimator):

        if isinstance(X, TimeSeriesDataset):
            data = X
            X = data.test_data[data.regressors + [data.time_col]]
            # By default we predict on the dataset's test partition.
            # Some internal call paths (e.g., training-metric logging) may pass a
            # dataset whose test partition is empty; fall back to train partition.
            if data.test_data is not None and len(data.test_data):
                X = data.test_data[data.regressors + [data.time_col]]
            else:
                X = data.train_data[data.regressors + [data.time_col]]

        X = X.rename(columns={self.time_col: "ds"})
        if self._model is not None:
@@ -327,11 +339,19 @@ class StatsModelsEstimator(TimeSeriesEstimator):

        if isinstance(X, TimeSeriesDataset):
            data = X
            X = data.test_data[data.regressors + [data.time_col]]
            # By default we predict on the dataset's test partition.
            # Some internal call paths (e.g., training-metric logging) may pass a
            # dataset whose test partition is empty; fall back to train partition.
            if data.test_data is not None and len(data.test_data):
                X = data.test_data[data.regressors + [data.time_col]]
            else:
                X = data.train_data[data.regressors + [data.time_col]]
        else:
            X = X[self.regressors + [self.time_col]]

        if isinstance(X, DataFrame):
            if X.shape[0] == 0:
                return pd.Series([], name=self.target_names[0], dtype=float)
            start = X[self.time_col].iloc[0]
            end = X[self.time_col].iloc[-1]
            if len(self.regressors):
@@ -829,6 +849,13 @@ class TS_SKLearn(TimeSeriesEstimator):
        if isinstance(X, TimeSeriesDataset):
            data = X
            X = data.test_data
            # By default we predict on the dataset's test partition.
            # Some internal call paths (e.g., training-metric logging) may pass a
            # dataset whose test partition is empty; fall back to train partition.
            if data.test_data is not None and len(data.test_data):
                X = data.test_data
            else:
                X = data.train_data

        if self._model is not None:
            X = X[self.regressors]

@@ -32,6 +32,7 @@ def construct_portfolio(regret_matrix, meta_features, regret_bound):
    if meta_features is not None:
        scaler = RobustScaler()
        meta_features = meta_features.loc[tasks]
        meta_features = meta_features.astype(float)
        meta_features.loc[:, :] = scaler.fit_transform(meta_features)
        nearest_task = {}
        for t in tasks:

@@ -26,6 +26,7 @@ def config_predictor_tuple(tasks, configs, meta_features, regret_matrix):
    # pre-processing
    scaler = RobustScaler()
    meta_features_norm = meta_features.loc[tasks]  # this makes a copy
    meta_features_norm = meta_features_norm.astype(float)
    meta_features_norm.loc[:, :] = scaler.fit_transform(meta_features_norm)

    proc = {

@@ -1,10 +1,14 @@
import atexit
import functools
import json
import logging
import os
import pickle
import random
import sys
import tempfile
import time
import warnings
from concurrent.futures import ThreadPoolExecutor, wait
from typing import MutableMapping

import mlflow
@@ -12,14 +16,15 @@ import pandas as pd
from mlflow.entities import Metric, Param, RunTag
from mlflow.exceptions import MlflowException
from mlflow.utils.autologging_utils import AUTOLOGGING_INTEGRATIONS, autologging_is_disabled
from packaging.requirements import Requirement
from scipy.sparse import issparse
from sklearn import tree

try:
    from pyspark.ml import Pipeline as SparkPipeline
    from pyspark.ml import PipelineModel as SparkPipelineModel
except ImportError:

    class SparkPipeline:
    class SparkPipelineModel:
        pass


@@ -32,6 +37,84 @@ from flaml.version import __version__

SEARCH_MAX_RESULTS = 5000  # Each train should not have more than 5000 trials
IS_RENAME_CHILD_RUN = os.environ.get("FLAML_IS_RENAME_CHILD_RUN", "false").lower() == "true"
REMOVE_REQUIREMENT_LIST = [
    "synapseml-cognitive",
    "synapseml-core",
    "synapseml-deep-learning",
    "synapseml-internal",
    "synapseml-mlflow",
    "synapseml-opencv",
    "synapseml-vw",
    "synapseml-lightgbm",
    "synapseml-utils",
    "nni",
    "optuna",
]
OPTIONAL_REMOVE_REQUIREMENT_LIST = ["pytorch-lightning", "transformers"]

os.environ["MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR"] = os.environ.get("MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR", "false")

MLFLOW_NUM_WORKERS = int(os.environ.get("FLAML_MLFLOW_NUM_WORKERS", os.cpu_count() * 4 if os.cpu_count() else 2))
executor = ThreadPoolExecutor(max_workers=MLFLOW_NUM_WORKERS)
atexit.register(lambda: executor.shutdown(wait=True))

IS_CLEAN_LOGS = os.environ.get("FLAML_IS_CLEAN_LOGS", "1")
if IS_CLEAN_LOGS == "1":
    logging.getLogger("synapse.ml").setLevel(logging.CRITICAL)
    logging.getLogger("mlflow.utils").setLevel(logging.CRITICAL)
    logging.getLogger("mlflow.utils.environment").setLevel(logging.CRITICAL)
    logging.getLogger("mlflow.models.model").setLevel(logging.CRITICAL)
    warnings.simplefilter("ignore", category=FutureWarning)
    warnings.simplefilter("ignore", category=UserWarning)


def convert_requirement(requirement_list: list[str]):
    ret = (
        [Requirement(s.strip().lower()) for s in requirement_list]
        if mlflow.__version__ <= "2.17.0"
        else requirement_list
    )
    return ret


def time_it(func_or_code=None):
    """
    Decorator or function that measures execution time.

    Can be used in three ways:
    1. As a decorator with no arguments: @time_it
    2. As a decorator with arguments: @time_it()
    3. As a function call with a string of code to execute and time: time_it("some_code()")

    Args:
        func_or_code (callable or str, optional): Either a function to decorate or
            a string of code to execute and time.

    Returns:
        callable or None: Returns a decorated function if used as a decorator,
            or None if used to execute a string of code.
    """

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            start_time = time.time()
            result = func(*args, **kwargs)
            end_time = time.time()
            logger.debug(f"Execution of {func.__name__} took {end_time - start_time:.4f} seconds")
            return result

        return wrapper

    if callable(func_or_code):
        return decorator(func_or_code)
    elif func_or_code is None:
        return decorator
    else:
        start_time = time.time()
        exec(func_or_code)
        end_time = time.time()
        logger.debug(f"Execution\n```\n{func_or_code}\n```\ntook {end_time - start_time:.4f} seconds")


def flatten_dict(d: MutableMapping, sep: str = ".") -> MutableMapping:
@@ -49,23 +132,28 @@ def is_autolog_enabled():
    return not all(autologging_is_disabled(k) for k in AUTOLOGGING_INTEGRATIONS.keys())


def get_mlflow_log_latency(model_history=False):
def get_mlflow_log_latency(model_history=False, delete_run=True):
    try:
        FLAML_MLFLOW_LOG_LATENCY = float(os.getenv("FLAML_MLFLOW_LOG_LATENCY", 0))
    except ValueError:
        FLAML_MLFLOW_LOG_LATENCY = 0
    if FLAML_MLFLOW_LOG_LATENCY >= 0.1:
        return FLAML_MLFLOW_LOG_LATENCY
    st = time.time()
    with mlflow.start_run(nested=True, run_name="get_mlflow_log_latency") as run:
        if model_history:
            sk_model = tree.DecisionTreeClassifier()
            mlflow.sklearn.log_model(sk_model, "sk_models")
            mlflow.sklearn.log_model(Pipeline([("estimator", sk_model)]), "sk_pipeline")
            mlflow.sklearn.log_model(sk_model, "model")
            with tempfile.TemporaryDirectory() as tmpdir:
                pickle_fpath = os.path.join(tmpdir, f"tmp_{int(time.time()*1000)}")
                pickle_fpath = os.path.join(tmpdir, f"tmp_{int(time.time() * 1000)}")
                with open(pickle_fpath, "wb") as f:
                    pickle.dump(sk_model, f)
                mlflow.log_artifact(pickle_fpath, "sk_model1")
                mlflow.log_artifact(pickle_fpath, "sk_model2")
                mlflow.log_artifact(pickle_fpath, "sk_model")
        mlflow.set_tag("synapseml.ui.visible", "false")  # not shown inline in fabric
        mlflow.delete_run(run.info.run_id)
        if delete_run:
            mlflow.delete_run(run.info.run_id)
    et = time.time()
    return et - st
    return 3 * (et - st)
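
The measured probe can be bypassed entirely (hedged sketch): any FLAML_MLFLOW_LOG_LATENCY value of at least 0.1 short-circuits the timing run.

    import os

    os.environ["FLAML_MLFLOW_LOG_LATENCY"] = "0.5"
    latency = get_mlflow_log_latency(model_history=True)  # returns 0.5 without logging anything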


def infer_signature(X_train=None, y_train=None, dataframe=None, label=None):
@@ -98,12 +186,76 @@ def infer_signature(X_train=None, y_train=None, dataframe=None, label=None):
    )


def update_and_install_requirements(
    run_id=None,
    model_name=None,
    model_version=None,
    remove_list=None,
    artifact_path="model",
    dst_path=None,
    install_with_ipython=False,
):
    if not (run_id or (model_name and model_version)):
        raise ValueError(
            "Please provide `run_id` or both `model_name` and `model_version`. If all three are provided, `run_id` will be used."
        )

    if install_with_ipython:
        from IPython import get_ipython

    if not remove_list:
        remove_list = [
            "synapseml-cognitive",
            "synapseml-core",
            "synapseml-deep-learning",
            "synapseml-internal",
            "synapseml-mlflow",
            "synapseml-opencv",
            "synapseml-vw",
            "synapseml-lightgbm",
            "synapseml-utils",
            "flaml",  # flaml is needed for AutoML models, should be pre-installed in the runtime
            "pyspark",  # fabric internal pyspark should be pre-installed in the runtime
        ]

    # Download model artifacts
    client = mlflow.MlflowClient()
    if not run_id:
        run_id = client.get_model_version(model_name, model_version).run_id
    if not dst_path:
        dst_path = os.path.join(tempfile.gettempdir(), "model_artifacts")
    os.makedirs(dst_path, exist_ok=True)
    client.download_artifacts(run_id, artifact_path, dst_path)
    requirements_path = os.path.join(dst_path, artifact_path, "requirements.txt")
    with open(requirements_path) as f:
        reqs = f.read().splitlines()
    old_reqs = [Requirement(req) for req in reqs if req]
    old_reqs_dict = {req.name: str(req) for req in old_reqs}
    for req in remove_list:
        req = Requirement(req)
        if req.name in old_reqs_dict:
            old_reqs_dict.pop(req.name, None)
    new_reqs_list = list(old_reqs_dict.values())

    with open(requirements_path, "w") as f:
        f.write("\n".join(new_reqs_list))

    if install_with_ipython:
        get_ipython().run_line_magic("pip", f"install -r {requirements_path} -q")
    else:
        logger.info(f"You can run `pip install -r {requirements_path}` to install dependencies.")
    return requirements_path
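
Hedged usage sketch (the registry identifiers are placeholders): trim the runtime-provided packages from a logged model's requirements, then install the rest manually.

    req_path = update_and_install_requirements(
        model_name="my_registered_model",  # hypothetical registry entry
        model_version=1,
        install_with_ipython=False,
    )
    # then: pip install -r <req_path>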
|
||||
|
||||
|
||||
def _mlflow_wrapper(evaluation_func, mlflow_exp_id, mlflow_config=None, extra_tags=None, autolog=False):
|
||||
def wrapped(*args, **kwargs):
|
||||
if mlflow_config is not None:
|
||||
from synapse.ml.mlflow import set_mlflow_env_config
|
||||
try:
|
||||
from synapse.ml.mlflow import set_mlflow_env_config
|
||||
|
||||
set_mlflow_env_config(mlflow_config)
|
||||
set_mlflow_env_config(mlflow_config)
|
||||
except Exception:
|
||||
pass
|
||||
import mlflow
|
||||
|
||||
if mlflow_exp_id is not None:
|
||||
@@ -124,7 +276,20 @@ def _mlflow_wrapper(evaluation_func, mlflow_exp_id, mlflow_config=None, extra_ta
|
||||
|
||||
|
||||
def _get_notebook_name():
|
||||
return None
|
||||
try:
|
||||
import re
|
||||
|
||||
from synapse.ml.mlflow import get_mlflow_env_config
|
||||
from synapse.ml.mlflow.shared_platform_utils import get_artifact
|
||||
|
||||
notebook_id = get_mlflow_env_config(False).artifact_id
|
||||
current_notebook = get_artifact(notebook_id)
|
||||
notebook_name = re.sub("\\W+", "-", current_notebook.displayName).strip()
|
||||
|
||||
return notebook_name
|
||||
except Exception as e:
|
||||
logger.debug(f"Failed to get notebook name: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def safe_json_dumps(obj):
|
||||
@@ -163,6 +328,8 @@ class MLflowIntegration:
|
||||
self.has_model = False
|
||||
self.only_history = False
|
||||
self._do_log_model = True
|
||||
self.futures = {}
|
||||
self.futures_log_model = {}
|
||||
|
||||
self.extra_tag = (
|
||||
extra_tag
|
||||
@@ -170,6 +337,9 @@ class MLflowIntegration:
|
||||
else {"extra_tag.sid": f"flaml_{__version__}_{int(time.time())}_{random.randint(1001, 9999)}"}
|
||||
)
|
||||
self.start_time = time.time()
|
||||
self.experiment_type = experiment_type
|
||||
self.update_autolog_state()
|
||||
|
||||
self.mlflow_client = mlflow.tracking.MlflowClient()
|
||||
parent_run_info = mlflow.active_run().info if mlflow.active_run() is not None else None
|
||||
if parent_run_info:
|
||||
@@ -188,8 +358,6 @@ class MLflowIntegration:
|
||||
mlflow.set_experiment(experiment_name=mlflow_exp_name)
|
||||
self.experiment_id = mlflow.tracking.fluent._active_experiment_id
|
||||
self.experiment_name = mlflow.get_experiment(self.experiment_id).name
|
||||
self.experiment_type = experiment_type
|
||||
self.update_autolog_state()
|
||||
|
||||
if self.autolog:
|
||||
# only end user created parent run in autolog scenario
|
||||
@@ -197,9 +365,12 @@ class MLflowIntegration:
|
||||
|
||||
def set_mlflow_config(self):
|
||||
if self.driver_mlflow_env_config is not None:
|
||||
from synapse.ml.mlflow import set_mlflow_env_config
|
||||
try:
|
||||
from synapse.ml.mlflow import set_mlflow_env_config
|
||||
|
||||
set_mlflow_env_config(self.driver_mlflow_env_config)
|
||||
set_mlflow_env_config(self.driver_mlflow_env_config)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def wrap_evaluation_function(self, evaluation_function):
|
||||
wrapped_evaluation_function = _mlflow_wrapper(
|
||||
@@ -267,6 +438,7 @@ class MLflowIntegration:
|
||||
else:
|
||||
_tags = []
|
||||
self.mlflow_client.log_batch(run_id=target_id, metrics=_metrics, params=[], tags=_tags)
|
||||
return f"Successfully copy_mlflow_run run_id {src_id} to run_id {target_id}"
|
||||
|
||||
def record_trial(self, result, trial, metric):
|
||||
if isinstance(result, dict):
|
||||
@@ -334,12 +506,31 @@ class MLflowIntegration:
|
||||
self.copy_mlflow_run(best_mlflow_run_id, self.parent_run_id)
|
||||
self.has_summary = True
|
||||
|
||||
def log_model(self, model, estimator, signature=None):
|
||||
def log_model(self, model, estimator, signature=None, run_id=None):
|
||||
if not self._do_log_model:
|
||||
return
|
||||
logger.debug(f"logging model {estimator}")
|
||||
ret_message = f"Successfully log_model {estimator} to run_id {run_id}"
|
||||
optional_remove_list = (
|
||||
[] if estimator in ["transformer", "transformer_ms", "tcn", "tft"] else OPTIONAL_REMOVE_REQUIREMENT_LIST
|
||||
)
|
||||
run = mlflow.active_run()
|
||||
if run and run.info.run_id == self.parent_run_id:
|
||||
logger.debug(
|
||||
f"Current active run_id {run.info.run_id} == parent_run_id {self.parent_run_id}, Starting run_id {run_id}"
|
||||
)
|
||||
mlflow.start_run(run_id=run_id, nested=True)
|
||||
elif run and run.info.run_id != run_id:
|
||||
ret_message = (
|
||||
f"Error: Should log_model {estimator} to run_id {run_id}, but logged to run_id {run.info.run_id}"
|
||||
)
|
||||
logger.error(ret_message)
|
||||
else:
|
||||
logger.debug(f"No active run, start run_id {run_id}")
|
||||
mlflow.start_run(run_id=run_id)
|
||||
logger.debug(f"logged model {estimator} to run_id {mlflow.active_run().info.run_id}")
|
||||
if estimator.endswith("_spark"):
|
||||
mlflow.spark.log_model(model, estimator, signature=signature)
|
||||
# mlflow.spark.log_model(model, estimator, signature=signature)
|
||||
mlflow.spark.log_model(model, "model", signature=signature)
|
||||
elif estimator in ["lgbm"]:
|
||||
mlflow.lightgbm.log_model(model, estimator, signature=signature)
|
||||
@@ -352,42 +543,93 @@ class MLflowIntegration:
|
||||
elif estimator in ["prophet"]:
|
||||
mlflow.prophet.log_model(model, estimator, signature=signature)
|
||||
elif estimator in ["orbit"]:
|
||||
pass
|
||||
logger.warning(f"Unsupported model: {estimator}. No model logged.")
|
||||
else:
|
||||
mlflow.sklearn.log_model(model, estimator, signature=signature)
|
||||
future = executor.submit(
|
||||
lambda: mlflow.models.model.update_model_requirements(
|
||||
model_uri=f"runs:/{run_id}/{'model' if estimator.endswith('_spark') else estimator}",
|
||||
operation="remove",
|
||||
requirement_list=convert_requirement(REMOVE_REQUIREMENT_LIST + optional_remove_list),
|
||||
)
|
||||
)
|
||||
self.futures[future] = f"run_{run_id}_requirements_updated"
|
||||
if not run or run.info.run_id == self.parent_run_id:
|
||||
logger.debug(f"Ending current run_id {mlflow.active_run().info.run_id}")
|
||||
mlflow.end_run()
|
||||
return ret_message
|
||||
|
||||
def _pickle_and_log_artifact(self, obj, artifact_name, pickle_fname="temp_.pkl"):
|
||||
def _pickle_and_log_artifact(self, obj, artifact_name, pickle_fname="temp_.pkl", run_id=None):
|
||||
if not self._do_log_model:
|
||||
return
|
||||
return True
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
pickle_fpath = os.path.join(tmpdir, pickle_fname)
|
||||
try:
|
||||
with open(pickle_fpath, "wb") as f:
|
||||
pickle.dump(obj, f)
|
||||
mlflow.log_artifact(pickle_fpath, artifact_name)
|
||||
self.mlflow_client.log_artifact(run_id, pickle_fpath, artifact_name)
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.debug(f"Failed to pickle and log artifact {artifact_name}, error: {e}")
|
||||
logger.debug(f"Failed to pickle and log {artifact_name}, error: {e}")
|
||||
return False
|
||||
|
||||
def pickle_and_log_automl_artifacts(self, automl, model, estimator, signature=None):
|
||||
def _log_pipeline(self, pipeline, flavor_name, pipeline_name, signature, run_id, estimator=None):
|
||||
logger.debug(f"logging pipeline {flavor_name}:{pipeline_name}:{estimator}")
|
||||
ret_message = f"Successfully _log_pipeline {flavor_name}:{pipeline_name}:{estimator} to run_id {run_id}"
|
||||
optional_remove_list = (
|
||||
[] if estimator in ["transformer", "transformer_ms", "tcn", "tft"] else OPTIONAL_REMOVE_REQUIREMENT_LIST
|
||||
)
|
||||
run = mlflow.active_run()
|
||||
if run and run.info.run_id == self.parent_run_id:
|
||||
logger.debug(
|
||||
f"Current active run_id {run.info.run_id} == parent_run_id {self.parent_run_id}, Starting run_id {run_id}"
|
||||
)
|
||||
mlflow.start_run(run_id=run_id, nested=True)
|
||||
elif run and run.info.run_id != run_id:
|
||||
ret_message = f"Error: Should _log_pipeline {flavor_name}:{pipeline_name}:{estimator} model to run_id {run_id}, but logged to run_id {run.info.run_id}"
|
||||
logger.error(ret_message)
|
||||
else:
|
||||
logger.debug(f"No active run, start run_id {run_id}")
|
||||
mlflow.start_run(run_id=run_id)
|
||||
logger.debug(
|
||||
f"logging pipeline {flavor_name}:{pipeline_name}:{estimator} to run_id {mlflow.active_run().info.run_id}"
|
||||
)
|
||||
if flavor_name == "sklearn":
|
||||
mlflow.sklearn.log_model(pipeline, pipeline_name, signature=signature)
|
||||
elif flavor_name == "spark":
|
||||
mlflow.spark.log_model(pipeline, pipeline_name, signature=signature)
|
||||
else:
|
||||
logger.warning(f"Unsupported pipeline flavor: {flavor_name}. No model logged.")
|
||||
future = executor.submit(
|
||||
lambda: mlflow.models.model.update_model_requirements(
|
||||
model_uri=f"runs:/{run_id}/{pipeline_name}",
|
||||
operation="remove",
|
||||
requirement_list=convert_requirement(REMOVE_REQUIREMENT_LIST + optional_remove_list),
|
||||
)
|
||||
)
|
||||
self.futures[future] = f"run_{run_id}_requirements_updated"
|
||||
if not run or run.info.run_id == self.parent_run_id:
|
||||
logger.debug(f"Ending current run_id {mlflow.active_run().info.run_id}")
|
||||
mlflow.end_run()
|
||||
return ret_message
|
||||
|
||||
def pickle_and_log_automl_artifacts(self, automl, model, estimator, signature=None, run_id=None):
|
||||
"""log automl artifacts to mlflow
|
||||
load back with `automl = mlflow.pyfunc.load_model(model_run_id_or_uri)`, then do prediction with `automl.predict(X)`
|
||||
"""
|
||||
logger.debug(f"logging automl artifacts {estimator}")
|
||||
self._pickle_and_log_artifact(automl.feature_transformer, "feature_transformer", "feature_transformer.pkl")
|
||||
self._pickle_and_log_artifact(automl.label_transformer, "label_transformer", "label_transformer.pkl")
|
||||
# Test test_mlflow 1 and 4 will get error: TypeError: cannot pickle '_io.TextIOWrapper' object
|
||||
# try:
|
||||
# self._pickle_and_log_artifact(automl, "automl", "automl.pkl")
|
||||
# except TypeError:
|
||||
# pass
|
||||
logger.debug(f"logging automl estimator {estimator}")
|
||||
# self._pickle_and_log_artifact(
|
||||
# automl.feature_transformer, "feature_transformer", "feature_transformer.pkl", run_id
|
||||
# )
|
||||
# self._pickle_and_log_artifact(automl.label_transformer, "label_transformer", "label_transformer.pkl", run_id)
|
||||
if estimator.endswith("_spark"):
|
||||
# spark pipeline is not supported yet
|
||||
return
|
||||
feature_transformer = automl.feature_transformer
|
||||
if isinstance(feature_transformer, Pipeline):
|
||||
if isinstance(feature_transformer, Pipeline) and not estimator.endswith("_spark"):
|
||||
pipeline = feature_transformer
|
||||
pipeline.steps.append(("estimator", model))
|
||||
elif isinstance(feature_transformer, SparkPipeline):
|
||||
elif isinstance(feature_transformer, SparkPipelineModel) and estimator.endswith("_spark"):
|
||||
pipeline = feature_transformer
|
||||
pipeline.stages.append(model)
|
||||
elif not estimator.endswith("_spark"):
|
||||
@@ -395,24 +637,26 @@ class MLflowIntegration:
|
||||
steps.append(("estimator", model))
|
||||
pipeline = Pipeline(steps)
|
||||
else:
|
||||
stages = [feature_transformer]
|
||||
stages = []
|
||||
if feature_transformer is not None:
|
||||
stages.append(feature_transformer)
|
||||
stages.append(model)
|
||||
pipeline = SparkPipeline(stages=stages)
|
||||
if isinstance(pipeline, SparkPipeline):
|
||||
pipeline = SparkPipelineModel(stages=stages)
|
||||
if isinstance(pipeline, SparkPipelineModel):
|
||||
logger.debug(f"logging spark pipeline {estimator}")
|
||||
mlflow.spark.log_model(pipeline, "automl_pipeline", signature=signature)
|
||||
self._log_pipeline(pipeline, "spark", "model", signature, run_id, estimator)
|
||||
else:
|
||||
# Add a log named "model" to fit default settings
|
||||
logger.debug(f"logging sklearn pipeline {estimator}")
|
||||
mlflow.sklearn.log_model(pipeline, "automl_pipeline", signature=signature)
|
||||
mlflow.sklearn.log_model(pipeline, "model", signature=signature)
|
||||
self._log_pipeline(pipeline, "sklearn", "model", signature, run_id, estimator)
|
||||
return f"Successfully pickle_and_log_automl_artifacts {estimator} to run_id {run_id}"
|
||||
|
||||
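As the docstring above notes, the logged pipeline can be loaded back through MLflow's pyfunc flavor. A minimal sketch, assuming a finished run whose id you substitute for the placeholder:

import mlflow

# "model" is the default artifact path used by _log_pipeline above.
loaded = mlflow.pyfunc.load_model("runs:/<run_id>/model")  # <run_id> is a placeholder
predictions = loaded.predict(X)  # X: a DataFrame matching the training schema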
    def record_state(self, automl, search_state, estimator):
    @time_it
    def record_state(self, automl, search_state, estimator, is_log_model=True):
        _st = time.time()
        automl_metric_name = (
            automl._state.metric if isinstance(automl._state.metric, str) else automl._state.error_metric
        )

        if automl._state.error_metric.startswith("1-"):
            automl_metric_value = 1 - search_state.val_loss
        elif automl._state.error_metric.startswith("-"):
@@ -425,6 +669,8 @@ class MLflowIntegration:
        else:
            config = search_state.config

        self.automl_user_configurations = safe_json_dumps(automl._automl_user_configurations)

        info = {
            "metrics": {
                "iter_counter": automl._track_iter,
@@ -445,7 +691,7 @@ class MLflowIntegration:
                "flaml.meric": automl_metric_name,
                "flaml.run_source": "flaml-automl",
                "flaml.log_type": self.log_type,
                "flaml.automl_user_configurations": safe_json_dumps(automl._automl_user_configurations),
                "flaml.automl_user_configurations": self.automl_user_configurations,
            },
            "params": {
                "sample_size": search_state.sample_size,
@@ -472,33 +718,70 @@ class MLflowIntegration:
            run_name = f"{self.parent_run_name}_child_{self.child_counter}"
        else:
            run_name = None
        _t1 = time.time()
        wait(self.futures_log_model)
        _t2 = time.time() - _t1
        logger.debug(f"wait futures_log_model in record_state took {_t2} seconds")
        with mlflow.start_run(nested=True, run_name=run_name) as child_run:
            self._log_info_to_run(info, child_run.info.run_id, log_params=True)
            if automl._state.model_history:
                self.log_model(
                    search_state.trained_estimator._model, estimator, signature=automl.estimator_signature
                )
                self.pickle_and_log_automl_artifacts(
                    automl, search_state.trained_estimator, estimator, signature=automl.pipeline_signature
                )
            future = executor.submit(lambda: self._log_info_to_run(info, child_run.info.run_id, log_params=True))
            self.futures[future] = f"iter_{automl._track_iter}_log_info_to_run"
            future = executor.submit(lambda: self._log_automl_configurations(child_run.info.run_id))
            self.futures[future] = f"iter_{automl._track_iter}_log_automl_configurations"
            if automl._state.model_history and is_log_model:
                if estimator.endswith("_spark"):
                    future = executor.submit(
                        lambda: self.log_model(
                            search_state.trained_estimator._model,
                            estimator,
                            automl.estimator_signature,
                            child_run.info.run_id,
                        )
                    )
                    self.futures_log_model[future] = f"record_state-log_model_{estimator}"
                else:
                    future = executor.submit(
                        lambda: self.pickle_and_log_automl_artifacts(
                            automl,
                            search_state.trained_estimator,
                            estimator,
                            automl.pipeline_signature,
                            child_run.info.run_id,
                        )
                    )
                    self.futures_log_model[future] = f"record_state-pickle_and_log_automl_artifacts_{estimator}"
            self.manual_run_ids.append(child_run.info.run_id)
        self.child_counter += 1
        return f"Successfully record_state iteration {automl._track_iter}"

    @time_it
    def log_automl(self, automl):
        self.set_best_iter(automl)
        if self.autolog:
            if self.parent_run_id is not None:
                mlflow.start_run(run_id=self.parent_run_id, experiment_id=self.experiment_id)
                mlflow.log_metric("best_validation_loss", automl._state.best_loss)
                mlflow.log_metric("best_iteration", automl._best_iteration)
                mlflow.log_metric("num_child_runs", len(self.infos))
            if automl._trained_estimator is not None and not self.has_model:
                self.log_model(
                    automl._trained_estimator._model, automl.best_estimator, signature=automl.estimator_signature
                )
                self.pickle_and_log_automl_artifacts(
                    automl, automl.model, automl.best_estimator, signature=automl.pipeline_signature
                )
                mlflow.log_metrics(
                    {
                        "best_validation_loss": automl._state.best_loss,
                        "best_iteration": automl._best_iteration,
                        "num_child_runs": len(self.infos),
                    }
                )
            if (
                automl._trained_estimator is not None
                and not self.has_model
                and automl._trained_estimator._model is not None
            ):
                if automl.best_estimator.endswith("_spark"):
                    self.log_model(
                        automl._trained_estimator._model,
                        automl.best_estimator,
                        automl.estimator_signature,
                        self.parent_run_id,
                    )
                else:
                    self.pickle_and_log_automl_artifacts(
                        automl, automl.model, automl.best_estimator, automl.pipeline_signature, self.parent_run_id
                    )
                self.has_model = True

            self.adopt_children(automl)
@@ -514,31 +797,68 @@ class MLflowIntegration:
            conf = automl._config_history[automl._best_iteration][1].copy()
            if "ml" in conf.keys():
                conf = conf["ml"]

            mlflow.log_params(conf)
            mlflow.log_param("best_learner", automl._best_estimator)
            params_arr = [
                Param(key, str(value)) for key, value in {**conf, "best_learner": automl._best_estimator}.items()
            ]
            self.mlflow_client.log_batch(run_id=self.parent_run_id, metrics=[], params=params_arr, tags=[])
            if not self.has_summary:
                logger.info(f"logging best model {automl.best_estimator}")
                self.copy_mlflow_run(best_mlflow_run_id, self.parent_run_id)
                future = executor.submit(lambda: self.copy_mlflow_run(best_mlflow_run_id, self.parent_run_id))
                self.futures[future] = "log_automl_copy_mlflow_run"
                future = executor.submit(lambda: self._log_automl_configurations(self.parent_run_id))
                self.futures[future] = "log_automl_log_automl_configurations"
                self.has_summary = True
            if automl._trained_estimator is not None and not self.has_model:
                self.log_model(
                    automl._trained_estimator._model,
                    automl.best_estimator,
                    signature=automl.estimator_signature,
                )
                self.pickle_and_log_automl_artifacts(
                    automl, automl.model, automl.best_estimator, signature=automl.pipeline_signature
                )
            _t1 = time.time()
            wait(self.futures_log_model)
            _t2 = time.time() - _t1
            logger.debug(f"wait futures_log_model in log_automl took {_t2} seconds")
            if (
                automl._trained_estimator is not None
                and not self.has_model
                and automl._trained_estimator._model is not None
            ):
                if automl.best_estimator.endswith("_spark"):
                    future = executor.submit(
                        lambda: self.log_model(
                            automl._trained_estimator._model,
                            automl.best_estimator,
                            signature=automl.estimator_signature,
                            run_id=self.parent_run_id,
                        )
                    )
                    self.futures_log_model[future] = f"log_automl-log_model_{automl.best_estimator}"
                else:
                    future = executor.submit(
                        lambda: self.pickle_and_log_automl_artifacts(
                            automl,
                            automl.model,
                            automl.best_estimator,
                            signature=automl.pipeline_signature,
                            run_id=self.parent_run_id,
                        )
                    )
                    self.futures_log_model[
                        future
                    ] = f"log_automl-pickle_and_log_automl_artifacts_{automl.best_estimator}"
                self.has_model = True

    def resume_mlflow(self):
        if len(self.resume_params) > 0:
            mlflow.autolog(**self.resume_params)

    def _log_automl_configurations(self, run_id):
        self.mlflow_client.log_text(
            run_id=run_id,
            text=self.automl_user_configurations,
            artifact_file="automl_configurations/automl_user_configurations.json",
        )
        return f"Successfully _log_automl_configurations to run_id {run_id}"

    def _log_info_to_run(self, info, run_id, log_params=False):
        _metrics = [Metric(key, value, int(time.time() * 1000), 0) for key, value in info["metrics"].items()]
        _tags = [RunTag(key, str(value)) for key, value in info["tags"].items()]
        _tags = [
            RunTag(key, str(value)[:5000]) for key, value in info["tags"].items()
        ]  # AML will raise error if value length > 5000
        _params = [
            Param(key, str(value))
            for key, value in info["params"].items()
@@ -554,6 +874,7 @@ class MLflowIntegration:
            _tags = [RunTag("mlflow.parentRunId", run_id)]
            self.mlflow_client.log_batch(run_id=run.info.run_id, metrics=_metrics, params=[], tags=_tags)
        del info["submetrics"]["values"]
        return f"Successfully _log_info_to_run to run_id {run_id}"
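The helper above relies on MLflow's batch-logging entities. A hedged sketch of the same pattern in isolation (the run id and values are illustrative):

import time

from mlflow.entities import Metric, Param, RunTag
from mlflow.tracking import MlflowClient

client = MlflowClient()
ts = int(time.time() * 1000)
client.log_batch(
    run_id="<run_id>",  # placeholder
    metrics=[Metric("val_loss", 0.12, ts, 0)],  # key, value, timestamp (ms), step
    params=[Param("learner", "lgbm")],
    tags=[RunTag("flaml.log_type", "better")],  # tag values are strings; truncated to 5000 chars above
)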

    def adopt_children(self, result=None):
        """
@@ -575,6 +896,7 @@ class MLflowIntegration:
            ),
        )
        self.child_counter = 0
        num_infos = len(self.infos)

        # From latest to earliest, remove duplicate cross-validation runs
        _exist_child_run_params = []  # for deduplication of cross-validation child runs
@@ -639,22 +961,37 @@ class MLflowIntegration:
                )
                self.mlflow_client.set_tag(child_run_id, "flaml.child_counter", self.child_counter)

                # merge autolog child run and corresponding manual run
                flaml_info = self.infos[self.child_counter]
                child_run = self.mlflow_client.get_run(child_run_id)
                self._log_info_to_run(flaml_info, child_run_id, log_params=False)
                # Merge autolog child run and corresponding FLAML trial info (if available).
                # In nested scenarios (e.g., Tune -> AutoML -> MLflow autolog), MLflow can create
                # more child runs than the number of FLAML trials recorded in self.infos.
                # TODO: need more tests in nested scenarios.
                flaml_info = None
                child_run = None
                if self.child_counter < num_infos:
                    flaml_info = self.infos[self.child_counter]
                    child_run = self.mlflow_client.get_run(child_run_id)
                    self._log_info_to_run(flaml_info, child_run_id, log_params=False)

                if self.experiment_type == "automl":
                    if "learner" not in child_run.data.params:
                        self.mlflow_client.log_param(child_run_id, "learner", flaml_info["params"]["learner"])
                    if "sample_size" not in child_run.data.params:
                        self.mlflow_client.log_param(
                            child_run_id, "sample_size", flaml_info["params"]["sample_size"]
                        )
                    if self.experiment_type == "automl":
                        if "learner" not in child_run.data.params:
                            self.mlflow_client.log_param(child_run_id, "learner", flaml_info["params"]["learner"])
                        if "sample_size" not in child_run.data.params:
                            self.mlflow_client.log_param(
                                child_run_id, "sample_size", flaml_info["params"]["sample_size"]
                            )
                else:
                    logger.debug(
                        "No corresponding FLAML info for MLflow child run %s (child_counter=%s, infos=%s); skipping merge.",
                        child_run_id,
                        self.child_counter,
                        num_infos,
                    )

                if self.child_counter == best_iteration:
                if flaml_info is not None and self.child_counter == best_iteration:
                    self.mlflow_client.set_tag(child_run_id, "flaml.best_run", True)
                    if result is not None:
                        if child_run is None:
                            child_run = self.mlflow_client.get_run(child_run_id)
                        result.best_run_id = child_run_id
                        result.best_run_name = child_run.info.run_name
                    self.best_run_id = child_run_id
@@ -678,7 +1015,7 @@ class MLflowIntegration:
        self.resume_mlflow()


def register_automl_pipeline(automl, model_name=None, signature=None):
def register_automl_pipeline(automl, model_name=None, signature=None, artifact_path="model"):
    pipeline = automl.automl_pipeline
    if pipeline is None:
        logger.warning("pipeline not found, cannot register it")
@@ -688,7 +1025,7 @@ def register_automl_pipeline(automl, model_name=None, signature=None):
    if automl.best_run_id is None:
        mlflow.sklearn.log_model(
            pipeline,
            "automl_pipeline",
            artifact_path,
            registered_model_name=model_name,
            signature=automl.pipeline_signature if signature is None else signature,
        )
@@ -698,5 +1035,5 @@ def register_automl_pipeline(automl, model_name=None, signature=None):
        return mvs[0]
    else:
        best_run = mlflow.get_run(automl.best_run_id)
        model_uri = f"runs:/{best_run.info.run_id}/automl_pipeline"
        model_uri = f"runs:/{best_run.info.run_id}/{artifact_path}"
        return mlflow.register_model(model_uri, model_name)
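A hedged usage sketch of register_automl_pipeline after a fit; the import path and model name are assumptions based on this diff:

from flaml import AutoML
from flaml.fabric.mlflow import register_automl_pipeline  # module path assumed

automl = AutoML()
automl.fit(X_train=X_train, y_train=y_train, task="classification", time_budget=60)
# Registers the best run's "model" artifact (the new default artifact_path).
mv = register_automl_pipeline(automl, model_name="automl_pipeline_demo")
print(mv.name, mv.version)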

flaml/tune/logger.py (new file, 37 lines)
@@ -0,0 +1,37 @@
import logging
import os


class ColoredFormatter(logging.Formatter):
    # ANSI escape codes for colors
    COLORS = {
        # logging.DEBUG: "\033[36m",  # Cyan
        # logging.INFO: "\033[32m",  # Green
        logging.WARNING: "\033[33m",  # Yellow
        logging.ERROR: "\033[31m",  # Red
        logging.CRITICAL: "\033[1;31m",  # Bright Red
    }
    RESET = "\033[0m"  # Reset to default

    def __init__(self, fmt, datefmt, use_color=True):
        super().__init__(fmt, datefmt)
        self.use_color = use_color

    def format(self, record):
        formatted = super().format(record)
        if self.use_color:
            color = self.COLORS.get(record.levelno, "")
            if color:
                return f"{color}{formatted}{self.RESET}"
        return formatted


logger = logging.getLogger(__name__)
use_color = True
if os.getenv("FLAML_LOG_NO_COLOR"):
    use_color = False

logger_formatter = ColoredFormatter(
    "[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s", "%m-%d %H:%M:%S", use_color
)
logger.propagate = False
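A quick sketch of attaching the new formatter to a handler; the stream handler setup is illustrative, and setting FLAML_LOG_NO_COLOR disables coloring as shown above:

import logging
import sys

_ch = logging.StreamHandler(stream=sys.stdout)
_ch.setFormatter(logger_formatter)  # the ColoredFormatter instance defined above
logger.addHandler(_ch)
logger.warning("rendered in yellow")  # WARNING -> \033[33m
logger.error("rendered in red")  # ERROR -> \033[31m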
@@ -244,13 +244,32 @@ class BlendSearch(Searcher):
                    evaluated_rewards=evaluated_rewards,
                )
            except (AssertionError, ValueError):
                self._gs = GlobalSearch(
                    space=gs_space,
                    metric=metric,
                    mode=mode,
                    seed=gs_seed,
                    sampler=sampler,
                )
                try:
                    self._gs = GlobalSearch(
                        space=gs_space,
                        metric=metric,
                        mode=mode,
                        seed=gs_seed,
                        sampler=sampler,
                    )
                except ValueError:
                    # Ray Tune's OptunaSearch converts Tune domains into Optuna
                    # distributions. Optuna disallows integer log distributions
                    # with step != 1 (e.g., qlograndint with q>1), which can
                    # raise here. Fall back to FLAML's OptunaSearch wrapper,
                    # which handles these spaces more permissively.
                    if getattr(GlobalSearch, "__module__", "").startswith("ray.tune"):
                        from .suggestion import OptunaSearch as _FallbackOptunaSearch

                        self._gs = _FallbackOptunaSearch(
                            space=gs_space,
                            metric=metric,
                            mode=mode,
                            seed=gs_seed,
                            sampler=sampler,
                        )
                    else:
                        raise
            self._gs.space = space
        else:
            self._gs = None
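The comment above mentions spaces like qlograndint with q > 1. A sketch, assuming flaml.tune's sampling API, of a space that makes Optuna reject the converted integer log distribution and exercises the fallback:

from flaml import tune

# An integer log-uniform domain quantized to multiples of 2. Optuna rejects
# integer log distributions with step != 1, so Ray Tune's OptunaSearch raises
# ValueError and BlendSearch retries with FLAML's own OptunaSearch wrapper.
space = {"num_leaves": tune.qlograndint(4, 256, q=2)}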

@@ -35,6 +35,73 @@ from ..sample import (
    Quantized,
    Uniform,
)

# If Ray is installed, flaml.tune may re-export Ray Tune sampling functions.
# In that case, the search space contains Ray Tune Domain/Sampler objects,
# which should be accepted by our Optuna search-space conversion.
try:
    from ray import __version__ as _ray_version  # type: ignore

    if str(_ray_version).startswith("1."):
        from ray.tune.sample import (  # type: ignore
            Categorical as _RayCategorical,
        )
        from ray.tune.sample import (
            Domain as _RayDomain,
        )
        from ray.tune.sample import (
            Float as _RayFloat,
        )
        from ray.tune.sample import (
            Integer as _RayInteger,
        )
        from ray.tune.sample import (
            LogUniform as _RayLogUniform,
        )
        from ray.tune.sample import (
            Quantized as _RayQuantized,
        )
        from ray.tune.sample import (
            Uniform as _RayUniform,
        )
    else:
        from ray.tune.search.sample import (  # type: ignore
            Categorical as _RayCategorical,
        )
        from ray.tune.search.sample import (
            Domain as _RayDomain,
        )
        from ray.tune.search.sample import (
            Float as _RayFloat,
        )
        from ray.tune.search.sample import (
            Integer as _RayInteger,
        )
        from ray.tune.search.sample import (
            LogUniform as _RayLogUniform,
        )
        from ray.tune.search.sample import (
            Quantized as _RayQuantized,
        )
        from ray.tune.search.sample import (
            Uniform as _RayUniform,
        )

    _FLOAT_TYPES = (Float, _RayFloat)
    _INTEGER_TYPES = (Integer, _RayInteger)
    _CATEGORICAL_TYPES = (Categorical, _RayCategorical)
    _DOMAIN_TYPES = (Domain, _RayDomain)
    _QUANTIZED_TYPES = (Quantized, _RayQuantized)
    _UNIFORM_TYPES = (Uniform, _RayUniform)
    _LOGUNIFORM_TYPES = (LogUniform, _RayLogUniform)
except Exception:  # pragma: no cover
    _FLOAT_TYPES = (Float,)
    _INTEGER_TYPES = (Integer,)
    _CATEGORICAL_TYPES = (Categorical,)
    _DOMAIN_TYPES = (Domain,)
    _QUANTIZED_TYPES = (Quantized,)
    _UNIFORM_TYPES = (Uniform,)
    _LOGUNIFORM_TYPES = (LogUniform,)

from ..trial import flatten_dict, unflatten_dict
from .variant_generator import parse_spec_vars
@@ -850,19 +917,22 @@ class OptunaSearch(Searcher):
        def resolve_value(domain: Domain) -> ot.distributions.BaseDistribution:
            quantize = None

            sampler = domain.get_sampler()
            if isinstance(sampler, Quantized):
            # Ray Tune Domains and FLAML Domains both provide get_sampler(), but
            # fall back to the .sampler attribute for robustness.
            sampler = domain.get_sampler() if hasattr(domain, "get_sampler") else getattr(domain, "sampler", None)

            if isinstance(sampler, _QUANTIZED_TYPES) or type(sampler).__name__ == "Quantized":
                quantize = sampler.q
                sampler = sampler.sampler
                if isinstance(sampler, LogUniform):
                sampler = getattr(sampler, "sampler", None) or sampler.get_sampler()
                if isinstance(sampler, _LOGUNIFORM_TYPES) or type(sampler).__name__ == "LogUniform":
                    logger.warning(
                        "Optuna does not handle quantization in loguniform "
                        "sampling. The parameter will be passed but it will "
                        "probably be ignored."
                    )

            if isinstance(domain, Float):
                if isinstance(sampler, LogUniform):
            if isinstance(domain, _FLOAT_TYPES) or type(domain).__name__ == "Float":
                if isinstance(sampler, _LOGUNIFORM_TYPES) or type(sampler).__name__ == "LogUniform":
                    if quantize:
                        logger.warning(
                            "Optuna does not support both quantization and "
@@ -870,17 +940,17 @@ class OptunaSearch(Searcher):
                        )
                    return ot.distributions.LogUniformDistribution(domain.lower, domain.upper)

                elif isinstance(sampler, Uniform):
                elif isinstance(sampler, _UNIFORM_TYPES) or type(sampler).__name__ == "Uniform":
                    if quantize:
                        return ot.distributions.DiscreteUniformDistribution(domain.lower, domain.upper, quantize)
                    return ot.distributions.UniformDistribution(domain.lower, domain.upper)

            elif isinstance(domain, Integer):
                if isinstance(sampler, LogUniform):
            elif isinstance(domain, _INTEGER_TYPES) or type(domain).__name__ == "Integer":
                if isinstance(sampler, _LOGUNIFORM_TYPES) or type(sampler).__name__ == "LogUniform":
                    # ``step`` argument Deprecated in v2.0.0. ``step`` argument should be 1 in Log Distribution
                    # The removal of this feature is currently scheduled for v4.0.0,
                    return ot.distributions.IntLogUniformDistribution(domain.lower, domain.upper - 1, step=1)
                elif isinstance(sampler, Uniform):
                elif isinstance(sampler, _UNIFORM_TYPES) or type(sampler).__name__ == "Uniform":
                    # Upper bound should be inclusive for quantization and
                    # exclusive otherwise
                    return ot.distributions.IntUniformDistribution(
@@ -888,16 +958,16 @@ class OptunaSearch(Searcher):
                        domain.upper - int(bool(not quantize)),
                        step=quantize or 1,
                    )
            elif isinstance(domain, Categorical):
                if isinstance(sampler, Uniform):
            elif isinstance(domain, _CATEGORICAL_TYPES) or type(domain).__name__ == "Categorical":
                if isinstance(sampler, _UNIFORM_TYPES) or type(sampler).__name__ == "Uniform":
                    return ot.distributions.CategoricalDistribution(domain.categories)

            raise ValueError(
                "Optuna search does not support parameters of type "
                "`{}` with samplers of type `{}`".format(type(domain).__name__, type(domain.sampler).__name__)
                "`{}` with samplers of type `{}`".format(type(domain).__name__, type(sampler).__name__)
            )

        # Parameter name is e.g. "a/b/c" for nested dicts
        values = {"/".join(path): resolve_value(domain) for path, domain in domain_vars}

        return values
        return values
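To make the mapping concrete, a hedged sketch of which Optuna distribution each domain/sampler pair resolves to under the logic above (parameter names are illustrative):

from flaml import tune

space = {
    "lr": tune.loguniform(1e-5, 1e-1),  # Float + LogUniform -> LogUniformDistribution
    "batch_size": tune.qrandint(16, 256, q=16),  # Integer + Uniform + q -> IntUniformDistribution(step=16)
    "max_depth": tune.lograndint(2, 64),  # Integer + LogUniform -> IntLogUniformDistribution(step=1)
    "booster": tune.choice(["gbdt", "dart"]),  # Categorical + Uniform -> CategoricalDistribution
}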

@@ -261,7 +261,7 @@ def add_cost_to_space(space: Dict, low_cost_point: Dict, choice_cost: Dict):
                low_cost[i] = point
        if len(low_cost) > len(domain.categories):
            if domain.ordered:
                low_cost[-1] = int(np.where(ind == low_cost[-1])[0])
                low_cost[-1] = int(np.where(ind == low_cost[-1])[0].item())
            domain.low_cost_point = low_cost[-1]
            return
    if low_cost:
@@ -162,6 +162,10 @@ def broadcast_code(custom_code="", file_name="mylearner"):
    assert isinstance(MyLargeLGBM(), LGBMEstimator)
    ```
    """
    # Check if Spark is available
    spark_available, _ = check_spark()

    # Write to local driver file system
    flaml_path = os.path.dirname(os.path.abspath(__file__))
    custom_code = textwrap.dedent(custom_code)
    custom_path = os.path.join(flaml_path, file_name + ".py")
@@ -169,6 +173,24 @@ def broadcast_code(custom_code="", file_name="mylearner"):
    with open(custom_path, "w") as f:
        f.write(custom_code)

    # If using Spark, broadcast the code content to executors
    if spark_available:
        spark = SparkSession.builder.getOrCreate()
        bc_code = spark.sparkContext.broadcast(custom_code)

        # Execute a job to ensure the code is distributed to all executors
        def _write_code(bc):
            code = bc.value
            import os

            module_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), file_name + ".py")
            os.makedirs(os.path.dirname(module_path), exist_ok=True)
            with open(module_path, "w") as f:
                f.write(code)
            return True

        spark.sparkContext.parallelize(range(1)).map(lambda _: _write_code(bc_code)).collect()

    return custom_path

@@ -21,11 +21,11 @@ except (ImportError, AssertionError):
    from .analysis import ExperimentAnalysis as EA
else:
    ray_available = True

import logging

from flaml.tune.spark.utils import PySparkOvertimeMonitor, check_spark

from .logger import logger, logger_formatter
from .result import DEFAULT_METRIC
from .trial import Trial

@@ -41,8 +41,6 @@ except ImportError:
    internal_mlflow = False


logger = logging.getLogger(__name__)
logger.propagate = False
_use_ray = True
_runner = None
_verbose = 0
@@ -197,9 +195,16 @@ def report(_metric=None, **kwargs):
    global _training_iteration
    if _use_ray:
        try:
            from ray import tune
            from ray import __version__ as ray_version

            return tune.report(_metric, **kwargs)
            if ray_version.startswith("1."):
                from ray import tune

                return tune.report(_metric, **kwargs)
            else:  # ray>=2
                from ray.air import session

                return session.report(metrics={"metric": _metric, **kwargs})
        except ImportError:
            # calling tune.report() outside tune.run()
            return
@@ -260,6 +265,8 @@ def run(
    mlflow_exp_name: Optional[str] = None,
    automl_info: Optional[Tuple[float]] = None,
    extra_tag: Optional[dict] = None,
    cost_attr: Optional[str] = "auto",
    cost_budget: Optional[float] = None,
    **ray_args,
):
    """The function-based way of performing HPO.

@@ -462,6 +469,12 @@ def run(
            overwritten by the value of `n_concurrent_trials` in AutoML. When <= 0, the concurrent trials
            will be set to the number of executors.
        extra_tag: dict, default=None | Extra tags to be added to the mlflow runs created by autologging.
        cost_attr: None or str to specify the attribute to evaluate the cost of different trials.
            Default is "auto", which means that we will automatically choose the cost attribute to use (depending
            on the nature of the resource budget). When cost_attr is set to None, cost differences between different trials will be omitted
            in our search algorithm. When cost_attr is set to a str different from "auto" and "time_total_s",
            this cost_attr must be available in the result dict of the trial.
        cost_budget: A float of the cost budget. Only valid when cost_attr is a str different from "auto" and "time_total_s".
        **ray_args: keyword arguments to pass to ray.tune.run().
            Only valid when use_ray=True.
    """
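A hedged sketch of the new cost arguments in action; the "tokens" cost attribute is illustrative and must appear in each trial's result dict:

from flaml import tune

def evaluate(config):
    loss = (config["x"] - 3) ** 2
    return {"loss": loss, "tokens": 10 * config["x"]}  # report the custom cost

analysis = tune.run(
    evaluate,
    config={"x": tune.randint(1, 10)},
    metric="loss",
    mode="min",
    num_samples=-1,
    cost_attr="tokens",  # neither "auto" nor "time_total_s", so it must be in the result dict
    cost_budget=1000,  # search stops once the accumulated cost exceeds this budget
)
print(analysis.best_config)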
@@ -506,10 +519,6 @@ def run(
    elif not logger.hasHandlers():
        # Add the console handler.
        _ch = logging.StreamHandler(stream=sys.stdout)
        logger_formatter = logging.Formatter(
            "[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s",
            "%m-%d %H:%M:%S",
        )
        _ch.setFormatter(logger_formatter)
        logger.addHandler(_ch)
    if verbose <= 2:
@@ -600,6 +609,8 @@ def run(
            metric_constraints=metric_constraints,
            use_incumbent_result_in_evaluation=use_incumbent_result_in_evaluation,
            lexico_objectives=lexico_objectives,
            cost_attr=cost_attr,
            cost_budget=cost_budget,
        )
    else:
        if metric is None or mode is None:
@@ -735,10 +746,16 @@ def run(
            max_concurrent = max(1, search_alg.max_concurrent)
        else:
            max_concurrent = max(1, max_spark_parallelism)
        passed_in_n_concurrent_trials = max(n_concurrent_trials, max_concurrent)
        n_concurrent_trials = min(
            n_concurrent_trials if n_concurrent_trials > 0 else num_executors,
            max_concurrent,
        )
        if n_concurrent_trials < passed_in_n_concurrent_trials:
            logger.warning(
                f"The actual concurrent trials is {n_concurrent_trials}. You can set the environment "
                f"variable `FLAML_MAX_CONCURRENT` to '{passed_in_n_concurrent_trials}' to override the detected num of executors."
            )
        with parallel_backend("spark"):
            with Parallel(n_jobs=n_concurrent_trials, verbose=max(0, (verbose - 1) * 50)) as parallel:
                try:
@@ -759,8 +776,8 @@ def run(
                        and (num_samples < 0 or num_trials < num_samples)
                        and num_failures < upperbound_num_failures
                    ):
                        if automl_info and automl_info[0] > 0 and time_budget_s < np.inf:
                            time_budget_s -= automl_info[0]
                        if automl_info and automl_info[1] == "all" and automl_info[0] > 0 and time_budget_s < np.inf:
                            time_budget_s -= automl_info[0] * n_concurrent_trials
                            logger.debug(f"Remaining time budget with mlflow log latency: {time_budget_s} seconds.")
                        while len(_runner.running_trials) < n_concurrent_trials:
                            # suggest trials for spark
@@ -785,9 +802,17 @@ def run(
                            )
                        results = None
                        with PySparkOvertimeMonitor(time_start, time_budget_s, force_cancel, parallel=parallel):
                            results = parallel(
                                delayed(evaluation_function)(trial_to_run.config) for trial_to_run in trials_to_run
                            )
                            try:
                                results = parallel(
                                    delayed(evaluation_function)(trial_to_run.config) for trial_to_run in trials_to_run
                                )
                            except RuntimeError as e:
                                logger.warning(f"RuntimeError: {e}")
                                results = None
                                logger.info(
                                    "Encountered RuntimeError. Waiting 10 seconds for Spark cluster to recover before retrying."
                                )
                                time.sleep(10)
                        # results = [evaluation_function(trial_to_run.config) for trial_to_run in trials_to_run]
                        while results:
                            result = results.pop(0)

@@ -1 +1 @@
__version__ = "2.3.2"
__version__ = "2.4.1"

@@ -2,7 +2,6 @@
license_file = "LICENSE"
description-file = "README.md"


[tool.pytest.ini_options]
addopts = '-m "not conda"'
markers = [

pytest.ini (new file, 3 lines)
@@ -0,0 +1,3 @@
[pytest]
markers =
    spark: mark a test as requiring Spark

setup.py (82 changes)
@@ -51,60 +51,59 @@ setuptools.setup(
            "joblib<=1.3.2",
        ],
        "test": [
            "jupyter",
            "numpy>=1.17,<2.0.0; python_version<'3.13'",
            "numpy>2.0.0; python_version>='3.13'",
            "jupyter; python_version<'3.13'",
            "lightgbm>=2.3.1",
            "xgboost>=0.90,<2.0.0",
            "xgboost>=0.90,<2.0.0; python_version<'3.11'",
            "xgboost>=2.0.0; python_version>='3.11'",
            "scipy>=1.4.1",
            "pandas>=1.1.4,<2.0.0; python_version<'3.10'",
            "pandas>=1.1.4; python_version>='3.10'",
            "scikit-learn>=1.0.0",
            "scikit-learn>=1.2.0",
            "thop",
            "pytest>=6.1.1",
            "pytest-rerunfailures>=13.0",
            "coverage>=5.3",
            "pre-commit",
            "torch",
            "torchvision",
            "catboost>=0.26,<1.2; python_version<'3.11'",
            "catboost>=0.26; python_version>='3.11'",
            "catboost>=0.26; python_version<'3.13'",
            "rgf-python",
            "optuna>=2.8.0,<=3.6.1",
            "openml",
            "openml; python_version<'3.13'",
            "statsmodels>=0.12.2",
            "psutil==5.8.0",
            "psutil",
            "dataclasses",
            "transformers[torch]==4.26",
            "transformers[torch]",
            "datasets",
            "nltk<=3.8.1",  # 3.8.2 doesn't work with mlflow
            "evaluate",
            "nltk!=3.8.2",  # 3.8.2 doesn't work with mlflow
            "rouge_score",
            "hcrystalball==0.1.10",
            "hcrystalball",
            "seqeval",
            "pytorch-forecasting>=0.9.0,<=0.10.1; python_version<'3.11'",
            # "pytorch-forecasting==0.10.1; python_version=='3.11'",
            "mlflow==2.15.1",
            "pytorch-forecasting; python_version<'3.13'",
            "mlflow-skinny<=2.22.1",  # Refer to https://mvnrepository.com/artifact/org.mlflow/mlflow-spark
            "joblibspark>=0.5.0",
            "joblib<=1.3.2",
            "nbconvert",
            "nbformat",
            "ipykernel",
            "pytorch-lightning<1.9.1",  # test_forecast_panel
            "tensorboardX==2.6",  # test_forecast_panel
            "requests<2.29.0",  # https://github.com/docker/docker-py/issues/3113
            "pytorch-lightning",  # test_forecast_panel
            "tensorboardX",  # test_forecast_panel
            "requests",  # https://github.com/docker/docker-py/issues/3113
            "packaging",
            "pydantic==1.10.9",
            "sympy",
            "wolframalpha",
            "dill",  # a drop in replacement of pickle
        ],
        "catboost": [
            "catboost>=0.26,<1.2; python_version<'3.11'",
            "catboost>=0.26,<=1.2.5; python_version>='3.11'",
            "catboost>=0.26",
        ],
        "blendsearch": [
            "optuna>=2.8.0,<=3.6.1",
            "packaging",
        ],
        "ray": [
            "ray[tune]~=1.13",
            "ray[tune]>=1.13,<2.5.0",
        ],
        "azureml": [
            "azureml-mlflow",
@@ -117,47 +116,35 @@ setuptools.setup(
            "scikit-learn",
        ],
        "hf": [
            "transformers[torch]==4.26",
            "transformers[torch]>=4.26",
            "datasets",
            "nltk<=3.8.1",
            "rouge_score",
            "seqeval",
        ],
        "nlp": [  # for backward compatibility; hf is the new option name
            "transformers[torch]==4.26",
            "transformers[torch]>=4.26",
            "datasets",
            "nltk<=3.8.1",
            "rouge_score",
            "seqeval",
        ],
        "ts_forecast": [
            "holidays<0.14",  # to prevent installation error for prophet
            "prophet>=1.0.1",
            "holidays",
            "prophet>=1.1.5",
            "statsmodels>=0.12.2",
            "hcrystalball==0.1.10",
            "hcrystalball>=0.1.10",
        ],
        "forecast": [
            "holidays<0.14",  # to prevent installation error for prophet
            "prophet>=1.0.1",
            "holidays",
            "prophet>=1.1.5",
            "statsmodels>=0.12.2",
            "hcrystalball==0.1.10",
            "pytorch-forecasting>=0.9.0; python_version<'3.11'",
            # "pytorch-forecasting==0.10.1; python_version=='3.11'",
            "pytorch-lightning==1.9.0",
            "tensorboardX==2.6",
            "hcrystalball>=0.1.10",
            "pytorch-forecasting>=0.10.4; python_version<'3.13'",
            "pytorch-lightning>=1.9.0",
            "tensorboardX>=2.6",
        ],
        "benchmark": ["catboost>=0.26", "psutil==5.8.0", "xgboost==1.3.3", "pandas==1.1.4"],
        "openai": ["openai==0.27.8", "diskcache"],
        "autogen": ["openai==0.27.8", "diskcache", "termcolor"],
        "mathchat": ["openai==0.27.8", "diskcache", "termcolor", "sympy", "pydantic==1.10.9", "wolframalpha"],
        "retrievechat": [
            "openai==0.27.8",
            "diskcache",
            "termcolor",
            "chromadb",
            "tiktoken",
            "sentence_transformers",
        ],
        "synapse": [
            "joblibspark>=0.5.0",
            "optuna>=2.8.0,<=3.6.1",
@@ -170,10 +157,9 @@ setuptools.setup(
        "Operating System :: OS Independent",
        # Specify the Python versions you support here.
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: 3.12",
    ],
    python_requires=">=3.8",
    python_requires=">=3.10",
)
@@ -4,8 +4,17 @@ import pytest

from flaml import AutoML, tune

try:
    import transformers

@pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os")
    _transformers_installed = True
except ImportError:
    _transformers_installed = False


@pytest.mark.skipif(
    sys.platform == "darwin" or not _transformers_installed, reason="do not run on mac os or transformers not installed"
)
def test_custom_hp_nlp():
    from test.nlp.utils import get_automl_settings, get_toy_data_seqclassification


@@ -1,3 +1,4 @@
import atexit
import os
import sys
import unittest
@@ -15,8 +16,18 @@ from sklearn.model_selection import train_test_split

from flaml import AutoML
from flaml.automl.ml import sklearn_metric_loss_score
from flaml.automl.spark import disable_spark_ansi_mode, restore_spark_ansi_mode
from flaml.tune.spark.utils import check_spark

try:
    import pytorch_lightning

    _pl_installed = True
except ImportError:
    _pl_installed = False

pytestmark = pytest.mark.spark

leaderboard = defaultdict(dict)

warnings.simplefilter(action="ignore")
@@ -37,7 +48,7 @@ else:
        .config(
            "spark.jars.packages",
            (
                "com.microsoft.azure:synapseml_2.12:1.0.2,"
                "com.microsoft.azure:synapseml_2.12:1.1.0,"
                "org.apache.hadoop:hadoop-azure:3.3.5,"
                "com.microsoft.azure:azure-storage:8.6.6,"
                f"org.mlflow:mlflow-spark_2.12:{mlflow.__version__}"
@@ -61,6 +72,9 @@ else:
except ImportError:
    skip_spark = True

spark, ansi_conf, adjusted = disable_spark_ansi_mode()
atexit.register(restore_spark_ansi_mode, spark, ansi_conf, adjusted)


def _test_regular_models(estimator_list, task):
    if isinstance(estimator_list, str):
@@ -269,7 +283,11 @@ class TestExtraModel(unittest.TestCase):

    @unittest.skipIf(skip_spark, reason="Spark is not installed. Skip all spark tests.")
    def test_default_spark(self):
        _test_spark_models(None, "classification")
        # TODO: remove the estimator assignment once SynapseML supports spark 4+.
        from flaml.automl.spark.utils import _spark_major_minor_version

        estimator_list = ["rf_spark"] if _spark_major_minor_version[0] >= 4 else None
        _test_spark_models(estimator_list, "classification")

    def test_svc(self):
        _test_regular_models("svc", "classification")
@@ -300,7 +318,7 @@ class TestExtraModel(unittest.TestCase):
    def test_avg(self):
        _test_forecast("avg")

    @unittest.skipIf(skip_spark, reason="Skip on Mac or Windows")
    @unittest.skipIf(skip_spark or not _pl_installed, reason="Skip on Mac or Windows or no pytorch_lightning.")
    def test_tcn(self):
        _test_forecast("tcn")


@@ -10,7 +10,7 @@ from flaml import AutoML
from flaml.automl.task.time_series_task import TimeSeriesTask


def test_forecast_automl(budget=10, estimators_when_no_prophet=["arima", "sarimax", "holt-winters"]):
def test_forecast_automl(budget=20, estimators_when_no_prophet=["arima", "sarimax", "holt-winters"]):
    # using dataframe
    import statsmodels.api as sm

@@ -477,7 +477,10 @@ def test_forecast_classification(budget=5):
def get_stalliion_data():
    from pytorch_forecasting.data.examples import get_stallion_data

    data = get_stallion_data()
    # data = get_stallion_data()
    data = pd.read_parquet(
        "https://raw.githubusercontent.com/sktime/pytorch-forecasting/refs/heads/main/examples/data/stallion.parquet"
    )
    # add time index - For datasets with no missing values, FLAML will automate this process
    data["time_idx"] = data["date"].dt.year * 12 + data["date"].dt.month
    data["time_idx"] -= data["time_idx"].min()
@@ -507,8 +510,12 @@ def get_stalliion_data():
    "3.11" in sys.version,
    reason="do not run on py 3.11",
)
def test_forecast_panel(budget=5):
    data, special_days = get_stalliion_data()
def test_forecast_panel(budget=30):
    try:
        data, special_days = get_stalliion_data()
    except ImportError:
        print("pytorch_forecasting not installed")
        return
    time_horizon = 6  # predict six months
    training_cutoff = data["time_idx"].max() - time_horizon
    data["time_idx"] = data["time_idx"].astype("int")
@@ -674,11 +681,55 @@ def test_cv_step():
    print("yahoo!")


def test_log_training_metric_ts_models():
    """Test that log_training_metric=True works with time series models (arima, sarimax, holt-winters)."""
    import statsmodels.api as sm

    from flaml.automl.task.time_series_task import TimeSeriesTask

    estimators_all = TimeSeriesTask("forecast").estimators.keys()
    estimators_to_test = ["xgboost", "arima", "lassolars", "tcn", "snaive", "prophet", "orbit"]
    estimators = [
        est for est in estimators_to_test if est in estimators_all
    ]  # not all estimators available in current python env
    print(f"Testing estimators: {estimators}")

    # Prepare data
    data = sm.datasets.co2.load_pandas().data["co2"]
    data = data.resample("MS").mean()
    data = data.bfill().ffill()
    data = data.to_frame().reset_index()
    data = data.rename(columns={"index": "ds", "co2": "y"})
    num_samples = data.shape[0]
    time_horizon = 12
    split_idx = num_samples - time_horizon
    df = data[:split_idx]

    # Test each time series model with log_training_metric=True
    for estimator in estimators:
        print(f"\nTesting {estimator} with log_training_metric=True")
        automl = AutoML()
        settings = {
            "time_budget": 3,
            "metric": "mape",
            "task": "forecast",
            "eval_method": "holdout",
            "label": "y",
            "log_training_metric": True,  # This should not cause errors
            "estimator_list": [estimator],
        }
        automl.fit(dataframe=df, **settings, period=time_horizon, force_cancel=True)
        print(f"  ✅ {estimator} SUCCESS with log_training_metric=True")
        if automl.best_estimator:
            assert automl.best_estimator == estimator


if __name__ == "__main__":
    # test_forecast_automl(60)
    # test_multivariate_forecast_num(5)
    # test_multivariate_forecast_cat(5)
    test_numpy()
    # test_numpy()
    # test_forecast_classification(5)
    # test_forecast_panel(5)
    # test_cv_step()
    test_log_training_metric_ts_models()

test/automl/test_max_iter_1.py (new file, 51 lines)
@@ -0,0 +1,51 @@
import mlflow
import numpy as np
import pandas as pd

from flaml import AutoML


def test_max_iter_1():
    date_rng = pd.date_range(start="2024-01-01", periods=100, freq="H")
    X = pd.DataFrame({"ds": date_rng})
    y_train_24h = np.random.rand(len(X)) * 100

    # AutoML
    settings = {
        "max_iter": 1,
        "estimator_list": ["xgboost", "lgbm"],
        "starting_points": {"xgboost": {}, "lgbm": {}},
        "task": "ts_forecast",
        "log_file_name": "test_max_iter_1.log",
        "seed": 41,
        "mlflow_exp_name": "TestExp-max_iter-1",
        "use_spark": False,
        "n_concurrent_trials": 1,
        "verbose": 1,
        "featurization": "off",
        "metric": "rmse",
        "mlflow_logging": True,
    }

    automl = AutoML(**settings)

    with mlflow.start_run(run_name="AutoMLModel-XGBoost-and-LGBM-max_iter_1"):
        automl.fit(
            X_train=X,
            y_train=y_train_24h,
            period=24,
            X_val=X,
            y_val=y_train_24h,
            split_ratio=0,
            force_cancel=False,
        )

    assert automl.model is not None, "AutoML failed to return a model"
    assert automl.best_run_id is not None, "Best run ID should not be None with mlflow logging"

    print("Best model:", automl.model)
    print("Best run ID:", automl.best_run_id)


if __name__ == "__main__":
    test_max_iter_1()
@@ -10,6 +10,18 @@ from flaml import AutoML


class TestMLFlowLoggingParam:
    def test_update_and_install_requirements(self):
        import mlflow
        from sklearn import tree

        from flaml.fabric.mlflow import update_and_install_requirements

        with mlflow.start_run(run_name="test") as run:
            sk_model = tree.DecisionTreeClassifier()
            mlflow.sklearn.log_model(sk_model, "model", registered_model_name="test")

        update_and_install_requirements(run_id=run.info.run_id)

    def test_should_start_new_run_by_default(self, automl_settings):
        with mlflow.start_run() as parent_run:
            automl = AutoML()

@@ -143,4 +143,5 @@ def test_prep():


if __name__ == "__main__":
    test_lrl2()
    test_prep()

@@ -1,8 +1,23 @@
import sys

import pytest
from minio.error import ServerError
from openml.exceptions import OpenMLServerException

try:
    from minio.error import ServerError
except ImportError:

    class ServerError(Exception):
        pass


try:
    from openml.exceptions import OpenMLServerException
except ImportError:

    class OpenMLServerException(Exception):
        pass


from requests.exceptions import ChunkedEncodingError, SSLError


@@ -64,6 +79,9 @@ def test_automl(budget=5, dataset_format="dataframe", hpo_method=None):
    automl.fit(X_train=X_train, y_train=y_train, **settings)
    """ retrieve best config and best learner """
    print("Best ML leaner:", automl.best_estimator)
    if not automl.best_estimator:
        print("Training budget is not sufficient")
        return
    print("Best hyperparmeter config:", automl.best_config)
    print(f"Best accuracy on validation data: {1 - automl.best_loss:.4g}")
    print(f"Training duration of best run: {automl.best_config_train_time:.4g} s")

@@ -38,7 +38,7 @@ class TestLogging(unittest.TestCase):
            "keep_search_state": True,
            "learner_selector": "roundrobin",
        }
        X_train, y_train = fetch_california_housing(return_X_y=True)
        X_train, y_train = fetch_california_housing(return_X_y=True, data_home="test")
        n = len(y_train) >> 1
        print(automl.model, automl.classes_, automl.predict(X_train))
        automl.fit(

@@ -47,7 +47,7 @@ class TestRegression(unittest.TestCase):
            "n_jobs": 1,
            "model_history": True,
        }
        X_train, y_train = fetch_california_housing(return_X_y=True)
        X_train, y_train = fetch_california_housing(return_X_y=True, data_home="test")
        n = int(len(y_train) * 9 // 10)
        automl.fit(X_train=X_train[:n], y_train=y_train[:n], X_val=X_train[n:], y_val=y_train[n:], **automl_settings)
        assert automl._state.eval_method == "holdout"
@@ -130,7 +130,7 @@ class TestRegression(unittest.TestCase):
        )
        automl.fit(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **settings)

    def test_parallel(self, hpo_method=None):
    def test_parallel_and_pickle(self, hpo_method=None):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 10,
@@ -141,7 +141,7 @@ class TestRegression(unittest.TestCase):
            "n_concurrent_trials": 10,
            "hpo_method": hpo_method,
        }
        X_train, y_train = fetch_california_housing(return_X_y=True)
        X_train, y_train = fetch_california_housing(return_X_y=True, data_home="test")
        try:
            automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
            print(automl_experiment.predict(X_train))
@@ -153,6 +153,18 @@ class TestRegression(unittest.TestCase):
        except ImportError:
            return

        # test pickle and load_pickle, should work for prediction
        automl_experiment.pickle("automl_xgboost_spark.pkl")
        automl_loaded = AutoML().load_pickle("automl_xgboost_spark.pkl")
        assert automl_loaded.best_estimator == automl_experiment.best_estimator
        assert automl_loaded.best_loss == automl_experiment.best_loss
        automl_loaded.predict(X_train)

        import shutil

        shutil.rmtree("automl_xgboost_spark.pkl", ignore_errors=True)
        shutil.rmtree("automl_xgboost_spark.pkl.flaml_artifacts", ignore_errors=True)

    def test_sparse_matrix_regression_holdout(self):
        X_train = scipy.sparse.random(8, 100)
        y_train = np.random.uniform(size=8)
@@ -268,7 +280,7 @@ def test_reproducibility_of_regression_models(estimator: str):
        "skip_transform": True,
        "retrain_full": True,
    }
    X, y = fetch_california_housing(return_X_y=True, as_frame=True)
    X, y = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
    automl.fit(X_train=X, y_train=y, **automl_settings)
    best_model = automl.model
    assert best_model is not None
@@ -314,7 +326,7 @@ def test_reproducibility_of_catboost_regression_model():
        "skip_transform": True,
        "retrain_full": True,
    }
    X, y = fetch_california_housing(return_X_y=True, as_frame=True)
    X, y = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
    automl.fit(X_train=X, y_train=y, **automl_settings)
    best_model = automl.model
    assert best_model is not None
@@ -360,7 +372,7 @@ def test_reproducibility_of_lgbm_regression_model():
        "skip_transform": True,
        "retrain_full": True,
    }
    X, y = fetch_california_housing(return_X_y=True, as_frame=True)
    X, y = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
    automl.fit(X_train=X, y_train=y, **automl_settings)
    best_model = automl.model
    assert best_model is not None
@@ -424,7 +436,7 @@ def test_reproducibility_of_underlying_regression_models(estimator: str):
        "skip_transform": True,
        "retrain_full": False,
    }
    X, y = fetch_california_housing(return_X_y=True, as_frame=True)
    X, y = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
    automl.fit(X_train=X, y_train=y, **automl_settings)
    best_model = automl.model
    assert best_model is not None

@@ -142,7 +142,7 @@ class TestScore:
    def test_regression(self):
        automl_experiment = AutoML()

        X_train, y_train = fetch_california_housing(return_X_y=True)
        X_train, y_train = fetch_california_housing(return_X_y=True, data_home="test")
        n = int(len(y_train) * 9 // 10)

        for each_estimator in [

@@ -1,4 +1,6 @@
from sklearn.datasets import fetch_openml
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml, load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GroupKFold, KFold, train_test_split

@@ -48,7 +50,7 @@ def test_time():
    _test(split_type="time")


def test_groups():
def test_groups_for_classification_task():
    from sklearn.externals._arff import ArffException

    try:
@@ -58,8 +60,6 @@ def test_groups():

        X, y = load_wine(return_X_y=True)

    import numpy as np

    automl = AutoML()
    automl_settings = {
        "time_budget": 2,
@@ -68,7 +68,7 @@ def test_groups():
        "model_history": True,
        "eval_method": "cv",
        "groups": np.random.randint(low=0, high=10, size=len(y)),
        "estimator_list": ["lgbm", "rf", "xgboost", "kneighbor"],
        "estimator_list": ["catboost", "lgbm", "rf", "xgboost", "kneighbor"],
        "learner_selector": "roundrobin",
    }
    automl.fit(X, y, **automl_settings)
@@ -88,6 +88,72 @@ def test_groups():
    automl.fit(X, y, **automl_settings)


def test_groups_for_regression_task():
    """Append nonsensical groups to iris dataset and use it to test that GroupKFold works for regression tasks"""
    iris_dict_data = load_iris(as_frame=True)  # numpy arrays
    iris_data = iris_dict_data["frame"]  # pandas dataframe data + target

    rng = np.random.default_rng(42)
    iris_data["cluster"] = rng.integers(
        low=0, high=5, size=iris_data.shape[0]
    )  # np.random.randint(0, 5, iris_data.shape[0])

    automl = AutoML()
    X = iris_data[["sepal length (cm)", "sepal width (cm)", "petal length (cm)"]].to_numpy()
    y = iris_data["petal width (cm)"]
    X_train, X_test, y_train, y_test, groups_train, groups_test = train_test_split(
        X, y, iris_data["cluster"], random_state=42
    )
    automl_settings = {
        "max_iter": 5,
        "time_budget": -1,
        "metric": "r2",
        "task": "regression",
        "estimator_list": ["lgbm", "rf", "xgboost", "kneighbor"],
        "eval_method": "cv",
        "split_type": "uniform",
        "groups": groups_train,
    }
    automl.fit(X_train, y_train, **automl_settings)


def test_groups_with_sample_weights():
    """Verifies that sample weights can be used with group splits i.e. that https://github.com/microsoft/FLAML/issues/1396 remains fixed"""
    iris_dict_data = load_iris(as_frame=True)  # numpy arrays
    iris_data = iris_dict_data["frame"]  # pandas dataframe data + target
    iris_data["cluster"] = np.random.randint(0, 5, iris_data.shape[0])
    automl = AutoML()

    X = iris_data[["sepal length (cm)", "sepal width (cm)", "petal length (cm)"]].to_numpy()
    y = iris_data["petal width (cm)"]
    sample_weight = pd.Series(np.random.rand(X.shape[0]))
    (
        X_train,
        X_test,
        y_train,
        y_test,
        groups_train,
        groups_test,
        sample_weight_train,
        sample_weight_test,
    ) = train_test_split(X, y, iris_data["cluster"], sample_weight, random_state=42)
    automl_settings = {
        "max_iter": 5,
        "time_budget": -1,
        "metric": "r2",
        "task": "regression",
        "log_file_name": "error.log",
        "log_type": "all",
        "estimator_list": ["lgbm"],
        "eval_method": "cv",
        "split_type": "group",
        "groups": groups_train,
        "sample_weight": sample_weight_train,
    }
    automl.fit(X_train, y_train, **automl_settings)
    assert automl.model is not None


def test_stratified_groupkfold():
    from minio.error import ServerError
    from sklearn.model_selection import StratifiedGroupKFold
@@ -108,6 +174,7 @@ def test_stratified_groupkfold():
        "split_type": splitter,
        "groups": X_train["Airline"],
        "estimator_list": [
            "catboost",
            "lgbm",
            "rf",
            "xgboost",
@@ -203,4 +270,4 @@ def test_object():


if __name__ == "__main__":
    test_groups()
    test_groups_for_classification_task()

@@ -30,7 +30,7 @@ class TestTrainingLog(unittest.TestCase):
            "keep_search_state": True,
            "estimator_list": estimator_list,
        }
        X_train, y_train = fetch_california_housing(return_X_y=True)
        X_train, y_train = fetch_california_housing(return_X_y=True, data_home="test")
        automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
        # Check if the training log file is populated.
        self.assertTrue(os.path.exists(filename))

@@ -108,7 +108,14 @@ class TestWarmStart(unittest.TestCase):

    def test_FLAML_sample_size_in_starting_points(self):
        from minio.error import ServerError
        from openml.exceptions import OpenMLServerException

        try:
            from openml.exceptions import OpenMLServerException
        except ImportError:

            class OpenMLServerException(Exception):
                pass

        from requests.exceptions import ChunkedEncodingError, SSLError

        from flaml import AutoML

test/cal_housing_py3.pkz (new binary file; binary file not shown)

test/check_dependency.py (new file, 60 lines)
@@ -0,0 +1,60 @@
|
||||
import subprocess
|
||||
from importlib.metadata import distributions
|
||||
|
||||
installed_libs = sorted(f"{dist.metadata['Name']}=={dist.version}" for dist in distributions())
|
||||
|
||||
first_tier_dependencies = [
|
||||
"numpy",
|
||||
"jupyter",
|
||||
"lightgbm",
|
||||
"xgboost",
|
||||
"scipy",
|
||||
"pandas",
|
||||
"scikit-learn",
|
||||
"thop",
|
||||
"pytest",
|
||||
"pytest-rerunfailures",
|
||||
"coverage",
|
||||
"pre-commit",
|
||||
"torch",
|
||||
"torchvision",
|
||||
"catboost",
|
||||
"rgf-python",
|
||||
"optuna",
|
||||
"openml",
|
||||
"statsmodels",
|
||||
"psutil",
|
||||
"dataclasses",
|
||||
"transformers[torch]",
|
||||
"transformers",
|
||||
"datasets",
|
||||
"evaluate",
|
||||
"nltk",
|
||||
"rouge_score",
|
||||
"hcrystalball",
|
||||
"seqeval",
|
||||
"pytorch-forecasting",
|
||||
"mlflow-skinny",
|
||||
"joblibspark",
|
||||
"joblib",
|
||||
"nbconvert",
|
||||
"nbformat",
|
||||
"ipykernel",
|
||||
"pytorch-lightning",
|
||||
"tensorboardX",
|
||||
"requests",
|
||||
"packaging",
|
||||
"dill",
|
||||
"ray",
|
||||
"prophet",
|
||||
]
|
||||
|
||||
|
||||
for lib in installed_libs:
|
||||
lib_name = lib.split("==")[0]
|
||||
if lib_name in first_tier_dependencies:
|
||||
print(lib)
|
||||
|
||||
# print current commit hash
|
||||
commit_hash = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
|
||||
print(f"Current commit hash: {commit_hash}")
|
||||
@@ -2,11 +2,24 @@ from typing import Any, Dict, List, Union

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
import pytest
from sklearn.metrics import f1_score, r2_score

try:
    from catboost import CatBoostClassifier, CatBoostRegressor, Pool
except ImportError:  # pragma: no cover
    CatBoostClassifier = None
    CatBoostRegressor = None
    Pool = None


def evaluate_cv_folds_with_underlying_model(X_train_all, y_train_all, kf, model: Any, task: str) -> pd.DataFrame:

def _is_catboost_model_type(model_type: type) -> bool:
    if CatBoostClassifier is not None and CatBoostRegressor is not None:
        return model_type is CatBoostClassifier or model_type is CatBoostRegressor
    return getattr(model_type, "__module__", "").startswith("catboost")


def evaluate_cv_folds_with_underlying_model(X_train_all, y_train_all, kf, model: Any, task: str) -> List[float]:
    """Mimic the FLAML CV process to calculate the metrics across each fold.

    :param X_train_all: X training data
@@ -17,7 +30,7 @@ def evaluate_cv_folds_with_underlying_model(X_train_all, y_train_all, kf, model:
    :return: An array containing the metrics
    """
    rng = np.random.RandomState(2020)
    all_fold_metrics: List[Dict[str, Union[int, float]]] = []
    all_fold_metrics: List[float] = []
    for train_index, val_index in kf.split(X_train_all, y_train_all):
        X_train_split, y_train_split = X_train_all, y_train_all
        train_index = rng.permutation(train_index)
@@ -25,9 +38,11 @@ def evaluate_cv_folds_with_underlying_model(X_train_all, y_train_all, kf, model:
        X_val = X_train_split.iloc[val_index]
        y_train, y_val = y_train_split[train_index], y_train_split[val_index]
        model_type = type(model)
        if model_type is not CatBoostClassifier and model_type is not CatBoostRegressor:
        if not _is_catboost_model_type(model_type):
            model.fit(X_train, y_train)
        else:
            if Pool is None:
                pytest.skip("catboost is not installed")
            use_best_model = True
            n = max(int(len(y_train) * 0.9), len(y_train) - 1000) if use_best_model else len(y_train)
            X_tr, y_tr = (X_train)[:n], y_train[:n]
@@ -38,5 +53,5 @@ def evaluate_cv_folds_with_underlying_model(X_train_all, y_train_all, kf, model:
            reproduced_metric = 1 - f1_score(y_val, y_pred_classes)
        else:
            reproduced_metric = 1 - r2_score(y_val, y_pred_classes)
        all_fold_metrics.append(reproduced_metric)
        all_fold_metrics.append(float(reproduced_metric))
    return all_fold_metrics

@@ -60,7 +60,7 @@ def test_housing(as_frame=True):
        "starting_points": "data",
        "max_iter": 0,
    }
    X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=as_frame)
    X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=as_frame, data_home="test")
    automl.fit(X_train, y_train, **automl_settings)


@@ -115,7 +115,7 @@ def test_suggest_classification():

def test_suggest_regression():
    location = "test/default"
    X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True)
    X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
    suggested = suggest_hyperparams("regression", X_train, y_train, "lgbm", location=location)
    print(suggested)
    suggested = preprocess_and_suggest_hyperparams("regression", X_train, y_train, "xgboost", location=location)
@@ -137,7 +137,7 @@ def test_rf():
    print(rf)

    location = "test/default"
    X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True)
    X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
    rf = RandomForestRegressor(default_location=location)
    rf.fit(X_train[:100], y_train[:100])
    rf.predict(X_train)
@@ -155,7 +155,7 @@ def test_extratrees():
    print(classifier)

    location = "test/default"
    X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True)
    X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
    regressor = ExtraTreesRegressor(default_location=location)
    regressor.fit(X_train[:100], y_train[:100])
    regressor.predict(X_train)
@@ -175,7 +175,7 @@ def test_lgbm():
    print(classifier.classes_)

    location = "test/default"
    X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True)
    X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
    regressor = LGBMRegressor(default_location=location)
    regressor.fit(X_train, y_train)
    regressor.predict(X_train)
@@ -194,7 +194,7 @@ def test_xgboost():
    print(classifier.classes_)

    location = "test/default"
    X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True)
    X_train, y_train = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
    regressor = XGBRegressor(default_location=location)
    regressor.fit(X_train[:100], y_train[:100])
    regressor.predict(X_train)

@@ -3,6 +3,12 @@ import shutil
import sys

import pytest

try:
    import transformers
except ImportError:
    pytest.skip("transformers not installed", allow_module_level=True)

from utils import (
    get_automl_settings,
    get_toy_data_binclassification,
@@ -24,6 +30,8 @@ model_path_list = [
if sys.platform.startswith("darwin") and sys.version_info[0] == 3 and sys.version_info[1] == 11:
    pytest.skip("skipping Python 3.11 on MacOS", allow_module_level=True)

pytestmark = pytest.mark.spark  # set to spark as parallel testing raised RuntimeError


def test_switch_1_1():
    data_idx, model_path_idx = 0, 0

@@ -5,8 +5,20 @@ import sys
import pytest
from utils import get_automl_settings, get_toy_data_seqclassification

try:
    import transformers

@pytest.mark.skipif(sys.platform in ["darwin", "win32"], reason="do not run on mac os or windows")
    _transformers_installed = True
except ImportError:
    _transformers_installed = False

pytestmark = pytest.mark.spark  # set to spark as parallel testing raised MlflowException of changing parameter


@pytest.mark.skipif(
    sys.platform in ["darwin", "win32"] or not _transformers_installed,
    reason="do not run on mac os or windows or transformers not installed",
)
def test_cv():
    import requests

@@ -5,8 +5,18 @@ import sys
import pytest
from utils import get_automl_settings, get_toy_data_multiplechoiceclassification

try:
    import transformers

@pytest.mark.skipif(sys.platform in ["darwin", "win32"], reason="do not run on mac os or windows")
    _transformers_installed = True
except ImportError:
    _transformers_installed = False


@pytest.mark.skipif(
    sys.platform in ["darwin", "win32"] or not _transformers_installed,
    reason="do not run on mac os or windows or transformers not installed",
)
def test_mcc():
    import requests

@@ -7,8 +7,24 @@ from utils import get_automl_settings, get_toy_data_seqclassification

from flaml.default import portfolio

if sys.platform.startswith("darwin") and sys.version_info[0] == 3 and sys.version_info[1] == 11:
    pytest.skip("skipping Python 3.11 on MacOS", allow_module_level=True)
try:
    import transformers

    _transformers_installed = True
except ImportError:
    _transformers_installed = False

if (
    sys.platform.startswith("darwin")
    and sys.version_info >= (3, 11)
    or not _transformers_installed
    or sys.platform == "win32"
):
    pytest.skip("skipping Python 3.11 on MacOS or without transformers or on Windows", allow_module_level=True)

pytestmark = (
    pytest.mark.spark
)  # set to spark as parallel testing raised ValueError: Feature NonExisting not implemented.


def pop_args(fit_kwargs):
@@ -24,23 +40,34 @@ def test_build_portfolio(path="./test/nlp/default", strategy="greedy"):
    portfolio.main()


@pytest.mark.skipif(sys.platform == "win32", reason="do not run on windows")
def test_starting_point_not_in_search_space():
    from flaml import AutoML
    """Regression test for invalid starting points and custom_hp.

    This test must not require network access to Hugging Face.
    """

    """
    test starting_points located outside of the search space, and custom_hp is not set
    """
    from flaml.automl.state import SearchState
    from flaml.automl.task.factory import task_factory

    this_estimator_name = "transformer"
    X_train, y_train, X_val, y_val, _ = get_toy_data_seqclassification()
    X_train, y_train, _, _, _ = get_toy_data_seqclassification()
    task = task_factory("seq-classification", X_train, y_train)
    estimator_class = task.estimator_class_from_str(this_estimator_name)
    estimator_class.init()

    automl = AutoML()
    automl_settings = get_automl_settings(estimator_name=this_estimator_name)

    automl_settings["starting_points"] = {this_estimator_name: [{"learning_rate": 2e-3}]}

    automl.fit(X_train, y_train, **automl_settings)
    assert automl._search_states[this_estimator_name].init_config[0]["learning_rate"] != 2e-3
    # SearchState is where invalid starting points are filtered out when max_iter > 1.
    search_state = SearchState(
        learner_class=estimator_class,
        data=X_train,
        task=task,
        starting_point={"learning_rate": 2e-3},
        max_iter=3,
        budget=10,
    )
    assert search_state.init_config and search_state.init_config[0].get("learning_rate") != 2e-3

    """
    test starting_points located outside of the search space, and custom_hp is set
@@ -48,39 +75,60 @@ def test_starting_point_not_in_search_space():

    from flaml import tune

    X_train, y_train, X_val, y_val, _ = get_toy_data_seqclassification()
    X_train, y_train, _, _, _ = get_toy_data_seqclassification()

    this_estimator_name = "transformer_ms"
    automl = AutoML()
    automl_settings = get_automl_settings(estimator_name=this_estimator_name)
    task = task_factory("seq-classification", X_train, y_train)
    estimator_class = task.estimator_class_from_str(this_estimator_name)
    estimator_class.init()

    automl_settings["custom_hp"] = {
        this_estimator_name: {
            "model_path": {
                "domain": "albert-base-v2",
            },
            "learning_rate": {
                "domain": tune.choice([1e-4, 1e-5]),
            },
            "per_device_train_batch_size": {
                "domain": 2,
            },
        }
    custom_hp = {
        "model_path": {
            "domain": "albert-base-v2",
        },
        "learning_rate": {
            "domain": tune.choice([1e-4, 1e-5]),
        },
        "per_device_train_batch_size": {
            "domain": 2,
        },
    }
    automl_settings["starting_points"] = "data:test/nlp/default/"

    automl.fit(X_train, y_train, **automl_settings)
    assert len(automl._search_states[this_estimator_name].init_config[0]) == len(
        automl._search_states[this_estimator_name]._search_space_domain
    ) - len(automl_settings["custom_hp"][this_estimator_name]), (
    # Simulate a suggested starting point (e.g. from portfolio) which becomes invalid
    # after custom_hp constrains the space.
    invalid_starting_points = [
        {
            "learning_rate": 1e-5,
            "num_train_epochs": 1.0,
            "per_device_train_batch_size": 8,
            "seed": 43,
            "global_max_steps": 100,
            "model_path": "google/electra-base-discriminator",
        }
    ]

    search_state = SearchState(
        learner_class=estimator_class,
        data=X_train,
        task=task,
        starting_point=invalid_starting_points,
        custom_hp=custom_hp,
        max_iter=3,
        budget=10,
    )

    assert search_state.init_config, "Expected a non-empty init_config list"
    init_config0 = search_state.init_config[0]
    assert init_config0 is not None
    assert len(init_config0) == len(search_state._search_space_domain) - len(custom_hp), (
        "The search space is updated with the custom_hp on {} hyperparameters of "
        "the specified estimator without an initial value. Thus a valid init config "
        "should only contain the cardinality of the search space minus {}".format(
            len(automl_settings["custom_hp"][this_estimator_name]),
            len(automl_settings["custom_hp"][this_estimator_name]),
            len(custom_hp),
            len(custom_hp),
        )
    )
    assert automl._search_states[this_estimator_name].search_space["model_path"] == "albert-base-v2"
    assert search_state.search_space["model_path"] == "albert-base-v2"

    if os.path.exists("test/data/output/"):
        try:
@@ -89,7 +137,6 @@ def test_starting_point_not_in_search_space():
            print("PermissionError when deleting test/data/output/")


@pytest.mark.skipif(sys.platform == "win32", reason="do not run on windows")
def test_points_to_evaluate():
    from flaml import AutoML

@@ -102,7 +149,13 @@ def test_points_to_evaluate():

    automl_settings["custom_hp"] = {"transformer_ms": {"model_path": {"domain": "google/electra-small-discriminator"}}}

    automl.fit(X_train, y_train, **automl_settings)
    try:
        automl.fit(X_train, y_train, **automl_settings)
    except OSError as e:
        message = str(e)
        if "Too Many Requests" in message or "rate limit" in message.lower():
            pytest.skip(f"Skipping HF model load/training: {message}")
        raise

    if os.path.exists("test/data/output/"):
        try:
@@ -112,7 +165,6 @@ def test_points_to_evaluate():


# TODO: implement _test_zero_shot_model
@pytest.mark.skipif(sys.platform == "win32", reason="do not run on windows")
def test_zero_shot_nomodel():
    from flaml.default import preprocess_and_suggest_hyperparams

@@ -137,7 +189,14 @@ def test_zero_shot_nomodel():
    fit_kwargs = automl_settings.pop("fit_kwargs_by_estimator", {}).get(estimator_name)
    fit_kwargs.update(automl_settings)
    pop_args(fit_kwargs)
    model.fit(X_train, y_train, **fit_kwargs)

    try:
        model.fit(X_train, y_train, **fit_kwargs)
    except OSError as e:
        message = str(e)
        if "Too Many Requests" in message or "rate limit" in message.lower():
            pytest.skip(f"Skipping HF model load/training: {message}")
        raise

    if os.path.exists("test/data/output/"):
        try:

@@ -7,7 +7,7 @@ from sklearn.model_selection import train_test_split
from flaml import tune
from flaml.automl.model import LGBMEstimator

data = fetch_california_housing(return_X_y=False, as_frame=True)
data = fetch_california_housing(return_X_y=False, as_frame=True, data_home="test")
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
X_train_ref = ray.put(X_train)

@@ -11,7 +11,7 @@ automl_settings = {
    "task": "regression",
    "log_file_name": "test/california.log",
}
X_train, y_train = fetch_california_housing(return_X_y=True)
X_train, y_train = fetch_california_housing(return_X_y=True, data_home="test")
# Train with labeled input data
automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
print(automl.model)

@@ -1,13 +1,17 @@
import atexit
import os
import sys
import warnings

import mlflow
import numpy as np
import pytest
import sklearn.datasets as skds
from packaging.version import Version

from flaml import AutoML
from flaml.automl.data import auto_convert_dtypes_pandas, auto_convert_dtypes_spark, get_random_dataframe
from flaml.automl.spark import disable_spark_ansi_mode, restore_spark_ansi_mode
from flaml.tune.spark.utils import check_spark

warnings.simplefilter(action="ignore")
@@ -27,7 +31,7 @@ else:
        .config(
            "spark.jars.packages",
            (
                "com.microsoft.azure:synapseml_2.12:1.0.4,"
                "com.microsoft.azure:synapseml_2.12:1.1.0,"
                "org.apache.hadoop:hadoop-azure:3.3.5,"
                "com.microsoft.azure:azure-storage:8.6.6,"
                f"org.mlflow:mlflow-spark_2.12:{mlflow.__version__}"
@@ -53,15 +57,25 @@ else:
except ImportError:
    skip_spark = True

spark, ansi_conf, adjusted = disable_spark_ansi_mode()
atexit.register(restore_spark_ansi_mode, spark, ansi_conf, adjusted)

if sys.version_info >= (3, 11):
    skip_py311 = True
else:
    skip_py311 = False

pytestmark = pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
pytestmark = [pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests."), pytest.mark.spark]


def _test_spark_synapseml_lightgbm(spark=None, task="classification"):
    # TODO: remove the estimator assignment once SynapseML supports spark 4+.
    from flaml.automl.spark.utils import _spark_major_minor_version

    if _spark_major_minor_version[0] >= 4:
        # skip synapseml lightgbm test for spark 4+
        return

    if task == "classification":
        metric = "accuracy"
        X_train, y_train = skds.load_iris(return_X_y=True, as_frame=True)
@@ -151,27 +165,32 @@ def test_spark_synapseml_rank():
    _test_spark_synapseml_lightgbm(spark, "rank")


def test_spark_input_df():
    df = (
        spark.read.format("csv")
        .option("header", True)
        .option("inferSchema", True)
        .load("wasbs://publicwasb@mmlspark.blob.core.windows.net/company_bankruptcy_prediction_data.csv")
    )
def test_spark_input_df_and_pickle():
    import pandas as pd

    file_url = "https://mmlspark.blob.core.windows.net/publicwasb/company_bankruptcy_prediction_data.csv"
    df = pd.read_csv(file_url)
    df = spark.createDataFrame(df)
    train, test = df.randomSplit([0.8, 0.2], seed=1)
    feature_cols = df.columns[1:]
    featurizer = VectorAssembler(inputCols=feature_cols, outputCol="features")
    train_data = featurizer.transform(train)["Bankrupt?", "features"]
    test_data = featurizer.transform(test)["Bankrupt?", "features"]
    automl = AutoML()

    # TODO: remove the estimator assignment once SynapseML supports spark 4+.
    from flaml.automl.spark.utils import _spark_major_minor_version

    estimator_list = ["rf_spark"] if _spark_major_minor_version[0] >= 4 else None

    settings = {
        "time_budget": 30,  # total running time in seconds
        "metric": "roc_auc",
        # "estimator_list": ["lgbm_spark"],  # list of ML learners; we tune lightgbm in this example
        "task": "classification",  # task type
        "log_file_name": "flaml_experiment.log",  # flaml log file
        "seed": 7654321,  # random seed
        "eval_method": "holdout",
        "estimator_list": estimator_list,  # TODO: remove once SynapseML supports spark 4+
    }
    df = to_pandas_on_spark(to_pandas_on_spark(train_data).to_spark(index_col="index"))

@@ -182,6 +201,22 @@ def test_spark_input_df():
        **settings,
    )

    # test pickle and load_pickle, should work for prediction
    automl.pickle("automl_spark.pkl")
    automl_loaded = AutoML().load_pickle("automl_spark.pkl")
    assert automl_loaded.best_estimator == automl.best_estimator
    assert automl_loaded.best_loss == automl.best_loss
    automl_loaded.predict(df)
    automl_loaded.model.estimator.transform(test_data)

    import shutil

    shutil.rmtree("automl_spark.pkl", ignore_errors=True)
    shutil.rmtree("automl_spark.pkl.flaml_artifacts", ignore_errors=True)

    if estimator_list == ["rf_spark"]:
        return

    try:
        model = automl.model.estimator
        predictions = model.transform(test_data)
@@ -296,11 +331,88 @@ def _test_spark_large_df():
    print("time cost in minutes: ", (end_time - start_time) / 60)


def test_get_random_dataframe():
    # Test with default parameters
    df = get_random_dataframe(n_rows=50, ratio_none=0.2, seed=123)
    assert df.shape == (50, 14)  # Default is 200 rows and 14 columns

    # Test column types
    assert "timestamp" in df.columns and np.issubdtype(df["timestamp"].dtype, np.datetime64)
    assert "id" in df.columns and np.issubdtype(df["id"].dtype, np.integer)
    assert "score" in df.columns and np.issubdtype(df["score"].dtype, np.floating)
    assert "category" in df.columns and df["category"].dtype.name == "category"


def test_auto_convert_dtypes_pandas():
    # Create a test DataFrame with various types
    import pandas as pd

    test_df = pd.DataFrame(
        {
            "int_col": ["1", "2", "3", "4", "5", "6", "6"],
            "float_col": ["1.1", "2.2", "3.3", "NULL", "5.5", "6.6", "6.6"],
            "date_col": ["2021-01-01", "2021-02-01", "NA", "2021-04-01", "2021-05-01", "2021-06-01", "2021-06-01"],
            "cat_col": ["A", "B", "A", "A", "B", "A", "B"],
            "string_col": ["text1", "text2", "text3", "text4", "text5", "text6", "text7"],
        }
    )

    # Convert dtypes
    converted_df, schema = auto_convert_dtypes_pandas(test_df)

    # Check conversions
    assert schema["int_col"] == "int"
    assert schema["float_col"] == "double"
    assert schema["date_col"] == "timestamp"
    assert schema["cat_col"] == "category"
    assert schema["string_col"] == "string"


def test_auto_convert_dtypes_spark():
    """Test auto_convert_dtypes_spark function with various data types."""
    import pandas as pd

    # Create a test DataFrame with various types
    test_pdf = pd.DataFrame(
        {
            "int_col": ["1", "2", "3", "4", "NA"],
            "float_col": ["1.1", "2.2", "3.3", "NULL", "5.5"],
            "date_col": ["2021-01-01", "2021-02-01", "NA", "2021-04-01", "2021-05-01"],
            "cat_col": ["A", "B", "A", "C", "B"],
            "string_col": ["text1", "text2", "text3", "text4", "text5"],
        }
    )

    # Convert pandas DataFrame to Spark DataFrame
    test_df = spark.createDataFrame(test_pdf)

    # Convert dtypes
    converted_df, schema = auto_convert_dtypes_spark(test_df)

    # Check conversions
    assert schema["int_col"] == "int"
    assert schema["float_col"] == "double"
    assert schema["date_col"] == "timestamp"
    assert schema["cat_col"] == "string"  # Conceptual category in schema
    assert schema["string_col"] == "string"

    # Verify the actual data types from the Spark DataFrame
    spark_dtypes = dict(converted_df.dtypes)
    assert spark_dtypes["int_col"] == "int"
    assert spark_dtypes["float_col"] == "double"
    assert spark_dtypes["date_col"] == "timestamp"
    assert spark_dtypes["cat_col"] == "string"  # In Spark, categories are still strings
    assert spark_dtypes["string_col"] == "string"


if __name__ == "__main__":
    test_spark_synapseml_classification()
    test_spark_synapseml_regression()
    test_spark_synapseml_rank()
    test_spark_input_df()
    # test_spark_synapseml_classification()
    # test_spark_synapseml_regression()
    # test_spark_synapseml_rank()
    test_spark_input_df_and_pickle()
    # test_get_random_dataframe()
    # test_auto_convert_dtypes_pandas()
    # test_auto_convert_dtypes_spark()

    # import cProfile
    # import pstats

@@ -25,13 +25,13 @@ os.environ["FLAML_MAX_CONCURRENT"] = "2"
spark_available, _ = check_spark()
skip_spark = not spark_available

pytestmark = pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
pytestmark = [pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests."), pytest.mark.spark]


def test_parallel_xgboost(hpo_method=None, data_size=1000):
def test_parallel_xgboost_and_pickle(hpo_method=None, data_size=1000):
    automl_experiment = AutoML()
    automl_settings = {
        "time_budget": 10,
        "time_budget": 30,
        "metric": "ap",
        "task": "classification",
        "log_file_name": "test/sparse_classification.log",
@@ -53,15 +53,27 @@ def test_parallel_xgboost(hpo_method=None, data_size=1000):
    print(automl_experiment.best_iteration)
    print(automl_experiment.best_estimator)

    # test pickle and load_pickle, should work for prediction
    automl_experiment.pickle("automl_xgboost_spark.pkl")
    automl_loaded = AutoML().load_pickle("automl_xgboost_spark.pkl")
    assert automl_loaded.best_estimator == automl_experiment.best_estimator
    assert automl_loaded.best_loss == automl_experiment.best_loss
    automl_loaded.predict(X_train)

    import shutil

    shutil.rmtree("automl_xgboost_spark.pkl", ignore_errors=True)
    shutil.rmtree("automl_xgboost_spark.pkl.flaml_artifacts", ignore_errors=True)


def test_parallel_xgboost_others():
    # use random search as the hpo_method
    test_parallel_xgboost(hpo_method="random")
    test_parallel_xgboost_and_pickle(hpo_method="random")


@pytest.mark.skip(reason="currently not supporting too large data, will support spark dataframe in the future")
def test_large_dataset():
    test_parallel_xgboost(data_size=90000000)
    test_parallel_xgboost_and_pickle(data_size=90000000)


@pytest.mark.skipif(
@@ -95,10 +107,10 @@ def test_custom_learner(data_size=1000):


if __name__ == "__main__":
    test_parallel_xgboost()
    test_parallel_xgboost_others()
    # test_large_dataset()
    if skip_my_learner:
        print("please run pytest in the root directory of FLAML, i.e., the directory that contains the setup.py file")
    else:
        test_custom_learner()
    test_parallel_xgboost_and_pickle()
    # test_parallel_xgboost_others()
    # # test_large_dataset()
    # if skip_my_learner:
    #     print("please run pytest in the root directory of FLAML, i.e., the directory that contains the setup.py file")
    # else:
    #     test_custom_learner()

@@ -1,6 +1,7 @@
import os
import unittest

import pytest
from sklearn.datasets import load_wine

from flaml import AutoML
@@ -24,6 +25,8 @@ if os.path.exists(os.path.join(os.getcwd(), "test", "spark", "custom_mylearner.p
else:
    skip_my_learner = True

pytestmark = pytest.mark.spark


class TestEnsemble(unittest.TestCase):
    def setUp(self) -> None:

@@ -9,7 +9,7 @@ from flaml.tune.spark.utils import check_spark
spark_available, _ = check_spark()
skip_spark = not spark_available

pytestmark = pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
pytestmark = [pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests."), pytest.mark.spark]

os.environ["FLAML_MAX_CONCURRENT"] = "2"

@@ -22,7 +22,7 @@ def base_automl(n_concurrent_trials=1, use_ray=False, use_spark=False, verbose=0
    except (ServerError, Exception):
        from sklearn.datasets import fetch_california_housing

        X_train, y_train = fetch_california_housing(return_X_y=True)
        X_train, y_train = fetch_california_housing(return_X_y=True, data_home="test")
    automl = AutoML()
    settings = {
        "time_budget": 3,  # total running time in seconds

@@ -1,3 +1,4 @@
import atexit
import importlib
import os
import sys
@@ -13,6 +14,7 @@ from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

import flaml
from flaml.automl.spark import disable_spark_ansi_mode, restore_spark_ansi_mode
from flaml.automl.spark.utils import to_pandas_on_spark

try:
@@ -21,6 +23,7 @@ try:
    from pyspark.ml.feature import VectorAssembler
except ImportError:
    pass
pytestmark = pytest.mark.spark
warnings.filterwarnings("ignore")

skip_spark = importlib.util.find_spec("pyspark") is None
@@ -119,6 +122,29 @@ def _check_mlflow_logging(possible_num_runs, metric, is_parent_run, experiment_i
    # mlflow.delete_experiment(experiment_id)


@pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
def test_automl_nonsparkdata_noautolog_noparentrun():
    experiment_id = _test_automl_nonsparkdata(is_autolog=False, is_parent_run=False)
    _check_mlflow_logging(0, "r2", False, experiment_id, is_automl=True)  # no logging


@pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
def test_automl_sparkdata_noautolog_noparentrun():
    experiment_id = _test_automl_sparkdata(is_autolog=False, is_parent_run=False)
    _check_mlflow_logging(0, "mse", False, experiment_id, is_automl=True)  # no logging


@pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
def test_tune_noautolog_noparentrun_parallel():
    experiment_id = _test_tune(is_autolog=False, is_parent_run=False, is_parallel=True)
    _check_mlflow_logging(0, "r2", False, experiment_id)


def test_tune_noautolog_noparentrun_nonparallel():
    experiment_id = _test_tune(is_autolog=False, is_parent_run=False, is_parallel=False)
    _check_mlflow_logging(3, "r2", False, experiment_id, skip_tags=True)


@pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
def test_tune_autolog_parentrun_parallel():
    experiment_id = _test_tune(is_autolog=True, is_parent_run=True, is_parallel=True)
@@ -130,6 +156,16 @@ def test_tune_autolog_parentrun_nonparallel():
    _check_mlflow_logging(3, "r2", True, experiment_id)


def test_tune_autolog_noparentrun_nonparallel():
    experiment_id = _test_tune(is_autolog=True, is_parent_run=False, is_parallel=False)
    _check_mlflow_logging(3, "r2", False, experiment_id)


def test_tune_noautolog_parentrun_nonparallel():
    experiment_id = _test_tune(is_autolog=False, is_parent_run=True, is_parallel=False)
    _check_mlflow_logging(3, "r2", True, experiment_id)


@pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
def test_tune_autolog_noparentrun_parallel():
    experiment_id = _test_tune(is_autolog=True, is_parent_run=False, is_parallel=True)
@@ -142,28 +178,12 @@ def test_tune_noautolog_parentrun_parallel():
    _check_mlflow_logging([4, 3], "r2", True, experiment_id)


def test_tune_autolog_noparentrun_nonparallel():
    experiment_id = _test_tune(is_autolog=True, is_parent_run=False, is_parallel=False)
    _check_mlflow_logging(3, "r2", False, experiment_id)


def test_tune_noautolog_parentrun_nonparallel():
    experiment_id = _test_tune(is_autolog=False, is_parent_run=True, is_parallel=False)
    _check_mlflow_logging(3, "r2", True, experiment_id)


@pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
def test_tune_noautolog_noparentrun_parallel():
    experiment_id = _test_tune(is_autolog=False, is_parent_run=False, is_parallel=True)
    _check_mlflow_logging(0, "r2", False, experiment_id)


def test_tune_noautolog_noparentrun_nonparallel():
    experiment_id = _test_tune(is_autolog=False, is_parent_run=False, is_parallel=False)
    _check_mlflow_logging(3, "r2", False, experiment_id, skip_tags=True)


def _test_automl_sparkdata(is_autolog, is_parent_run):
    # TODO: remove the estimator assignment once SynapseML supports spark 4+.
    from flaml.automl.spark.utils import _spark_major_minor_version

    estimator_list = ["rf_spark"] if _spark_major_minor_version[0] >= 4 else None

    mlflow.end_run()
    mlflow_exp_name = f"test_mlflow_integration_{int(time.time())}"
    mlflow_experiment = mlflow.set_experiment(mlflow_exp_name)
@@ -174,6 +194,9 @@ def _test_automl_sparkdata(is_autolog, is_parent_run):
    if is_parent_run:
        mlflow.start_run(run_name=f"automl_sparkdata_autolog_{is_autolog}")
    spark = pyspark.sql.SparkSession.builder.getOrCreate()
    spark, ansi_conf, adjusted = disable_spark_ansi_mode()
    atexit.register(restore_spark_ansi_mode, spark, ansi_conf, adjusted)

    pd_df = load_diabetes(as_frame=True).frame
    df = spark.createDataFrame(pd_df)
    df = df.repartition(4).cache()
@@ -192,6 +215,7 @@ def _test_automl_sparkdata(is_autolog, is_parent_run):
        "log_type": "all",
        "n_splits": 2,
        "model_history": True,
        "estimator_list": estimator_list,
    }
    df = to_pandas_on_spark(to_pandas_on_spark(train_data).to_spark(index_col="index"))
    automl.fit(
@@ -251,12 +275,6 @@ def test_automl_sparkdata_noautolog_parentrun():
    _check_mlflow_logging(3, "mse", True, experiment_id, is_automl=True)


@pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
def test_automl_sparkdata_noautolog_noparentrun():
    experiment_id = _test_automl_sparkdata(is_autolog=False, is_parent_run=False)
    _check_mlflow_logging(0, "mse", False, experiment_id, is_automl=True)  # no logging


@pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
def test_automl_nonsparkdata_autolog_parentrun():
    experiment_id = _test_automl_nonsparkdata(is_autolog=True, is_parent_run=True)
@@ -275,12 +293,6 @@ def test_automl_nonsparkdata_noautolog_parentrun():
    _check_mlflow_logging([4, 3], "r2", True, experiment_id, is_automl=True)


@pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
def test_automl_nonsparkdata_noautolog_noparentrun():
    experiment_id = _test_automl_nonsparkdata(is_autolog=False, is_parent_run=False)
    _check_mlflow_logging(0, "r2", False, experiment_id, is_automl=True)  # no logging


@pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
def test_exit_pyspark_autolog():
    import pyspark
@@ -318,6 +330,9 @@ def _init_spark_for_main():
        "https://mmlspark.blob.core.windows.net/publicwasb/log_model_allowlist.txt",
    )

    spark, ansi_conf, adjusted = disable_spark_ansi_mode()
    atexit.register(restore_spark_ansi_mode, spark, ansi_conf, adjusted)


if __name__ == "__main__":
    _init_spark_for_main()

@@ -2,6 +2,7 @@ import os
import unittest

import numpy as np
import pytest
import scipy.sparse
from sklearn.datasets import load_iris, load_wine

@@ -12,6 +13,7 @@ from flaml.tune.spark.utils import check_spark

spark_available, _ = check_spark()
skip_spark = not spark_available
pytestmark = pytest.mark.spark

os.environ["FLAML_MAX_CONCURRENT"] = "2"

@@ -9,7 +9,7 @@ from flaml.tune.spark.utils import check_spark
spark_available, _ = check_spark()
skip_spark = not spark_available

pytestmark = pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
pytestmark = [pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests."), pytest.mark.spark]

here = os.path.abspath(os.path.dirname(__file__))
os.environ["FLAML_MAX_CONCURRENT"] = "2"

@@ -25,7 +25,7 @@ try:
except ImportError:
    skip_spark = True

pytestmark = pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
pytestmark = [pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests."), pytest.mark.spark]


def test_overtime():

@@ -2,8 +2,23 @@ import os
import sys

import pytest
from minio.error import ServerError
from openml.exceptions import OpenMLServerException

try:
    from minio.error import ServerError
except ImportError:

    class ServerError(Exception):
        pass


try:
    from openml.exceptions import OpenMLServerException
except ImportError:

    class OpenMLServerException(Exception):
        pass


from requests.exceptions import ChunkedEncodingError, SSLError

from flaml.tune.spark.utils import check_spark
@@ -11,19 +26,19 @@ from flaml.tune.spark.utils import check_spark
spark_available, _ = check_spark()
skip_spark = not spark_available

pytestmark = pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
pytestmark = [pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests."), pytest.mark.spark]

os.environ["FLAML_MAX_CONCURRENT"] = "2"


def run_automl(budget=3, dataset_format="dataframe", hpo_method=None):
def run_automl(budget=30, dataset_format="dataframe", hpo_method=None):
    import urllib3

    from flaml.automl.data import load_openml_dataset

    performance_check_budget = 3600
    if sys.platform == "darwin" or "nt" in os.name or "3.10" not in sys.version:
        budget = 3  # revise the budget if the platform is not linux + python 3.10
        budget = 30  # revise the budget if the platform is not linux + python 3.10
    if budget >= performance_check_budget:
        max_iter = 60
        performance_check_budget = None
@@ -76,6 +91,11 @@ def run_automl(budget=3, dataset_format="dataframe", hpo_method=None):
    print("Best ML learner:", automl.best_estimator)
    print("Best hyperparameter config:", automl.best_config)
    print(f"Best accuracy on validation data: {1 - automl.best_loss:.4g}")
    if performance_check_budget is not None and automl.best_estimator is None:
        # skip the performance check if no model is trained
        # this happens sometimes in github actions ubuntu python 3.12 environment
        print("Warning: no model is trained, skip performance check")
        return
    print(f"Training duration of best run: {automl.best_config_train_time:.4g} s")
    print(automl.model.estimator)
    print(automl.best_config_per_estimator)

@@ -14,7 +14,7 @@ from flaml.tune.spark.utils import check_spark
spark_available, _ = check_spark()
skip_spark = not spark_available

pytestmark = pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
pytestmark = [pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests."), pytest.mark.spark]

os.environ["FLAML_MAX_CONCURRENT"] = "2"
X, y = load_breast_cancer(return_X_y=True)

@@ -1,3 +1,4 @@
import atexit
import os
from functools import partial
from timeit import timeit
@@ -14,6 +15,7 @@ try:
    from pyspark.sql import SparkSession

    from flaml.automl.ml import sklearn_metric_loss_score
    from flaml.automl.spark import disable_spark_ansi_mode, restore_spark_ansi_mode
    from flaml.automl.spark.metrics import spark_metric_loss_score
    from flaml.automl.spark.utils import (
        iloc_pandas_on_spark,
@@ -24,6 +26,7 @@ try:
        unique_value_first_index,
    )
    from flaml.tune.spark.utils import (
        _spark_major_minor_version,
        check_spark,
        get_broadcast_data,
        get_n_cpus,
@@ -35,8 +38,39 @@ try:
except ImportError:
    print("Spark is not installed. Skip all spark tests.")
    skip_spark = True
    _spark_major_minor_version = (0, 0)

pytestmark = pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")

pytestmark = [pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests."), pytest.mark.spark]


@pytest.mark.skipif(_spark_major_minor_version[0] < 4, reason="Requires Spark 4.0+")
def test_to_pandas_on_spark_temp_override():
    import pyspark.pandas as ps
    from pyspark.sql import Row

    from flaml.automl.spark.utils import to_pandas_on_spark

    spark_session = SparkSession.builder.getOrCreate()
    spark, ansi_conf, adjusted = disable_spark_ansi_mode()
    atexit.register(restore_spark_ansi_mode, spark, ansi_conf, adjusted)

    # Ensure we can toggle options
    orig = ps.get_option("compute.fail_on_ansi_mode")

    try:
        spark_session.conf.set("spark.sql.ansi.enabled", "true")
        ps.set_option("compute.fail_on_ansi_mode", True)

        # create tiny spark df
        sdf = spark_session.createDataFrame([Row(a=1, b=2)])
        # Should not raise as our function temporarily disables fail_on_ansi_mode
        pds = to_pandas_on_spark(sdf)
        assert "a" in pds.columns
    finally:
        # restore test environment
        ps.set_option("compute.fail_on_ansi_mode", orig)
        spark_session.conf.set("spark.sql.ansi.enabled", "false")


def test_with_parameters_spark():

@@ -5,17 +5,38 @@ import sys
import unittest

import numpy as np
import openml

try:
    import openml
except ImportError:
    openml = None
import pandas as pd
import pytest
import scipy.sparse
from minio.error import ServerError

try:
    from minio.error import ServerError
except ImportError:

    class ServerError(Exception):
        pass


from requests.exceptions import SSLError
from sklearn.metrics import mean_absolute_error, mean_squared_error

from flaml import AutoVW
from flaml.tune import loguniform, polynomial_expansion_set

try:
    from vowpalwabbit import pyvw
except ImportError:
    skip_vw_test = True
else:
    skip_vw_test = False

pytest.skip("skipping if no openml", allow_module_level=True) if openml is None else None

VW_DS_DIR = "test/data/"
NS_LIST = list(string.ascii_lowercase) + list(string.ascii_uppercase)
logger = logging.getLogger(__name__)
@@ -351,14 +372,9 @@ def get_vw_tuning_problem(tuning_hp="NamesapceInteraction"):
    return vw_oml_problem_args, vw_online_aml_problem


@pytest.mark.skipif(
    "3.10" in sys.version or "3.11" in sys.version,
    reason="do not run on py >= 3.10",
)
@pytest.mark.skipif(skip_vw_test, reason="vowpalwabbit not installed")
class TestAutoVW(unittest.TestCase):
    def test_vw_oml_problem_and_vanilla_vw(self):
        from vowpalwabbit import pyvw

        try:
            vw_oml_problem_args, vw_online_aml_problem = get_vw_tuning_problem()
        except (SSLError, ServerError, Exception) as e:

@@ -59,6 +59,17 @@ def _test_hf_data():
    except requests.exceptions.ConnectionError:
        return

    # Tests will only run if there is a GPU available
    try:
        import ray

        pg = ray.util.placement_group([{"CPU": 1, "GPU": 1}])

        if not pg.wait(timeout_seconds=10):  # Wait 10 seconds for resources
            raise RuntimeError("No available node types can fulfill resource request!")
    except RuntimeError:
        return

    custom_sent_keys = ["sentence1", "sentence2"]
    label_key = "label"

@@ -4,10 +4,17 @@ from collections import defaultdict

import numpy as np
import pytest
import thop
import torch
import torch.nn as nn
import torch.nn.functional as F

try:
    import thop
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
except ImportError:
    thop = None
    torch = None
    nn = None
    F = None

try:
    import torchvision
@@ -16,6 +23,11 @@ except ImportError:

from flaml import tune

if thop is None or torch is None or nn is None or F is None or torchvision is None:
    pytest.skip(
        "skipping test_lexiflow.py because torch, torchvision or thop is not installed.", allow_module_level=True
    )

DEVICE = torch.device("cpu")
BATCHSIZE = 128
N_TRAIN_EXAMPLES = BATCHSIZE * 30

@@ -53,6 +53,11 @@ def _easy_objective(config):


def test_nested_run():
    """
    nested tuning example: Tune -> AutoML -> MLflow autolog
    mlflow logging is complicated in nested tuning. It's better to turn off mlflow autologging to avoid
    potential issues in FLAML's mlflow_integration.adopt_children() function.
    """
    from flaml import AutoML, tune

    data, labels = sklearn.datasets.load_breast_cancer(return_X_y=True)

@@ -6,12 +6,12 @@ from sklearn.model_selection import train_test_split
from flaml import tune
from flaml.automl.model import LGBMEstimator

data = fetch_california_housing(return_X_y=False, as_frame=True)
data = fetch_california_housing(return_X_y=False, as_frame=True, data_home="test")
df, X, y = data.frame, data.data, data.target
df_train, _, X_train, X_test, _, y_test = train_test_split(df, X, y, test_size=0.33, random_state=42)
csv_file_name = "test/housing.csv"
df_train.to_csv(csv_file_name, index=False)
# X, y = fetch_california_housing(return_X_y=True, as_frame=True)
# X, y = fetch_california_housing(return_X_y=True, as_frame=True, data_home="test")
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.33, random_state=42
# )

132  website/docs/Best-Practices.md  Normal file
@@ -0,0 +1,132 @@
# Best Practices

This page collects practical guidance for using FLAML effectively across common tasks.

## General tips

- Start simple: set `task`, `time_budget`, and keep `metric="auto"` unless you have a strong reason to override.
- Prefer correct splits: ensure your evaluation strategy matches your data (time series vs i.i.d., grouped data, etc.).
- Keep estimator lists explicit when debugging: start with a small `estimator_list` and expand.
- Use built-in discovery helpers to avoid stale hardcoded lists:

```python
from flaml import AutoML
from flaml.automl.task.factory import task_factory

automl = AutoML()
print("Built-in sklearn metrics:", sorted(automl.supported_metrics[0]))
print("classification estimators:", sorted(task_factory("classification").estimators.keys()))
```

## Classification

- **Metric**: for binary classification, `metric="roc_auc"` is common; for multiclass, `metric="log_loss"` is often robust.
- **Imbalanced data** (see the sketch after this list):
  - pass `sample_weight` to `AutoML.fit()`;
  - consider setting class weights via `custom_hp` / `fit_kwargs_by_estimator` for specific estimators (see [FAQ](FAQ)).
- **Probability vs label metrics**: use `roc_auc` / `log_loss` when you care about calibrated probabilities.
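
A minimal sketch of the `sample_weight` route; the synthetic dataset and the 60-second budget below are illustrative assumptions, not FLAML defaults:

```python
from flaml import AutoML
from sklearn.datasets import make_classification
from sklearn.utils.class_weight import compute_sample_weight

# Illustrative imbalanced binary problem: ~95% of samples in one class.
X, y = make_classification(n_samples=1000, weights=[0.95], random_state=0)
w = compute_sample_weight(class_weight="balanced", y=y)  # up-weight the rare class

automl = AutoML()
automl.fit(X, y, task="classification", metric="roc_auc", time_budget=60, sample_weight=w)
```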

## Regression

- **Default metric**: `metric="r2"` (minimizes `1 - r2`).
- If your target scale matters (e.g., dollar error), consider `mae`/`rmse`.

## Learning to rank

- Use `task="rank"` with group information (`groups` / `groups_val`) so metrics like `ndcg` and `ndcg@k` are meaningful.
- If you pass `metric="ndcg@10"`, also pass `groups` so FLAML can compute group-aware NDCG; a sketch follows.
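
A minimal ranking sketch with synthetic data; the group layout, label scale, and budget are illustrative assumptions:

```python
import numpy as np
from flaml import AutoML

rng = np.random.RandomState(0)
X = rng.rand(120, 5)
y = rng.randint(0, 5, size=120)        # graded relevance labels
groups = np.repeat(np.arange(12), 10)  # 12 queries, 10 documents each (one query id per row)

automl = AutoML()
automl.fit(X, y, task="rank", groups=groups, metric="ndcg@10", time_budget=30)
```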

## Time series forecasting

- Use time-aware splitting. For holdout validation, set `eval_method="holdout"` and use a time-ordered dataset (see the sketch after this list).
- Prefer supplying a DataFrame with a clear time column when possible.
- Optional time-series estimators depend on optional dependencies. To list what is available in your environment:

```python
from flaml.automl.task.factory import task_factory

print("forecast:", sorted(task_factory("forecast").estimators.keys()))
```
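
A time-ordered holdout sketch; the synthetic monthly series, the `ds`/`y` column names, and the horizon are illustrative assumptions:

```python
import numpy as np
import pandas as pd
from flaml import AutoML

# Four years of synthetic monthly data with the time column first.
df = pd.DataFrame(
    {
        "ds": pd.date_range("2020-01-01", periods=48, freq="MS"),
        "y": np.sin(np.arange(48) / 6) + np.random.RandomState(0).rand(48),
    }
)

automl = AutoML()
automl.fit(
    dataframe=df,
    label="y",
    task="ts_forecast",
    period=6,  # forecast horizon; the last 6 points form the time-ordered holdout
    eval_method="holdout",
    time_budget=15,
)
```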

## NLP (Transformers)

- Install the optional dependency: `pip install "flaml[hf]"`.
- When you provide a custom metric, ensure it returns `(metric_to_minimize, metrics_to_log)` with stable keys; see the sketch below.
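
A sketch of the `(metric_to_minimize, metrics_to_log)` contract; the signature below approximates FLAML's custom-metric convention with a catch-all for extra arguments, and the accuracy computation is an illustrative choice:

```python
from sklearn.metrics import accuracy_score


def custom_metric(X_val, y_val, estimator, labels, X_train, y_train, *args, **kwargs):
    y_pred = estimator.predict(X_val)
    val_acc = accuracy_score(y_val, y_pred)
    # The first element is minimized; the dict keys should stay stable across calls.
    return 1 - val_acc, {"val_acc": val_acc}


# automl.fit(X_train, y_train, task="seq-classification", metric=custom_metric, ...)
```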

## Speed, stability, and tricky settings

- **Time budget vs convergence**: if you see warnings about not all estimators converging, increase `time_budget` or reduce `estimator_list`.
- **Memory pressure / OOM**:
  - set `free_mem_ratio` (e.g., `0.2`) to keep free memory above a threshold;
  - set `model_history=False` to reduce stored artifacts.
- **Reproducibility**: set `seed` and keep `n_jobs` fixed; expect some runtime variance.
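
Putting those knobs together; every value here is an illustrative assumption, and `X_train` / `y_train` are assumed to be defined:

```python
from flaml import AutoML

automl = AutoML()
automl.fit(
    X_train,
    y_train,
    task="classification",
    time_budget=120,  # raise this if estimators do not converge
    estimator_list=["lgbm", "xgboost"],  # trim the list while debugging
    free_mem_ratio=0.2,  # keep at least 20% of memory free
    model_history=False,  # store fewer artifacts
    seed=7654321,  # with a fixed n_jobs, improves reproducibility
    n_jobs=4,
)
```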

## Persisting models

FLAML supports **both** MLflow logging and pickle-based persistence. For production deployment, MLflow logging is typically the most important option because it plugs into the MLflow ecosystem (tracking, model registry, serving, governance). For quick local reuse, persisting the whole `AutoML` object via pickle is often the most convenient.

### Option 1: MLflow logging (recommended for production)

When you run `AutoML.fit()` inside an MLflow run, FLAML can log metrics/params automatically (disable via `mlflow_logging=False` if needed). To persist the trained `AutoML` object as a model artifact and reuse MLflow tooling end-to-end:

```python
import mlflow
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from flaml import AutoML


X, y = load_iris(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

automl = AutoML()
mlflow.set_experiment("flaml")
with mlflow.start_run(run_name="flaml_run") as run:
    automl.fit(X_train, y_train, task="classification", time_budget=3, retrain_full=False, eval_method="holdout")

run_id = run.info.run_id

# Later (or in a different process)
automl2 = mlflow.sklearn.load_model(f"runs:/{run_id}/model")
assert np.array_equal(automl2.predict(X_test), automl.predict(X_test))
```

### Option 2: Pickle the full `AutoML` instance (convenient / Fabric)

Pickling stores the *entire* `AutoML` instance (not just the best estimator). This is useful when you prefer not to rely on MLflow or when you want to reuse additional attributes of the AutoML object without retraining.

In Microsoft Fabric scenarios, this is particularly important for re-plotting visualization figures without requiring model retraining.

```python
import mlflow
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from flaml import AutoML


X, y = load_iris(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

automl = AutoML()
mlflow.set_experiment("flaml")
with mlflow.start_run(run_name="flaml_run") as run:
    automl.fit(X_train, y_train, task="classification", time_budget=3, retrain_full=False, eval_method="holdout")

automl.pickle("automl.pkl")
automl2 = AutoML.load_pickle("automl.pkl")
assert np.array_equal(automl2.predict(X_test), automl.predict(X_test))
assert automl.best_config == automl2.best_config
assert automl.best_loss == automl2.best_loss
assert automl.mlflow_integration.infos == automl2.mlflow_integration.infos
```

See also: [Task-Oriented AutoML](Use-Cases/Task-Oriented-AutoML) and [FAQ](FAQ).

@@ -62,10 +62,10 @@ There is currently no formal reviewer solicitation process. Current reviewers id

```bash
git clone https://github.com/microsoft/FLAML.git
pip install -e FLAML[notebook,autogen]
pip install -e ".[notebook]"
```

In case the `pip install` command fails, try escaping the brackets such as `pip install -e FLAML\[notebook,autogen\]`.
In case the `pip install` command fails, try escaping the brackets such as `pip install -e .\[notebook\]`.

### Docker

@@ -80,11 +80,92 @@ from flaml import AutoML
|
||||
from sklearn.datasets import load_iris
|
||||
|
||||
X, y = load_iris(return_X_y=True)
|
||||
|
||||
automl = AutoML(settings={"time_budget": 3})
|
||||
settings = {"time_budget": 3}
|
||||
automl = AutoML(**settings)
|
||||
automl.fit(X, y)
|
||||
|
||||
print(f"{automl.best_estimator=}")
|
||||
print(f"{automl.best_config=}")
|
||||
print(f"params for best estimator: {automl.model.config2params(automl.best_config)}")
|
||||
```
|
||||
|
||||
If the automl instance is not accessible and you've the `best_config`. You can also convert it with below code:
|
||||
|
||||
```python
|
||||
from flaml.automl.task.factory import task_factory
|
||||
|
||||
task = "classification"
|
||||
best_estimator = "rf"
|
||||
best_config = {
|
||||
"n_estimators": 15,
|
||||
"max_features": 0.35807183923834934,
|
||||
"max_leaves": 12,
|
||||
"criterion": "gini",
|
||||
}
|
||||
|
||||
model_class = task_factory(task).estimator_class_from_str(best_estimator)(task=task)
|
||||
best_params = model_class.config2params(best_config)
|
||||
```
|
||||
|
||||
Then you can use it to train the sklearn estimators directly:
|
||||
|
||||
```python
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
|
||||
model = RandomForestClassifier(**best_params)
|
||||
model.fit(X, y)
|
||||
```
|
||||
|
||||
### How to save and load an AutoML object? (`pickle` / `load_pickle`)
|
||||
|
||||
FLAML provides `AutoML.pickle()` / `AutoML.load_pickle()` as a convenient and robust way to persist an AutoML run.
|
||||
|
||||
```python
|
||||
from flaml import AutoML
|
||||
|
||||
automl = AutoML()
|
||||
automl.fit(X_train, y_train, task="classification", time_budget=60)
|
||||
|
||||
# Save
|
||||
automl.pickle("automl.pkl")
|
||||
|
||||
# Load
|
||||
automl_loaded = AutoML.load_pickle("automl.pkl")
|
||||
pred = automl_loaded.predict(X_test)
|
||||
```
|
||||
|
||||
Notes:
|
||||
|
||||
- If you used Spark estimators, `AutoML.pickle()` externalizes Spark ML models into an adjacent artifact folder and keeps
|
||||
the pickle itself lightweight.
|
||||
- If you want to skip re-loading externalized Spark models (e.g., in an environment without Spark), use:
|
||||
|
||||
```python
|
||||
automl_loaded = AutoML.load_pickle("automl.pkl", load_spark_models=False)
|
||||
```

### How to list all available estimators for a task?

The available estimator set is task-dependent and can vary with optional dependencies. You can list the estimator keys that FLAML currently has registered in your environment:

```python
from flaml.automl.task.factory import task_factory

print(sorted(task_factory("classification").estimators.keys()))
print(sorted(task_factory("regression").estimators.keys()))
print(sorted(task_factory("forecast").estimators.keys()))
print(sorted(task_factory("rank").estimators.keys()))
```

### How to list supported built-in metrics?

```python
from flaml import AutoML

automl = AutoML()
sklearn_metrics, hf_metrics, spark_metrics = automl.supported_metrics
print(sorted(sklearn_metrics))
print(sorted(hf_metrics))
print(spark_metrics)
```

@@ -8,7 +8,6 @@ and optimizes their performance.

### Main Features

- FLAML enables building next-gen GPT-X applications based on multi-agent conversations with minimal effort. It simplifies the orchestration, automation and optimization of a complex GPT-X workflow. It maximizes the performance of GPT-X models and compensates for their weaknesses.
- For common machine learning tasks like classification and regression, it quickly finds quality models for user-provided data with low computational resources. It is easy to customize or extend.
- It supports fast and economical automatic tuning, capable of handling large search spaces with heterogeneous evaluation cost and complex constraints/guidance/early stopping.

@@ -16,45 +15,10 @@ FLAML is powered by a series of [research studies](/docs/Research) from Microsof

### Quickstart

Install FLAML from pip: `pip install flaml`. Find more options in [Installation](/docs/Installation).
Install FLAML from pip: `pip install flaml` (**requires Python >= 3.10**). Find more options in [Installation](/docs/Installation).

There are several ways of using flaml:

#### (New) [AutoGen](https://microsoft.github.io/autogen/)

AutoGen enables next-gen GPT-X applications with a generic multi-agent conversation framework.
It offers customizable and conversable agents which integrate LLMs, tools and humans.
By automating chat among multiple capable agents, one can easily make them collectively perform tasks autonomously or with human feedback, including tasks that require using tools via code. For example,

```python
from flaml import autogen

assistant = autogen.AssistantAgent("assistant")
user_proxy = autogen.UserProxyAgent("user_proxy")
user_proxy.initiate_chat(
    assistant,
    message="Show me the YTD gain of 10 largest technology companies as of today.",
)
# This initiates an automated chat between the two agents to solve the task
```

AutoGen also helps maximize the utility of expensive LLMs such as ChatGPT and GPT-4. It offers a drop-in replacement of `openai.Completion` or `openai.ChatCompletion` with powerful functionalities like tuning, caching, error handling, and templating. For example, you can optimize generations by LLM with your own tuning data, success metrics and budgets.

```python
# perform tuning
config, analysis = autogen.Completion.tune(
    data=tune_data,
    metric="success",
    mode="max",
    eval_func=eval_func,
    inference_budget=0.05,
    optimization_budget=3,
    num_samples=-1,
)
# perform inference for a test instance
response = autogen.Completion.create(context=test_instance, **config)
```

#### [Task-oriented AutoML](/docs/Use-Cases/task-oriented-automl)

With three lines of code, you can start using this economical and fast AutoML engine as a scikit-learn style estimator, as the sketch below shows.
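
A minimal sketch of that three-line usage (assuming `X_train` and `y_train` are already loaded; the task and time budget are illustrative):

```python
from flaml import AutoML

automl = AutoML()
automl.fit(X_train, y_train, task="classification", time_budget=60)
```
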
@@ -140,9 +104,10 @@ Then, you can use it just like you use the original `LGBMClassifier`. Your other

### Where to Go Next?

- Understand the use cases for [AutoGen](https://microsoft.github.io/autogen/), [Task-oriented AutoML](/docs/Use-Cases/Task-Oriented-Automl), [Tune user-defined function](/docs/Use-Cases/Tune-User-Defined-Function) and [Zero-shot AutoML](/docs/Use-Cases/Zero-Shot-AutoML).
- Find code examples under "Examples": from [AutoGen - AgentChat](/docs/Examples/AutoGen-AgentChat) to [Tune - PyTorch](/docs/Examples/Tune-PyTorch).
- Understand the use cases for [Task-oriented AutoML](/docs/Use-Cases/Task-Oriented-Automl), [Tune user-defined function](/docs/Use-Cases/Tune-User-Defined-Function) and [Zero-shot AutoML](/docs/Use-Cases/Zero-Shot-AutoML).
- Find code examples under "Examples": from [AutoML - Classification](/docs/Examples/AutoML-Classification) to [Tune - PyTorch](/docs/Examples/Tune-PyTorch).
- Learn about [research](/docs/Research) around FLAML and check [blogposts](/blog).
- Apply practical guidance in [Best Practices](/docs/Best-Practices).
- Chat on [Discord](https://discord.gg/Cppx2vSPVP).

If you like our project, please give it a [star](https://github.com/microsoft/FLAML/stargazers) on GitHub. If you are interested in contributing, please read [Contributor's Guide](/docs/Contribute).
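
The `@@ -140` hunk above refers to FLAML's "flamlized" estimators, used just like the original `LGBMClassifier`; since that example sits outside this excerpt, here is a hedged sketch of the drop-in pattern (assuming `flaml.default` exposes the flamlized `LGBMClassifier`, as in FLAML's zero-shot AutoML, and that training data is loaded):

```python
from flaml.default import LGBMClassifier  # flamlized drop-in for lightgbm's LGBMClassifier

clf = LGBMClassifier()  # data-dependent default hyperparameters are chosen at fit time
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
```
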
@@ -2,7 +2,7 @@

## Python

FLAML requires **Python version >= 3.7**. It can be installed from pip:
FLAML requires **Python version >= 3.10**. It can be installed from pip:

```bash
pip install flaml
@@ -16,12 +16,6 @@ conda install flaml -c conda-forge

### Optional Dependencies

#### [Autogen](Use-Cases/Autogen)

```bash
pip install "flaml[autogen]"
```

#### [Task-oriented AutoML](Use-Cases/Task-Oriented-AutoML)

```bash

@@ -32,15 +32,16 @@ from flaml import AutoML
automl = AutoML()
automl.fit(X_train, y_train, task="regression", time_budget=60, **other_settings)
# Save the model
with open("automl.pkl", "wb") as f:
    pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
automl.pickle("automl.pkl")

# At prediction time
with open("automl.pkl", "rb") as f:
    automl = pickle.load(f)
automl = AutoML.load_pickle("automl.pkl")
pred = automl.predict(X_test)
```

FLAML also supports plain `pickle.dump()` / `pickle.load()`, but `automl.pickle()` / `AutoML.load_pickle()` is recommended, especially when Spark estimators are involved.

If users provide only the minimal inputs, `AutoML` uses the default settings for the optimization metric, estimator list, etc.; a sketch of overriding these defaults follows below.
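
The metric and estimator names here are illustrative picks from FLAML's built-ins:

```python
automl.fit(
    X_train,
    y_train,
    task="classification",
    metric="f1",  # override the default optimization metric
    estimator_list=["lgbm", "rf"],  # restrict the estimators to search over
    time_budget=60,
)
```
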
## Customize AutoML.fit()
@@ -122,6 +123,18 @@ def custom_metric(

It returns the validation loss penalized by the gap between validation and training loss as the metric to minimize, and three metrics to log: val_loss, train_loss and pred_time. The arguments `config`, `groups_val` and `groups_train` are not used in the function.
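
For readers of this excerpt without the surrounding file, a sketch of such a custom metric (it mirrors FLAML's documented signature; the penalty weight `alpha` and the loss choice are illustrative):

```python
import time

from sklearn.metrics import log_loss


def custom_metric(
    X_val, y_val, estimator, labels,
    X_train, y_train, weight_val=None, weight_train=None,
    config=None, groups_val=None, groups_train=None,
):
    start = time.time()
    y_pred = estimator.predict_proba(X_val)
    pred_time = (time.time() - start) / len(X_val)
    val_loss = log_loss(y_val, y_pred, labels=labels, sample_weight=weight_val)
    y_pred = estimator.predict_proba(X_train)
    train_loss = log_loss(y_train, y_pred, labels=labels, sample_weight=weight_train)
    alpha = 0.5
    # first value: metric to minimize; second: metrics to log
    return val_loss * (1 + alpha) - alpha * train_loss, {
        "val_loss": val_loss,
        "train_loss": train_loss,
        "pred_time": pred_time,
    }
```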

You can also inspect what FLAML recognizes as built-in metrics at runtime:

```python
from flaml import AutoML

automl = AutoML()
sklearn_metrics, hf_metrics, spark_metrics = automl.supported_metrics
print(sorted(sklearn_metrics))
print(sorted(hf_metrics))
print(spark_metrics)
```

### Estimator and search space

The estimator list can contain one or more estimator names, each corresponding to a built-in estimator or a custom estimator (a usage sketch follows the list below). Each estimator has a search space for hyperparameter configurations. FLAML supports both classical machine learning models and deep neural networks.
@@ -146,11 +159,45 @@ The estimator list can contain one or more estimator names, each corresponding t
- 'sarimax': SARIMAX for task "ts_forecast". Hyperparameters: p, d, q, P, D, Q, s.
- 'holt-winters': Holt-Winters (triple exponential smoothing) model for task "ts_forecast". Hyperparameters: seasonal_periods, seasonal, use_boxcox, trend, damped_trend.
- 'transformer': Huggingface transformer models for task "seq-classification", "seq-regression", "multichoice-classification", "token-classification" and "summarization". Hyperparameters: learning_rate, num_train_epochs, per_device_train_batch_size, warmup_ratio, weight_decay, adam_epsilon, seed.
- 'temporal_fusion_transformer': TemporalFusionTransformerEstimator for task "ts_forecast_panel". Hyperparameters: gradient_clip_val, hidden_size, hidden_continuous_size, attention_head_size, dropout, learning_rate. There is a [known issue](https://github.com/jdb78/pytorch-forecasting/issues/1145) with pytorch-forecasting logging.
- 'tft': TemporalFusionTransformerEstimator for task "ts_forecast_panel". Hyperparameters: gradient_clip_val, hidden_size, hidden_continuous_size, attention_head_size, dropout, learning_rate.
- 'tcn': Temporal Convolutional Network (TCN) estimator for task "ts_forecast" (requires optional deep learning dependencies, e.g., `torch` and `pytorch_lightning`).
- Spark estimators (for Spark / pandas-on-Spark DataFrames; the exact set depends on your Spark runtime and installed packages):
  - 'lgbm_spark': Spark LightGBM models via [SynapseML](https://microsoft.github.io/SynapseML/docs/features/lightgbm/about/).
  - 'rf_spark': Spark MLlib RandomForestClassifier/Regressor.
  - 'gbt_spark': Spark MLlib GBTClassifier/GBTRegressor.
  - 'lr_spark': Spark MLlib LinearRegression.
  - 'glr_spark': Spark MLlib GeneralizedLinearRegression.
  - 'svc_spark': Spark MLlib LinearSVC (binary classification only).
  - 'nb_spark': Spark MLlib NaiveBayes (classification only).
  - 'aft_spark': Spark MLlib AFTSurvivalRegression.
- Custom estimator. Use custom estimator for:
  - tuning an estimator that is not built-in;
  - customizing search space for a built-in estimator.
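
As referenced above, a minimal sketch of passing such names through `estimator_list` (estimator keys taken from the built-in list; data assumed loaded):

```python
from flaml import AutoML

automl = AutoML()
automl.fit(
    X_train,
    y_train,
    task="classification",
    estimator_list=["lgbm", "xgboost", "extra_tree"],  # built-in estimator names
    time_budget=60,
)
```
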
#### List all available estimators (recommended)

The exact set of available estimators depends on the `task` and optional dependencies (e.g., Prophet/Orbit/PyTorch). To list the estimator keys available in your environment:

```python
from flaml.automl.task.factory import task_factory

print("classification:", sorted(task_factory("classification").estimators.keys()))
print("regression:", sorted(task_factory("regression").estimators.keys()))
print("forecast:", sorted(task_factory("forecast").estimators.keys()))
print("rank:", sorted(task_factory("rank").estimators.keys()))
```

For reference, the built-in estimator keys included in the codebase are:

- Tabular / ranking / NLP tasks (GenericTask):
  `['aft_spark', 'catboost', 'enet', 'extra_tree', 'gbt_spark', 'glr_spark', 'histgb', 'kneighbor', 'lassolars', 'lgbm', 'lgbm_spark', 'lr_spark', 'lrl1', 'lrl2', 'nb_spark', 'rf', 'rf_spark', 'sgd', 'svc', 'svc_spark', 'transformer', 'transformer_ms', 'xgb_limitdepth', 'xgboost']`
- Time series tasks (TimeSeriesTask):
  `['arima', 'avg', 'catboost', 'extra_tree', 'holt-winters', 'lassolars', 'lgbm', 'naive', 'prophet', 'rf', 'sarimax', 'savg', 'snaive', 'tcn', 'tft', 'xgb_limitdepth', 'xgboost', 'orbit']`

Some of the time series estimators (e.g., `prophet`, `orbit`, `tcn`, `tft`) are only available when the corresponding optional dependencies are installed.
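
A quick way to check which of those optional packages are importable in your environment (the module names below are the usual import names; adjust to your setup):

```python
import importlib.util

for module in ["prophet", "orbit", "torch", "pytorch_lightning"]:
    status = "available" if importlib.util.find_spec(module) else "missing"
    print(f"{module}: {status}")
```
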
#### Guidelines on tuning a custom estimator

To tune a custom estimator that is not built-in, you need to:
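
The steps themselves are cut off by the next hunk. As a hedged sketch of the overall shape (the class, the search-space values, and the `flaml.automl.model` import path are illustrative assumptions based on recent FLAML layouts):

```python
from sklearn.ensemble import ExtraTreesClassifier

from flaml import AutoML, tune
from flaml.automl.model import SKLearnEstimator  # assumed import path


class MyExtraTrees(SKLearnEstimator):
    # Wraps a scikit-learn estimator so FLAML can tune it.

    def __init__(self, task="classification", **config):
        super().__init__(task, **config)
        self.estimator_class = ExtraTreesClassifier

    @classmethod
    def search_space(cls, data_size, task):
        # each hyperparameter needs a domain, plus optional init values
        return {
            "n_estimators": {
                "domain": tune.lograndint(lower=4, upper=512),
                "init_value": 4,
                "low_cost_init_value": 4,
            },
        }


automl = AutoML()
automl.add_learner(learner_name="my_et", learner_class=MyExtraTrees)
automl.fit(X_train, y_train, task="classification", estimator_list=["my_et"], time_budget=30)
```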

@@ -99,6 +99,12 @@ module.exports = {
            'https://github.com/microsoft/FLAML/edit/main/website/',
          remarkPlugins: [math],
          rehypePlugins: [katex],
          // Allow __init__.md and other underscore-prefixed markdown docs
          exclude: [
            '**/_*.{js,jsx,ts,tsx}',
            '**/*.test.{js,jsx,ts,tsx}',
            '**/__tests__/**',
          ],
        },
        theme: {
          customCss: require.resolve('./src/css/custom.css'),

@@ -2,9 +2,9 @@
  "name": "website",
  "version": "0.0.0",
  "private": true,
  "resolutions" :{
    "nth-check":"2.0.1",
    "trim":"0.0.3",
  "resolutions": {
    "nth-check": "2.0.1",
    "trim": "0.0.3",
    "got": "11.8.5",
    "node-forge": "1.3.0",
    "minimatch": "3.0.5",
@@ -12,7 +12,7 @@
    "eta": "2.0.0",
    "@sideway/formula": "3.0.1",
    "http-cache-semantics": "4.1.1"
  },
  },
  "scripts": {
    "docusaurus": "docusaurus",
    "start": "docusaurus start",
@@ -33,13 +33,13 @@
    "clsx": "^1.1.1",
    "file-loader": "^6.2.0",
    "hast-util-is-element": "1.1.0",
    "minimatch": "3.0.5",
    "react": "^17.0.1",
    "react-dom": "^17.0.1",
    "rehype-katex": "4",
    "remark-math": "3",
    "trim": "^0.0.3",
    "url-loader": "^4.1.1",
    "minimatch": "3.0.5"
    "url-loader": "^4.1.1"
  },
  "browserslist": {
    "production": [

@@ -153,6 +153,15 @@
    "@babel/highlight" "^7.23.4"
    chalk "^2.4.2"

"@babel/code-frame@^7.26.2":
  version "7.26.2"
  resolved "https://registry.yarnpkg.com/@babel/code-frame/-/code-frame-7.26.2.tgz#4b5fab97d33338eff916235055f0ebc21e573a85"
  integrity sha512-RJlIHRueQgwWitWgF8OdFYGZX328Ax5BCemNGlqHfplnRT9ESi8JkFlvaVYbS+UubVY6dpv87Fs2u5M29iNFVQ==
  dependencies:
    "@babel/helper-validator-identifier" "^7.25.9"
    js-tokens "^4.0.0"
    picocolors "^1.0.0"

"@babel/compat-data@^7.17.7", "@babel/compat-data@^7.20.0", "@babel/compat-data@^7.20.1":
  version "7.20.1"
  resolved "https://registry.npmmirror.com/@babel/compat-data/-/compat-data-7.20.1.tgz#f2e6ef7790d8c8dbf03d379502dcc246dcce0b30"
@@ -429,6 +438,11 @@
  resolved "https://registry.yarnpkg.com/@babel/helper-string-parser/-/helper-string-parser-7.23.4.tgz#9478c707febcbbe1ddb38a3d91a2e054ae622d83"
  integrity sha512-803gmbQdqwdf4olxrX4AJyFBV/RTr3rSmOj0rKwesmzlfhYNDEs+/iOcznzpNWlJlIlTJC2QfPFcHB6DlzdVLQ==

"@babel/helper-string-parser@^7.25.9":
  version "7.25.9"
  resolved "https://registry.yarnpkg.com/@babel/helper-string-parser/-/helper-string-parser-7.25.9.tgz#1aabb72ee72ed35789b4bbcad3ca2862ce614e8c"
  integrity sha512-4A/SCr/2KLd5jrtOMFzaKjVtAei3+2r/NChoBNoZ3EyP/+GlhoaEGoWOZUmFmoITP7zOJyHIMm+DYRd8o3PvHA==

"@babel/helper-validator-identifier@^7.18.6", "@babel/helper-validator-identifier@^7.19.1":
  version "7.19.1"
  resolved "https://registry.npmmirror.com/@babel/helper-validator-identifier/-/helper-validator-identifier-7.19.1.tgz#7eea834cf32901ffdc1a7ee555e2f9c27e249ca2"
@@ -439,6 +453,11 @@
  resolved "https://registry.yarnpkg.com/@babel/helper-validator-identifier/-/helper-validator-identifier-7.22.20.tgz#c4ae002c61d2879e724581d96665583dbc1dc0e0"
  integrity sha512-Y4OZ+ytlatR8AI+8KZfKuL5urKp7qey08ha31L8b3BwewJAoJamTzyvxPR/5D+KkdJCGPq/+8TukHBlY10FX9A==

"@babel/helper-validator-identifier@^7.25.9":
  version "7.25.9"
  resolved "https://registry.yarnpkg.com/@babel/helper-validator-identifier/-/helper-validator-identifier-7.25.9.tgz#24b64e2c3ec7cd3b3c547729b8d16871f22cbdc7"
  integrity sha512-Ed61U6XJc3CVRfkERJWDz4dJwKe7iLmmJsbOGu9wSloNSFttHV0I8g6UAgb7qnK5ly5bGLPd4oXZlxCdANBOWQ==

"@babel/helper-validator-option@^7.18.6":
  version "7.18.6"
  resolved "https://registry.npmmirror.com/@babel/helper-validator-option/-/helper-validator-option-7.18.6.tgz#bf0d2b5a509b1f336099e4ff36e1a63aa5db4db8"
@@ -455,13 +474,12 @@
    "@babel/types" "^7.19.0"

"@babel/helpers@^7.12.5", "@babel/helpers@^7.20.1":
  version "7.20.1"
  resolved "https://registry.npmmirror.com/@babel/helpers/-/helpers-7.20.1.tgz#2ab7a0fcb0a03b5bf76629196ed63c2d7311f4c9"
  integrity sha512-J77mUVaDTUJFZ5BpP6mMn6OIl3rEWymk2ZxDBQJUG3P+PbmyMcF3bYWvz0ma69Af1oobDqT/iAsvzhB58xhQUg==
  version "7.26.10"
  resolved "https://registry.yarnpkg.com/@babel/helpers/-/helpers-7.26.10.tgz#6baea3cd62ec2d0c1068778d63cb1314f6637384"
  integrity sha512-UPYc3SauzZ3JGgj87GgZ89JVdC5dj0AoetR5Bw6wj4niittNyFh6+eOGonYvJ1ao6B8lEa3Q3klS7ADZ53bc5g==
  dependencies:
    "@babel/template" "^7.18.10"
    "@babel/traverse" "^7.20.1"
    "@babel/types" "^7.20.0"
    "@babel/template" "^7.26.9"
    "@babel/types" "^7.26.10"

"@babel/highlight@^7.18.6":
  version "7.18.6"
@@ -491,6 +509,13 @@
  resolved "https://registry.yarnpkg.com/@babel/parser/-/parser-7.23.6.tgz#ba1c9e512bda72a47e285ae42aff9d2a635a9e3b"
  integrity sha512-Z2uID7YJ7oNvAI20O9X0bblw7Qqs8Q2hFy0R9tAfnfLkp5MW0UH9eUvnDSnFwKZ0AvgS1ucqR4KzvVHgnke1VQ==

"@babel/parser@^7.26.9":
  version "7.26.10"
  resolved "https://registry.yarnpkg.com/@babel/parser/-/parser-7.26.10.tgz#e9bdb82f14b97df6569b0b038edd436839c57749"
  integrity sha512-6aQR2zGE/QFi8JpDLjUZEPYOs7+mhKXm86VaKFiLP35JQwQb6bwUE+XbvkH0EptsYhbNBSUGaUBLKqxH1xSgsA==
  dependencies:
    "@babel/types" "^7.26.10"

"@babel/plugin-bugfix-safari-id-destructuring-collision-in-function-expression@^7.18.6":
  version "7.18.6"
  resolved "https://registry.npmmirror.com/@babel/plugin-bugfix-safari-id-destructuring-collision-in-function-expression/-/plugin-bugfix-safari-id-destructuring-collision-in-function-expression-7.18.6.tgz#da5b8f9a580acdfbe53494dba45ea389fb09a4d2"
@@ -1196,19 +1221,19 @@
    "@babel/plugin-transform-typescript" "^7.18.6"

"@babel/runtime-corejs3@^7.15.4":
  version "7.20.1"
  resolved "https://registry.npmmirror.com/@babel/runtime-corejs3/-/runtime-corejs3-7.20.1.tgz#d0775a49bb5fba77e42cbb7276c9955c7b05af8d"
  integrity sha512-CGulbEDcg/ND1Im7fUNRZdGXmX2MTWVVZacQi/6DiKE5HNwZ3aVTm5PV4lO8HHz0B2h8WQyvKKjbX5XgTtydsg==
  version "7.26.10"
  resolved "https://registry.yarnpkg.com/@babel/runtime-corejs3/-/runtime-corejs3-7.26.10.tgz#5a3185ca2813f8de8ae68622572086edf5cf51f2"
  integrity sha512-uITFQYO68pMEYR46AHgQoyBg7KPPJDAbGn4jUTIRgCFJIp88MIBUianVOplhZDEec07bp9zIyr4Kp0FCyQzmWg==
  dependencies:
    core-js-pure "^3.25.1"
    regenerator-runtime "^0.13.10"
    core-js-pure "^3.30.2"
    regenerator-runtime "^0.14.0"

"@babel/runtime@^7.1.2", "@babel/runtime@^7.10.2", "@babel/runtime@^7.10.3", "@babel/runtime@^7.12.13", "@babel/runtime@^7.15.4", "@babel/runtime@^7.8.4":
  version "7.20.1"
  resolved "https://registry.npmmirror.com/@babel/runtime/-/runtime-7.20.1.tgz#1148bb33ab252b165a06698fde7576092a78b4a9"
  integrity sha512-mrzLkl6U9YLF8qpqI7TB82PESyEGjm/0Ly91jG575eVxMMlb8fYfOXFZIJ8XfLrJZQbm7dlKry2bJmXBUEkdFg==
  version "7.26.10"
  resolved "https://registry.yarnpkg.com/@babel/runtime/-/runtime-7.26.10.tgz#a07b4d8fa27af131a633d7b3524db803eb4764c2"
  integrity sha512-2WJMeRQPHKSPemqk/awGrAiuFfzBmOIPXKizAsVhWH9YJqLZ0H+HS4c8loHGgW6utJ3E/ejXQUsiGaQy2NZ9Fw==
  dependencies:
    regenerator-runtime "^0.13.10"
    regenerator-runtime "^0.14.0"

"@babel/template@^7.12.7", "@babel/template@^7.18.10":
  version "7.18.10"
@@ -1228,6 +1253,15 @@
    "@babel/parser" "^7.22.15"
    "@babel/types" "^7.22.15"

"@babel/template@^7.26.9":
  version "7.26.9"
  resolved "https://registry.yarnpkg.com/@babel/template/-/template-7.26.9.tgz#4577ad3ddf43d194528cff4e1fa6b232fa609bb2"
  integrity sha512-qyRplbeIpNZhmzOysF/wFMuP9sctmh2cFzRAZOn1YapxBsE1i9bJIY586R/WBLfLcmcBlM8ROBiQURnnNy+zfA==
  dependencies:
    "@babel/code-frame" "^7.26.2"
    "@babel/parser" "^7.26.9"
    "@babel/types" "^7.26.9"

"@babel/traverse@^7.12.13", "@babel/traverse@^7.12.9", "@babel/traverse@^7.19.0", "@babel/traverse@^7.19.1", "@babel/traverse@^7.20.1":
  version "7.23.6"
  resolved "https://registry.yarnpkg.com/@babel/traverse/-/traverse-7.23.6.tgz#b53526a2367a0dd6edc423637f3d2d0f2521abc5"
@@ -1262,6 +1296,14 @@
    "@babel/helper-validator-identifier" "^7.22.20"
    to-fast-properties "^2.0.0"

"@babel/types@^7.26.10", "@babel/types@^7.26.9":
  version "7.26.10"
  resolved "https://registry.yarnpkg.com/@babel/types/-/types-7.26.10.tgz#396382f6335bd4feb65741eacfc808218f859259"
  integrity sha512-emqcG3vHrpxUKTrxcblR36dcrcoRDvKmnL/dCL6ZsHaShW80qxCAcNhzQZrpeM765VzEos+xOi4s+r4IXzTwdQ==
  dependencies:
    "@babel/helper-string-parser" "^7.25.9"
    "@babel/helper-validator-identifier" "^7.25.9"

"@docsearch/css@3.3.0":
  version "3.3.0"
  resolved "https://registry.npmmirror.com/@docsearch/css/-/css-3.3.0.tgz#d698e48302d12240d7c2f7452ccb2d2239a8cd80"
@@ -2592,9 +2634,9 @@ ajv@^8.0.0, ajv@^8.8.0:
    uri-js "^4.2.2"

algoliasearch-helper@^3.5.5:
  version "3.11.1"
  resolved "https://registry.npmmirror.com/algoliasearch-helper/-/algoliasearch-helper-3.11.1.tgz#d83ab7f1a2a374440686ef7a144b3c288b01188a"
  integrity sha512-mvsPN3eK4E0bZG0/WlWJjeqe/bUD2KOEVOl0GyL/TGXn6wcpZU8NOuztGHCUKXkyg5gq6YzUakVTmnmSSO5Yiw==
  version "3.26.0"
  resolved "https://registry.yarnpkg.com/algoliasearch-helper/-/algoliasearch-helper-3.26.0.tgz#d6e283396a9fc5bf944f365dc3b712570314363f"
  integrity sha512-Rv2x3GXleQ3ygwhkhJubhhYGsICmShLAiqtUuJTUkr9uOCOXyF2E71LVT4XDnVffbknv8XgScP4U0Oxtgm+hIw==
  dependencies:
    "@algolia/events" "^4.0.1"

@@ -2863,9 +2905,9 @@ boxen@^5.0.0, boxen@^5.0.1:
    wrap-ansi "^7.0.0"

brace-expansion@^1.1.7:
  version "1.1.11"
  resolved "https://registry.npmmirror.com/brace-expansion/-/brace-expansion-1.1.11.tgz#3c7fcbf529d87226f3d2f52b966ff5271eb441dd"
  integrity sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==
  version "1.1.12"
  resolved "https://registry.yarnpkg.com/brace-expansion/-/brace-expansion-1.1.12.tgz#ab9b454466e5a8cc3a187beaad580412a9c5b843"
  integrity sha512-9T9UjW3r0UW5c1Q7GTwllptXwhvYmEzFhzMfZ9H7FQWt+uZePjZPjBP/W1ZEyZ1twGWom5/56TF4lPcqjnDHcg==
  dependencies:
    balanced-match "^1.0.0"
    concat-map "0.0.1"
@@ -2995,15 +3037,10 @@ caniuse-api@^3.0.0:
    lodash.memoize "^4.1.2"
    lodash.uniq "^4.5.0"

caniuse-lite@^1.0.0, caniuse-lite@^1.0.30001400, caniuse-lite@^1.0.30001426:
  version "1.0.30001430"
  resolved "https://registry.npmmirror.com/caniuse-lite/-/caniuse-lite-1.0.30001430.tgz#638a8ae00b5a8a97e66ff43733b2701f81b101fa"
  integrity sha512-IB1BXTZKPDVPM7cnV4iaKaHxckvdr/3xtctB3f7Hmenx3qYBhGtTZ//7EllK66aKXW98Lx0+7Yr0kxBtIt3tzg==

caniuse-lite@^1.0.30001646:
  version "1.0.30001657"
  resolved "https://registry.yarnpkg.com/caniuse-lite/-/caniuse-lite-1.0.30001657.tgz#29fd504bffca719d1c6b63a1f6f840be1973a660"
  integrity sha512-DPbJAlP8/BAXy3IgiWmZKItubb3TYGP0WscQQlVGIfT4s/YlFYVuJgyOsQNP7rJRChx/qdMeLJQJP0Sgg2yjNA==
caniuse-lite@^1.0.0, caniuse-lite@^1.0.30001400, caniuse-lite@^1.0.30001426, caniuse-lite@^1.0.30001646:
  version "1.0.30001718"
  resolved "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001718.tgz"
  integrity sha512-AflseV1ahcSunK53NfEs9gFWgOEmzr0f+kaMFA4xiLZlr9Hzt7HxcSpIFcnNCUkz6R6dWKa54rUz3HUmI3nVcw==

ccount@^1.0.0, ccount@^1.0.3:
  version "1.1.0"
@@ -3326,10 +3363,10 @@ core-js-compat@^3.25.1:
  dependencies:
    browserslist "^4.21.4"

core-js-pure@^3.25.1:
  version "3.26.0"
  resolved "https://registry.npmmirror.com/core-js-pure/-/core-js-pure-3.26.0.tgz#7ad8a5dd7d910756f3124374b50026e23265ca9a"
  integrity sha512-LiN6fylpVBVwT8twhhluD9TzXmZQQsr2I2eIKtWNbZI1XMfBT7CV18itaN6RA7EtQd/SDdRx/wzvAShX2HvhQA==
core-js-pure@^3.30.2:
  version "3.41.0"
  resolved "https://registry.yarnpkg.com/core-js-pure/-/core-js-pure-3.41.0.tgz#349fecad168d60807a31e83c99d73d786fe80811"
  integrity sha512-71Gzp96T9YPk63aUvE5Q5qP+DryB4ZloUZPSOebGM88VNw8VNfvdA7z6kGA8iGOTEzAomsRidp4jXSmUIJsL+Q==

core-js@^3.18.0:
  version "3.26.0"
@@ -3371,9 +3408,9 @@ cross-fetch@^3.1.5:
    node-fetch "2.6.7"

cross-spawn@^7.0.3:
  version "7.0.3"
  resolved "https://registry.npmmirror.com/cross-spawn/-/cross-spawn-7.0.3.tgz#f73a85b9d5d41d045551c177e2882d4ac85728a6"
  integrity sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w==
  version "7.0.6"
  resolved "https://registry.yarnpkg.com/cross-spawn/-/cross-spawn-7.0.6.tgz#8a58fe78f00dcd70c370451759dfbfaf03e8ee9f"
  integrity sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==
  dependencies:
    path-key "^3.1.0"
    shebang-command "^2.0.0"
@@ -4830,9 +4867,9 @@ http-parser-js@>=0.5.1:
  integrity sha512-SGeBX54F94Wgu5RH3X5jsDtf4eHyRogWX1XGT3b4HuW3tQPM4AaBzoUji/4AAJNXCEOWZ5O0DgZmJw1947gD5Q==

http-proxy-middleware@^2.0.3:
  version "2.0.7"
  resolved "https://registry.yarnpkg.com/http-proxy-middleware/-/http-proxy-middleware-2.0.7.tgz#915f236d92ae98ef48278a95dedf17e991936ec6"
  integrity sha512-fgVY8AV7qU7z/MmXJ/rxwbrtQH4jBQ9m7kp3llF0liB7glmFeVZFBepQb32T3y8n8k2+AEYuMPCpinYW+/CuRA==
  version "2.0.9"
  resolved "https://registry.yarnpkg.com/http-proxy-middleware/-/http-proxy-middleware-2.0.9.tgz#e9e63d68afaa4eee3d147f39149ab84c0c2815ef"
  integrity sha512-c1IyJYLYppU574+YI7R4QyX2ystMtVXZwIdzazUIPIJsHuWNd+mho2j+bKoHftndicGj9yh+xjd+l0yj7VeT1Q==
  dependencies:
    "@types/http-proxy" "^1.17.8"
    http-proxy "^1.18.1"
@@ -5709,9 +5746,9 @@ multicast-dns@^7.2.5:
    thunky "^1.0.2"

nanoid@^3.3.6:
  version "3.3.6"
  resolved "https://registry.yarnpkg.com/nanoid/-/nanoid-3.3.6.tgz#443380c856d6e9f9824267d960b4236ad583ea4c"
  integrity sha512-BGcqMMJuToF7i1rt+2PWSNVnWIkGCU78jBG3RxO/bZlnZPK2Cmi2QaffxGO/2RvWi9sL+FAiRiXMgsyxQ1DIDA==
  version "3.3.8"
  resolved "https://registry.yarnpkg.com/nanoid/-/nanoid-3.3.8.tgz#b1be3030bee36aaff18bacb375e5cce521684baf"
  integrity sha512-WNLf5Sd8oZxOm+TzppcYk8gVOgP+l58xNy58D0nbUnOxOWRWvlcCV4kUF7ltmI6PsrLl/BgKEyS4mqsGChFN0w==

negotiator@0.6.3:
  version "0.6.3"
@@ -6441,9 +6478,9 @@ prism-react-renderer@^1.2.1:
  integrity sha512-IJ+MSwBWKG+SM3b2SUfdrhC+gu01QkV2KmRQgREThBfSQRoufqRfxfHUxpG1WcaFjP+kojcFyO9Qqtpgt3qLCg==

prismjs@^1.23.0:
  version "1.29.0"
  resolved "https://registry.npmmirror.com/prismjs/-/prismjs-1.29.0.tgz#f113555a8fa9b57c35e637bba27509dcf802dd12"
  integrity sha512-Kx/1w86q/epKcmte75LNrEoT+lX8pBpavuAbvJWRXar7Hz8jrtF+e3vY751p0R8H9HdArwaCTNDDzHg/ScJK1Q==
  version "1.30.0"
  resolved "https://registry.yarnpkg.com/prismjs/-/prismjs-1.30.0.tgz#d9709969d9d4e16403f6f348c63553b19f0975a9"
  integrity sha512-DEvV2ZF2r2/63V+tK8hQvrR2ZGn10srHbXviTlcv7Kpzw8jWiNTqbVgjO3IY8RxrrOUF8VPMQQFysYYYv0YZxw==

process-nextick-args@~2.0.0:
  version "2.0.1"
@@ -6816,10 +6853,10 @@ regenerate@^1.4.2:
  resolved "https://registry.npmmirror.com/regenerate/-/regenerate-1.4.2.tgz#b9346d8827e8f5a32f7ba29637d398b69014848a"
  integrity sha512-zrceR/XhGYU/d/opr2EKO7aRHUeiBI8qjtfHqADTwZd6Szfy16la6kqD0MIUs5z5hx6AaKa+PixpPrR289+I0A==

regenerator-runtime@^0.13.10:
  version "0.13.10"
  resolved "https://registry.npmmirror.com/regenerator-runtime/-/regenerator-runtime-0.13.10.tgz#ed07b19616bcbec5da6274ebc75ae95634bfc2ee"
  integrity sha512-KepLsg4dU12hryUO7bp/axHAKvwGOCV0sGloQtpagJ12ai+ojVDqkeGSiRX1zlq+kjIMZ1t7gpze+26QqtdGqw==
regenerator-runtime@^0.14.0:
  version "0.14.1"
  resolved "https://registry.yarnpkg.com/regenerator-runtime/-/regenerator-runtime-0.14.1.tgz#356ade10263f685dda125100cd862c1db895327f"
  integrity sha512-dYnhHh0nJoMfnkZs6GmmhFknAGRrLznOu5nc9ML+EJxGvrx6H7teuevqVqCuPcPK//3eDrrjQhehXVx9cnkGdw==

regenerator-transform@^0.15.0:
  version "0.15.0"
@@ -7272,14 +7309,7 @@ send@0.19.0:
    range-parser "~1.2.1"
    statuses "2.0.1"

serialize-javascript@^6.0.0:
  version "6.0.0"
  resolved "https://registry.npmmirror.com/serialize-javascript/-/serialize-javascript-6.0.0.tgz#efae5d88f45d7924141da8b5c3a7a7e663fefeb8"
  integrity sha512-Qr3TosvguFt8ePWqsvRfrKyQXIiW+nGbYpy8XK24NQHE83caxWt+mIymTT19DGFbNWNLfEwsrkSmN64lVWB9ag==
  dependencies:
    randombytes "^2.1.0"

serialize-javascript@^6.0.1:
serialize-javascript@^6.0.0, serialize-javascript@^6.0.1:
  version "6.0.2"
  resolved "https://registry.yarnpkg.com/serialize-javascript/-/serialize-javascript-6.0.2.tgz#defa1e055c83bf6d59ea805d8da862254eb6a6c2"
  integrity sha512-Saa1xPByTTq2gdeFZYLLo+RFE35NHZkAbqZeWNd3BpzppeVisAqpDjcp8dyf6uIvEqJRd46jemmyA4iFIeVk8g==