mirror of https://github.com/microsoft/FLAML.git (synced 2026-02-14 12:49:17 +08:00)

Compare commits: 49 commits

Commit SHA1s (the Author and Date columns of the table were not captured):
13aec414ea, bb16dcde93, be81a76da9, 2d16089529, 01c3c83653, 9b66103f7c,
48dfd72e64, dec92e5b02, 22911ea1ef, 12183e5f73, c2b25310fc, 0f9420590d,
5107c506b4, 9e219ef8dc, 6e4083743b, 17e95edd9e, 468bc62d27, 437c239c11,
8e753f1092, a3b57e11d4, a80dcf9925, 7157af44e0, 1798c4591e, dd26263330,
2ba5f8bed1, d0a11958a5, 0ef9b00a75, 840f76e5e5, d8b7d25b80, 6d53929803,
c038fbca07, 6a99202492, 42d1dcfa0e, b83c8a7d3b, b9194cdcf2, 9a1f6b0291,
07f4413aae, 5a74227bc3, 7644958e21, a316f84fe1, 72881d3a2b, 69da685d1e,
c01c3910eb, 98d3fd2f48, 9724c626cc, 0d92400200, d224218ecf, a2a5e1abb9,
5c0f18b7bc
.github/workflows/CD.yml (vendored, 21 changed lines)

@@ -12,26 +12,17 @@ jobs:
   deploy:
     strategy:
       matrix:
-        os: ['ubuntu-latest']
-        python-version: [3.8]
+        os: ["ubuntu-latest"]
+        python-version: ["3.10"]
     runs-on: ${{ matrix.os }}
     environment: package
     steps:
       - name: Checkout
-        uses: actions/checkout@v3
-      - name: Cache conda
-        uses: actions/cache@v3
+        uses: actions/checkout@v4
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
         with:
-          path: ~/conda_pkgs_dir
-          key: conda-${{ matrix.os }}-python-${{ matrix.python-version }}-${{ hashFiles('environment.yml') }}
-      - name: Setup Miniconda
-        uses: conda-incubator/setup-miniconda@v2
-        with:
-          auto-update-conda: true
-          auto-activate-base: false
-          activate-environment: hcrystalball
           python-version: ${{ matrix.python-version }}
-          use-only-tar-bz2: true
       - name: Install from source
         # This is required for the pre-commit tests
         shell: pwsh
@@ -42,7 +33,7 @@ jobs:
       - name: Build
         shell: pwsh
         run: |
-          pip install twine
+          pip install twine wheel setuptools
           python setup.py sdist bdist_wheel
       - name: Publish to PyPI
         env:
.github/workflows/deploy-website.yml (vendored, 8 changed lines)

@@ -37,11 +37,11 @@ jobs:
       - name: setup python
         uses: actions/setup-python@v4
         with:
-          python-version: "3.8"
+          python-version: "3.10"
       - name: pydoc-markdown install
         run: |
           python -m pip install --upgrade pip
-          pip install pydoc-markdown==4.5.0
+          pip install pydoc-markdown==4.7.0
       - name: pydoc-markdown run
         run: |
           pydoc-markdown
@@ -73,11 +73,11 @@ jobs:
       - name: setup python
         uses: actions/setup-python@v4
         with:
-          python-version: "3.8"
+          python-version: "3.10"
       - name: pydoc-markdown install
         run: |
           python -m pip install --upgrade pip
-          pip install pydoc-markdown==4.5.0
+          pip install pydoc-markdown==4.7.0
       - name: pydoc-markdown run
         run: |
           pydoc-markdown
.github/workflows/python-package.yml (vendored, 16 changed lines)

@@ -14,6 +14,12 @@ on:
       - 'setup.py'
   pull_request:
     branches: ['main']
+    paths:
+      - 'flaml/**'
+      - 'test/**'
+      - 'notebook/**'
+      - '.github/workflows/python-package.yml'
+      - 'setup.py'
   merge_group:
     types: [checks_requested]

@@ -29,8 +35,8 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [ubuntu-latest, macos-latest, windows-2019]
-        python-version: ["3.8", "3.9", "3.10", "3.11"]
+        os: [ubuntu-latest, macos-latest, windows-latest]
+        python-version: ["3.9", "3.10", "3.11"]
     steps:
       - uses: actions/checkout@v4
       - name: Set up Python ${{ matrix.python-version }}
@@ -50,7 +56,7 @@ jobs:
           export LDFLAGS="$LDFLAGS -Wl,-rpath,/usr/local/opt/libomp/lib -L/usr/local/opt/libomp/lib -lomp"
       - name: Install packages and dependencies
         run: |
-          python -m pip install --upgrade pip wheel
+          python -m pip install --upgrade pip wheel setuptools
          pip install -e .
          python -c "import flaml"
          pip install -e .[test]
@@ -85,12 +91,12 @@ jobs:
      - name: Test with pytest
        if: matrix.python-version != '3.10'
        run: |
-          pytest test
+          pytest test/ --ignore=test/autogen
      - name: Coverage
        if: matrix.python-version == '3.10'
        run: |
          pip install coverage
-          coverage run -a -m pytest test
+          coverage run -a -m pytest test --ignore=test/autogen
          coverage xml
      - name: Upload coverage to Codecov
        if: matrix.python-version == '3.10'
@@ -1,5 +1,5 @@
 # basic setup
-FROM mcr.microsoft.com/devcontainers/python:3.8
+FROM mcr.microsoft.com/devcontainers/python:3.10
 RUN apt-get update && apt-get -y update
 RUN apt-get install -y sudo git npm

@@ -40,7 +40,7 @@ FLAML has a .NET implementation in [ML.NET](http://dot.net/ml), an open-source,

 ## Installation

-FLAML requires **Python version >= 3.8**. It can be installed from pip:
+FLAML requires **Python version >= 3.9**. It can be installed from pip:

 ```bash
 pip install flaml
@@ -1,4 +1,5 @@
 import logging
+import warnings

 try:
     from flaml.automl import AutoML, logger_formatter
@@ -12,7 +13,8 @@ from flaml.version import __version__

 # Set the root logger.
 logger = logging.getLogger(__name__)
-logger.setLevel(logging.INFO)
+if logger.level == logging.NOTSET:
+    logger.setLevel(logging.INFO)

 if not has_automl:
-    logger.warning("flaml.automl is not available. Please install flaml[automl] to enable AutoML functionalities.")
+    warnings.warn("flaml.automl is not available. Please install flaml[automl] to enable AutoML functionalities.")
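The NOTSET guard changes observable behavior: a level configured by the embedding application before `import flaml` is no longer overwritten. A minimal sketch (standard-library logging only, nothing assumed beyond the hunk above):

```python
import logging

# Configure the "flaml" logger before importing flaml.
logging.getLogger("flaml").setLevel(logging.WARNING)

import flaml  # noqa: E402  # the NOTSET guard keeps WARNING instead of forcing INFO

print(logging.getLogger("flaml").level)  # 30 (WARNING); previously reset to 20 (INFO)
```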
@@ -156,7 +156,7 @@ class MathUserProxyAgent(UserProxyAgent):
                 when the number of auto reply reaches the max_consecutive_auto_reply or when is_termination_msg is True.
             default_auto_reply (str or dict or None): the default auto reply message when no code execution or llm based reply is generated.
             max_invalid_q_per_step (int): (ADDED) the maximum number of invalid queries per step.
-            **kwargs (dict): other kwargs in [UserProxyAgent](user_proxy_agent#__init__).
+            **kwargs (dict): other kwargs in [UserProxyAgent](../user_proxy_agent#__init__).
         """
         super().__init__(
             name=name,

@@ -123,7 +123,7 @@ class RetrieveUserProxyAgent(UserProxyAgent):
                 can be found at `https://www.sbert.net/docs/pretrained_models.html`. The default model is a
                 fast model. If you want to use a high performance model, `all-mpnet-base-v2` is recommended.
             - customized_prompt (Optional, str): the customized prompt for the retrieve chat. Default is None.
-            **kwargs (dict): other kwargs in [UserProxyAgent](user_proxy_agent#__init__).
+            **kwargs (dict): other kwargs in [UserProxyAgent](../user_proxy_agent#__init__).
         """
         super().__init__(
             name=name,

@@ -10,6 +10,7 @@ import os
 import random
 import sys
 import time
+from concurrent.futures import as_completed
 from functools import partial
 from typing import Callable, List, Optional, Union
@@ -187,9 +188,16 @@ class AutoML(BaseEstimator):
         mem_thres: A float of the memory size constraint in bytes.
         pred_time_limit: A float of the prediction latency constraint in seconds.
             It refers to the average prediction time per row in validation data.
-        train_time_limit: A float of the training time constraint in seconds.
+        train_time_limit: None or a float of the training time constraint in seconds for each trial.
+            Only valid for sequential search.
         verbose: int, default=3 | Controls the verbosity, higher means more
             messages.
+            verbose=0: logger level = CRITICAL
+            verbose=1: logger level = ERROR
+            verbose=2: logger level = WARNING
+            verbose=3: logger level = INFO
+            verbose=4: logger level = DEBUG
+            verbose>5: logger level = NOTSET
         retrain_full: bool or str, default=True | whether to retrain the
             selected model on the full training data when using holdout.
             True - retrain only after search finishes; False - no retraining;
@@ -203,7 +211,7 @@ class AutoML(BaseEstimator):
             * Valid str options depend on different tasks.
             For classification tasks, valid choices are
             ["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
-            For regression tasks, valid choices are ["auto", 'uniform', 'time'].
+            For regression tasks, valid choices are ["auto", 'uniform', 'time', 'group'].
             "auto" -> uniform.
             For time series forecast tasks, must be "auto" or 'time'.
             For ranking task, must be "auto" or 'group'.
@@ -424,6 +432,8 @@ class AutoML(BaseEstimator):
         If `model_history` was set to True, then the returned model is trained.
         """
         state = self._search_states.get(estimator_name)
+        if state and estimator_name == self._best_estimator:
+            return self.model
         return state and getattr(state, "trained_estimator", None)

     @property
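A short usage sketch of the early return added above (dataset and budget are arbitrary): for the winning learner, `best_model_for_estimator` now answers with the retrained `self.model` rather than the last search checkpoint.

```python
from sklearn.datasets import load_iris
from flaml import AutoML

X, y = load_iris(return_X_y=True)
automl = AutoML()
automl.fit(X, y, task="classification", time_budget=10, estimator_list=["lgbm", "rf"])

# For automl.best_estimator this returns automl.model (retrained on full data);
# for any other learner it still returns that learner's best trial estimator.
print(automl.best_estimator)
print(automl.best_model_for_estimator(automl.best_estimator))
```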
@@ -739,7 +749,7 @@ class AutoML(BaseEstimator):
             * Valid str options depend on different tasks.
             For classification tasks, valid choices are
             ["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
-            For regression tasks, valid choices are ["auto", 'uniform', 'time'].
+            For regression tasks, valid choices are ["auto", 'uniform', 'time', 'group'].
             "auto" -> uniform.
             For time series forecast tasks, must be "auto" or 'time'.
             For ranking task, must be "auto" or 'group'.
@@ -1332,7 +1342,8 @@ class AutoML(BaseEstimator):
         mem_thres: A float of the memory size constraint in bytes.
         pred_time_limit: A float of the prediction latency constraint in seconds.
             It refers to the average prediction time per row in validation data.
-        train_time_limit: None or a float of the training time constraint in seconds.
+        train_time_limit: None or a float of the training time constraint in seconds for each trial.
+            Only valid for sequential search.
         X_val: None or a numpy array or a pandas dataframe of validation data.
         y_val: None or a numpy array or a pandas series of validation labels.
         sample_weight_val: None or a numpy array of the sample weight of
@@ -1345,6 +1356,12 @@ class AutoML(BaseEstimator):
             for training data.
         verbose: int, default=3 | Controls the verbosity, higher means more
             messages.
+            verbose=0: logger level = CRITICAL
+            verbose=1: logger level = ERROR
+            verbose=2: logger level = WARNING
+            verbose=3: logger level = INFO
+            verbose=4: logger level = DEBUG
+            verbose>5: logger level = NOTSET
         retrain_full: bool or str, default=True | whether to retrain the
             selected model on the full training data when using holdout.
             True - retrain only after search finishes; False - no retraining;
@@ -1358,7 +1375,7 @@ class AutoML(BaseEstimator):
             * Valid str options depend on different tasks.
             For classification tasks, valid choices are
             ["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
-            For regression tasks, valid choices are ["auto", 'uniform', 'time'].
+            For regression tasks, valid choices are ["auto", 'uniform', 'time', 'group'].
             "auto" -> uniform.
             For time series forecast tasks, must be "auto" or 'time'.
             For ranking task, must be "auto" or 'group'.
@@ -1623,6 +1640,13 @@ class AutoML(BaseEstimator):
             _ch.setFormatter(logger_formatter)
             logger.addHandler(_ch)

+        if model_history:
+            logger.warning(
+                "With `model_history` set to `True` by default, all intermediate models are retained in memory, "
+                "which may significantly increase memory usage and slow down training. "
+                "Consider setting `model_history=False` to optimize memory and accelerate the training process."
+            )
+
         if not use_ray and not use_spark and n_concurrent_trials > 1:
             if ray_available:
                 logger.warning(
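To silence the new warning (and drop the per-trial models), pass the flag explicitly; a minimal sketch:

```python
from sklearn.datasets import make_classification
from flaml import AutoML

X, y = make_classification(n_samples=200, random_state=0)
automl = AutoML()
# Only the final best model is kept; intermediate trial models are discarded.
automl.fit(X, y, task="classification", time_budget=5, model_history=False)
```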
@@ -1708,7 +1732,7 @@ class AutoML(BaseEstimator):
                 if not (mlflow.active_run() is not None or is_autolog_enabled()):
                     self.mlflow_integration.only_history = True
             except KeyError:
-                print("Not in Fabric, Skipped")
+                logger.info("Not in Fabric, Skipped")
         task.validate_data(
             self,
             self._state,
@@ -2529,6 +2553,21 @@ class AutoML(BaseEstimator):
             self._selected = state = self._search_states[estimator]
             state.best_config_sample_size = self._state.data_size[0]
             state.best_config = state.init_config[0] if state.init_config else {}
+            self._track_iter = 0
+            self._config_history[self._track_iter] = (estimator, state.best_config, self._state.time_from_start)
+            self._best_iteration = self._track_iter
+            state.val_loss = getattr(state, "val_loss", float("inf"))
+            state.best_loss = getattr(state, "best_loss", float("inf"))
+            state.config = getattr(state, "config", state.best_config.copy())
+            state.metric_for_logging = getattr(state, "metric_for_logging", None)
+            state.sample_size = getattr(state, "sample_size", self._state.data_size[0])
+            state.learner_class = getattr(state, "learner_class", self._state.learner_classes.get(estimator))
+            if hasattr(self, "mlflow_integration") and self.mlflow_integration:
+                self.mlflow_integration.record_state(
+                    automl=self,
+                    search_state=state,
+                    estimator=estimator,
+                )
         elif self._use_ray is False and self._use_spark is False:
             self._search_sequential()
         else:
@@ -2700,16 +2739,47 @@ class AutoML(BaseEstimator):
                 ):
                     if mlflow.active_run() is None:
                         mlflow.start_run(run_id=self.mlflow_integration.parent_run_id)
-                    self.mlflow_integration.log_model(
-                        self._trained_estimator.model,
-                        self.best_estimator,
-                        signature=self.estimator_signature,
-                    )
-                    self.mlflow_integration.pickle_and_log_automl_artifacts(
-                        self, self.model, self.best_estimator, signature=self.pipeline_signature
-                    )
+                    if self.best_estimator.endswith("_spark"):
+                        self.mlflow_integration.log_model(
+                            self._trained_estimator.model,
+                            self.best_estimator,
+                            signature=self.estimator_signature,
+                            run_id=self.mlflow_integration.parent_run_id,
+                        )
+                    else:
+                        self.mlflow_integration.pickle_and_log_automl_artifacts(
+                            self,
+                            self.model,
+                            self.best_estimator,
+                            signature=self.pipeline_signature,
+                            run_id=self.mlflow_integration.parent_run_id,
+                        )
         else:
-            logger.info("not retraining because the time budget is too small.")
+            logger.warning("not retraining because the time budget is too small.")
+        self.wait_futures()
+
+    def wait_futures(self):
+        if self.mlflow_integration is not None:
+            logger.debug("Collecting results from submitted record_state tasks")
+            t1 = time.perf_counter()
+            for future in as_completed(self.mlflow_integration.futures):
+                _task = self.mlflow_integration.futures[future]
+                try:
+                    result = future.result()
+                    logger.debug(f"Result for record_state task {_task}: {result}")
+                except Exception as e:
+                    logger.warning(f"Exception for record_state task {_task}: {e}")
+            for future in as_completed(self.mlflow_integration.futures_log_model):
+                _task = self.mlflow_integration.futures_log_model[future]
+                try:
+                    result = future.result()
+                    logger.debug(f"Result for log_model task {_task}: {result}")
+                except Exception as e:
+                    logger.warning(f"Exception for log_model task {_task}: {e}")
+            t2 = time.perf_counter()
+            logger.debug(f"Collecting results from tasks submitted to executors costs {t2-t1} seconds.")
+        else:
+            logger.debug("No futures to wait for.")

     def __del__(self):
         if (
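`wait_futures` drains two dicts that map futures to task labels. A self-contained sketch of the same pattern (standard library only; `work` stands in for the record_state/log_model tasks):

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

def work(n):
    if n == 2:
        raise ValueError("boom")
    return n * n

with ThreadPoolExecutor(max_workers=4) as pool:
    # future -> label, mirroring self.mlflow_integration.futures
    futures = {pool.submit(work, n): f"task-{n}" for n in range(4)}
    for future in as_completed(futures):
        label = futures[future]
        try:
            print(label, "->", future.result())
        except Exception as e:  # failures are reported, not re-raised
            print(label, "failed:", e)
```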
@@ -1,7 +1,7 @@
 try:
     from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
-except ImportError:
-    pass
+except ImportError as e:
+    print(f"scikit-learn is required for HistGradientBoostingEstimator. Please install it; error: {e}")

 from flaml import tune
 from flaml.automl.model import SKLearnEstimator

@@ -2,13 +2,17 @@
 # * Copyright (c) Microsoft Corporation. All rights reserved.
 # * Licensed under the MIT License. See LICENSE file in the
 # * project root for license information.
+import json
 import os
-from datetime import datetime
+import random
+import uuid
+from datetime import datetime, timedelta
+from decimal import ROUND_HALF_UP, Decimal
 from typing import TYPE_CHECKING, Union

 import numpy as np

-from flaml.automl.spark import DataFrame, Series, pd, ps, psDataFrame, psSeries
+from flaml.automl.spark import DataFrame, F, Series, T, pd, ps, psDataFrame, psSeries
 from flaml.automl.training_log import training_log_reader

 try:
@@ -19,6 +23,7 @@ except ImportError:
 if TYPE_CHECKING:
     from flaml.automl.task import Task

+
 TS_TIMESTAMP_COL = "ds"
 TS_VALUE_COL = "y"
@@ -293,7 +298,7 @@ class DataTransformer:
             y = y.rename(TS_VALUE_COL)
         for column in X.columns:
             # sklearn\utils\validation.py needs int/float values
-            if X[column].dtype.name in ("object", "category"):
+            if X[column].dtype.name in ("object", "category", "string"):
                 if X[column].nunique() == 1 or X[column].nunique(dropna=True) == n - X[column].isnull().sum():
                     X.drop(columns=column, inplace=True)
                     drop = True
@@ -445,3 +450,331 @@ class DataTransformer:
 def group_counts(groups):
     _, i, c = np.unique(groups, return_counts=True, return_index=True)
     return c[np.argsort(i)]
+
+
+def get_random_dataframe(n_rows: int = 200, ratio_none: float = 0.1, seed: int = 42) -> DataFrame:
+    """Generate a random pandas DataFrame with various data types for testing.
+
+    This function creates a DataFrame with multiple column types including:
+    - Timestamps
+    - Integers
+    - Floats
+    - Categorical values
+    - Booleans
+    - Lists (tags)
+    - Decimal strings
+    - UUIDs
+    - Binary data (as hex strings)
+    - JSON blobs
+    - Nullable text fields
+
+    Parameters
+    ----------
+    n_rows : int, default=200
+        Number of rows in the generated DataFrame
+    ratio_none : float, default=0.1
+        Probability of generating None values in applicable columns
+    seed : int, default=42
+        Random seed for reproducibility
+
+    Returns
+    -------
+    pd.DataFrame
+        A DataFrame with 14 columns of various data types
+
+    Examples
+    --------
+    >>> df = get_random_dataframe(100, 0.05, 123)
+    >>> df.shape
+    (100, 14)
+    >>> df.dtypes
+    timestamp        datetime64[ns]
+    id                        int64
+    score                   float64
+    status                   object
+    flag                     object
+    count                    object
+    value                    object
+    tags                     object
+    rating                   object
+    uuid                     object
+    binary                   object
+    json_blob                object
+    category               category
+    nullable_text            object
+    dtype: object
+    """
+
+    np.random.seed(seed)
+    random.seed(seed)
+
+    def random_tags():
+        tags = ["AI", "ML", "data", "robotics", "vision"]
+        return random.sample(tags, k=random.randint(1, 3)) if random.random() > ratio_none else None
+
+    def random_decimal():
+        return (
+            str(Decimal(random.uniform(1, 5)).quantize(Decimal("0.01"), rounding=ROUND_HALF_UP))
+            if random.random() > ratio_none
+            else None
+        )
+
+    def random_json_blob():
+        blob = {"a": random.randint(1, 10), "b": random.random()}
+        return json.dumps(blob) if random.random() > ratio_none else None
+
+    def random_binary():
+        return bytes(random.randint(0, 255) for _ in range(4)).hex() if random.random() > ratio_none else None
+
+    data = {
+        "timestamp": [
+            datetime(2020, 1, 1) + timedelta(days=np.random.randint(0, 1000)) if np.random.rand() > ratio_none else None
+            for _ in range(n_rows)
+        ],
+        "id": range(1, n_rows + 1),
+        "score": np.random.uniform(0, 100, n_rows),
+        "status": np.random.choice(
+            ["active", "inactive", "pending", None],
+            size=n_rows,
+            p=[(1 - ratio_none) / 3, (1 - ratio_none) / 3, (1 - ratio_none) / 3, ratio_none],
+        ),
+        "flag": np.random.choice(
+            [True, False, None], size=n_rows, p=[(1 - ratio_none) / 2, (1 - ratio_none) / 2, ratio_none]
+        ),
+        "count": [np.random.randint(0, 100) if np.random.rand() > ratio_none else None for _ in range(n_rows)],
+        "value": [round(np.random.normal(50, 15), 2) if np.random.rand() > ratio_none else None for _ in range(n_rows)],
+        "tags": [random_tags() for _ in range(n_rows)],
+        "rating": [random_decimal() for _ in range(n_rows)],
+        "uuid": [str(uuid.uuid4()) if np.random.rand() > ratio_none else None for _ in range(n_rows)],
+        "binary": [random_binary() for _ in range(n_rows)],
+        "json_blob": [random_json_blob() for _ in range(n_rows)],
+        "category": pd.Categorical(
+            np.random.choice(
+                ["A", "B", "C", None],
+                size=n_rows,
+                p=[(1 - ratio_none) / 3, (1 - ratio_none) / 3, (1 - ratio_none) / 3, ratio_none],
+            )
+        ),
+        "nullable_text": [random.choice(["Good", "Bad", "Average", None]) for _ in range(n_rows)],
+    }
+
+    return pd.DataFrame(data)
+
+
+def auto_convert_dtypes_spark(
+    df: psDataFrame,
+    na_values: list = None,
+    category_threshold: float = 0.3,
+    convert_threshold: float = 0.6,
+    sample_ratio: float = 0.1,
+) -> tuple[psDataFrame, dict]:
+    """Automatically convert data types in a PySpark DataFrame using heuristics.
+
+    This function analyzes a sample of the DataFrame to infer appropriate data types
+    and applies the conversions. It handles timestamps, numeric values, booleans,
+    and categorical fields.
+
+    Args:
+        df: A PySpark DataFrame to convert.
+        na_values: List of strings to be considered as NA/NaN. Defaults to
+            ['NA', 'na', 'NULL', 'null', ''].
+        category_threshold: Maximum ratio of unique values to total values
+            to consider a column categorical. Defaults to 0.3.
+        convert_threshold: Minimum ratio of successfully converted values required
+            to apply a type conversion. Defaults to 0.6.
+        sample_ratio: Fraction of data to sample for type inference. Defaults to 0.1.
+
+    Returns:
+        tuple: (The DataFrame with converted types, A dictionary mapping column names to
+            their inferred types as strings)
+
+    Note:
+        - 'category' in the schema dict is conceptual as PySpark doesn't have a true
+          category type like pandas
+        - The function uses sampling for efficiency with large datasets
+    """
+    n_rows = df.count()
+    if na_values is None:
+        na_values = ["NA", "na", "NULL", "null", ""]
+
+    # Normalize NA-like values
+    for colname, coltype in df.dtypes:
+        if coltype == "string":
+            df = df.withColumn(
+                colname,
+                F.when(F.trim(F.lower(F.col(colname))).isin([v.lower() for v in na_values]), None).otherwise(
+                    F.col(colname)
+                ),
+            )
+
+    schema = {}
+    for colname in df.columns:
+        # Sample once at an appropriate ratio
+        sample_ratio_to_use = min(1.0, sample_ratio if n_rows * sample_ratio > 100 else 100 / n_rows)
+        col_sample = df.select(colname).sample(withReplacement=False, fraction=sample_ratio_to_use).dropna()
+        sample_count = col_sample.count()
+
+        inferred_type = "string"  # Default
+
+        if col_sample.dtypes[0][1] != "string":
+            schema[colname] = col_sample.dtypes[0][1]
+            continue
+
+        if sample_count == 0:
+            schema[colname] = "string"
+            continue
+
+        # Check if timestamp
+        ts_col = col_sample.withColumn("parsed", F.to_timestamp(F.col(colname)))
+
+        # Check numeric
+        if (
+            col_sample.withColumn("n", F.col(colname).cast("double")).filter("n is not null").count()
+            >= sample_count * convert_threshold
+        ):
+            # All whole numbers?
+            all_whole = (
+                col_sample.withColumn("n", F.col(colname).cast("double"))
+                .filter("n is not null")
+                .withColumn("frac", F.abs(F.col("n") % 1))
+                .filter("frac > 0.000001")
+                .count()
+                == 0
+            )
+            inferred_type = "int" if all_whole else "double"
+
+        # Check low-cardinality (category-like)
+        elif (
+            sample_count > 0
+            and col_sample.select(F.countDistinct(F.col(colname))).collect()[0][0] / sample_count <= category_threshold
+        ):
+            inferred_type = "category"  # Will just be string, but marked as such
+
+        # Check if timestamp
+        elif ts_col.filter(F.col("parsed").isNotNull()).count() >= sample_count * convert_threshold:
+            inferred_type = "timestamp"
+
+        schema[colname] = inferred_type
+
+    # Apply inferred schema
+    for colname, inferred_type in schema.items():
+        if inferred_type == "int":
+            df = df.withColumn(colname, F.col(colname).cast(T.IntegerType()))
+        elif inferred_type == "double":
+            df = df.withColumn(colname, F.col(colname).cast(T.DoubleType()))
+        elif inferred_type == "boolean":
+            df = df.withColumn(
+                colname,
+                F.when(F.lower(F.col(colname)).isin("true", "yes", "1"), True)
+                .when(F.lower(F.col(colname)).isin("false", "no", "0"), False)
+                .otherwise(None),
+            )
+        elif inferred_type == "timestamp":
+            df = df.withColumn(colname, F.to_timestamp(F.col(colname)))
+        elif inferred_type == "category":
+            df = df.withColumn(colname, F.col(colname).cast(T.StringType()))  # Marked conceptually
+        # otherwise keep as string (or original type)
+
+    return df, schema
+
+
+def auto_convert_dtypes_pandas(
+    df: DataFrame,
+    na_values: list = None,
+    category_threshold: float = 0.3,
+    convert_threshold: float = 0.6,
+    sample_ratio: float = 1.0,
+) -> tuple[DataFrame, dict]:
+    """Automatically convert data types in a pandas DataFrame using heuristics.
+
+    This function analyzes the DataFrame to infer appropriate data types
+    and applies the conversions. It handles timestamps, timedeltas, numeric values,
+    and categorical fields.
+
+    Args:
+        df: A pandas DataFrame to convert.
+        na_values: List of strings to be considered as NA/NaN. Defaults to
+            ['NA', 'na', 'NULL', 'null', ''].
+        category_threshold: Maximum ratio of unique values to total values
+            to consider a column categorical. Defaults to 0.3.
+        convert_threshold: Minimum ratio of successfully converted values required
+            to apply a type conversion. Defaults to 0.6.
+        sample_ratio: Fraction of data to sample for type inference. Not used in pandas version
+            but included for API compatibility. Defaults to 1.0.
+
+    Returns:
+        tuple: (The DataFrame with converted types, A dictionary mapping column names to
+            their inferred types as strings)
+    """
+    if na_values is None:
+        na_values = {"NA", "na", "NULL", "null", ""}
+
+    df_converted = df.convert_dtypes()
+    schema = {}
+
+    # Sample if needed (for API compatibility)
+    if sample_ratio < 1.0:
+        df = df.sample(frac=sample_ratio)
+
+    n_rows = len(df)
+
+    for col in df.columns:
+        series = df[col]
+        # Replace NA-like values if string
+        series_cleaned = series.map(lambda x: np.nan if isinstance(x, str) and x.strip() in na_values else x)
+
+        # Skip conversion if already non-object data type, except bool which can potentially be categorical
+        if (
+            not isinstance(series_cleaned.dtype, pd.BooleanDtype)
+            and not isinstance(series_cleaned.dtype, pd.StringDtype)
+            and series_cleaned.dtype != "object"
+        ):
+            # Keep the original data type for non-object dtypes
+            df_converted[col] = series
+            schema[col] = str(series_cleaned.dtype)
+            continue
+
+        # print(f"type: {series_cleaned.dtype}, column: {series_cleaned.name}")
+
+        if not isinstance(series_cleaned.dtype, pd.BooleanDtype):
+            # Try numeric (int or float)
+            numeric = pd.to_numeric(series_cleaned, errors="coerce")
+            if numeric.notna().sum() >= n_rows * convert_threshold:
+                if (numeric.dropna() % 1 == 0).all():
+                    try:
+                        df_converted[col] = numeric.astype("int")  # Nullable integer
+                        schema[col] = "int"
+                        continue
+                    except Exception:
+                        pass
+                df_converted[col] = numeric.astype("double")
+                schema[col] = "double"
+                continue
+
+            # Try datetime
+            datetime_converted = pd.to_datetime(series_cleaned, errors="coerce")
+            if datetime_converted.notna().sum() >= n_rows * convert_threshold:
+                df_converted[col] = datetime_converted
+                schema[col] = "timestamp"
+                continue
+
+            # Try timedelta
+            try:
+                timedelta_converted = pd.to_timedelta(series_cleaned, errors="coerce")
+                if timedelta_converted.notna().sum() >= n_rows * convert_threshold:
+                    df_converted[col] = timedelta_converted
+                    schema[col] = "timedelta"
+                    continue
+            except TypeError:
+                pass
+
+        # Try category
+        try:
+            unique_ratio = series_cleaned.nunique(dropna=True) / n_rows if n_rows > 0 else 1.0
+            if unique_ratio <= category_threshold:
+                df_converted[col] = series_cleaned.astype("category")
+                schema[col] = "category"
+                continue
+        except Exception:
+            pass
+        df_converted[col] = series_cleaned.astype("string")
+        schema[col] = "string"
+
+    return df_converted, schema
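A quick sketch of how the pandas helper behaves on string-typed input (the thresholds are chosen so the toy `grade` column qualifies as categorical; exact schema values may differ on other data):

```python
import pandas as pd
from flaml.automl.data import auto_convert_dtypes_pandas

df = pd.DataFrame(
    {
        "when": ["2021-01-01", "2021-06-15", "NA", "2022-03-09"],  # "NA" is normalized to NaN
        "qty": ["1", "2", "3", "4"],
        "grade": ["A", "B", "A", "A"],
    }
)
converted, schema = auto_convert_dtypes_pandas(df, category_threshold=0.5)
print(schema)  # e.g. {'when': 'timestamp', 'qty': 'int', 'grade': 'category'}
```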
@@ -1,7 +1,37 @@
 import logging
+import os
+
+
+class ColoredFormatter(logging.Formatter):
+    # ANSI escape codes for colors
+    COLORS = {
+        # logging.DEBUG: "\033[36m",  # Cyan
+        # logging.INFO: "\033[32m",  # Green
+        logging.WARNING: "\033[33m",  # Yellow
+        logging.ERROR: "\033[31m",  # Red
+        logging.CRITICAL: "\033[1;31m",  # Bright Red
+    }
+    RESET = "\033[0m"  # Reset to default
+
+    def __init__(self, fmt, datefmt, use_color=True):
+        super().__init__(fmt, datefmt)
+        self.use_color = use_color
+
+    def format(self, record):
+        formatted = super().format(record)
+        if self.use_color:
+            color = self.COLORS.get(record.levelno, "")
+            if color:
+                return f"{color}{formatted}{self.RESET}"
+        return formatted


 logger = logging.getLogger(__name__)
-logger_formatter = logging.Formatter(
-    "[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s", "%m-%d %H:%M:%S"
+use_color = True
+if os.getenv("FLAML_LOG_NO_COLOR"):
+    use_color = False
+
+logger_formatter = ColoredFormatter(
+    "[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s", "%m-%d %H:%M:%S", use_color
 )
 logger.propagate = False
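A minimal sketch wiring the new formatter into an ordinary handler (assuming the module path is `flaml.automl.logger`; `FLAML_LOG_NO_COLOR` only affects flaml's own formatter instance):

```python
import logging
from flaml.automl.logger import ColoredFormatter

handler = logging.StreamHandler()
handler.setFormatter(
    ColoredFormatter(
        "[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s",
        "%m-%d %H:%M:%S",
        use_color=True,
    )
)
log = logging.getLogger("demo")
log.setLevel(logging.INFO)
log.addHandler(handler)
log.warning("wrapped in yellow ANSI codes")  # WARNING/ERROR/CRITICAL are colored
log.info("printed as-is")                    # INFO has no entry in COLORS
```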
@@ -9,6 +9,7 @@ import os
 import shutil
 import signal
 import sys
+import threading
 import time
 import warnings
 from contextlib import contextmanager
@@ -89,21 +90,25 @@ def limit_resource(memory_limit, time_limit):
     except ValueError:
         # According to https://bugs.python.org/issue40518, it's a mac-specific error.
         pass
-    main_thread = False
-    if time_limit is not None:
+    alarm_set = False
+    if time_limit is not None and threading.current_thread() is threading.main_thread():
         try:
             signal.signal(signal.SIGALRM, TimeoutHandler)
             signal.alarm(int(time_limit) or 1)
-            main_thread = True
+            alarm_set = True
         except ValueError:
             pass

     try:
         yield
     finally:
-        if main_thread:
+        if alarm_set:
             signal.alarm(0)
         if memory_limit > 0:
-            resource.setrlimit(resource.RLIMIT_AS, (soft, hard))
+            try:
+                resource.setrlimit(resource.RLIMIT_AS, (soft, hard))
+            except ValueError:
+                pass


 class BaseEstimator:
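Why the guard matters, as a standalone POSIX-only sketch: signal handlers can only be installed from the main thread, which is exactly the ValueError the new `threading.current_thread()` check sidesteps:

```python
import signal
import threading

def try_arm_alarm():
    try:
        signal.signal(signal.SIGALRM, lambda signum, frame: None)
        signal.alarm(0)  # immediately cancel; we only probe installability
        print("alarm armed (main thread)")
    except ValueError as e:
        print("refused:", e)

try_arm_alarm()  # works in the main thread
worker = threading.Thread(target=try_arm_alarm)
worker.start()   # prints: refused: signal only works in main thread of the main interpreter
worker.join()
```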
@@ -130,7 +135,7 @@ class BaseEstimator:
         self._task = task if isinstance(task, Task) else task_factory(task, None, None)
         self.params = self.config2params(config)
         self.estimator_class = self._model = None
-        if "_estimator_type" in config:
+        if "_estimator_type" in self.params:
             self._estimator_type = self.params.pop("_estimator_type")
         else:
             self._estimator_type = "classifier" if self._task.is_classification() else "regressor"
@@ -157,7 +162,25 @@ class BaseEstimator:

     @property
     def estimator(self):
-        """Trained model after fit() is called, or None before fit() is called."""
+        """
+        Get the best trained estimator model.
+
+        Returns:
+            object or None: The trained model obtained after calling the `fit()` method,
+            representing the best estimator found during the training process. If `fit()` has
+            not been called yet, it returns `None`.
+
+        Examples:
+            >>> from flaml import AutoML
+            >>> automl = AutoML()
+            >>> automl.fit(X_train, y_train)
+            >>> best_estimator = automl.model.estimator
+            >>> print(best_estimator)
+            RandomForestClassifier()
+
+        Note:
+            To access the best estimator, use `automl.model.estimator`.
+        """
         return self._model

     @property
@@ -249,9 +272,11 @@ class BaseEstimator:
         mem = psutil.virtual_memory() if psutil is not None else None
         try:
             with limit_resource(
-                mem.available * (1 - free_mem_ratio) + psutil.Process(os.getpid()).memory_info().rss
-                if mem is not None
-                else -1,
+                (
+                    mem.available * (1 - free_mem_ratio) + psutil.Process(os.getpid()).memory_info().rss
+                    if mem is not None
+                    else -1
+                ),
                 budget,
             ):
                 train_time = self._fit(X_train, y_train, **kwargs)
@@ -1534,12 +1559,16 @@ class LGBMEstimator(BaseEstimator):
         if n_iter > 1:
             max_iter = min(
                 n_iter,
-                int((budget - time.time() + start_time - self._t1) / self._time_per_iter + 1)
-                if budget is not None
-                else n_iter,
-                int((1 - free_mem_ratio) * mem0 / self._mem_per_iter)
-                if psutil is not None and self._mem_per_iter > 0
-                else n_iter,
+                (
+                    int((budget - time.time() + start_time - self._t1) / self._time_per_iter + 1)
+                    if budget is not None
+                    else n_iter
+                ),
+                (
+                    int((1 - free_mem_ratio) * mem0 / self._mem_per_iter)
+                    if psutil is not None and self._mem_per_iter > 0
+                    else n_iter
+                ),
             )
             if trained and max_iter <= self.params[self.ITER_HP]:
                 return time.time() - start_time
@@ -1561,18 +1590,17 @@ class LGBMEstimator(BaseEstimator):
                 callbacks = None
             if callbacks is None:
                 self._fit(X_train, y_train, **kwargs)
             else:
                 self._fit(X_train, y_train, callbacks=callbacks, **kwargs)
-            if callbacks is None:
+                # for xgboost>=1.6.0, pop callbacks to enable pickle
+                callbacks = self.params.pop("callbacks")
+                self._model.set_params(callbacks=callbacks[:-1])
-            else:
-                self._fit(X_train, y_train, callbacks=callbacks, **kwargs)
             best_iteration = (
                 getattr(self._model.get_booster(), "best_iteration", None)
                 if isinstance(self, XGBoostSklearnEstimator)
                 else self._model.best_iteration_
             )
-            if best_iteration is not None:
+            if best_iteration is not None and best_iteration > 0:
                 self._model.set_params(n_estimators=best_iteration + 1)
         else:
             self._fit(X_train, y_train, **kwargs)
@@ -1668,7 +1696,7 @@ class XGBoostEstimator(SKLearnEstimator):
         # use_label_encoder is deprecated in 1.7.
         if xgboost_version < "1.7.0":
             params["use_label_encoder"] = params.get("use_label_encoder", False)
-        if "n_jobs" in config:
+        if "n_jobs" in params:
             params["nthread"] = params.pop("n_jobs")
         return params

@@ -1868,7 +1896,7 @@ class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
         params = super().config2params(config)
         if "max_leaves" in params:
             params["max_leaf_nodes"] = params.get("max_leaf_nodes", params.pop("max_leaves"))
-        if not self._task.is_classification() and "criterion" in config:
+        if not self._task.is_classification() and "criterion" in params:
             params.pop("criterion")
         if "random_state" not in params:
             params["random_state"] = 12032022
@@ -2043,8 +2071,8 @@ class CatBoostEstimator(BaseEstimator):
             self.estimator_class = CatBoostRegressor

     def fit(self, X_train, y_train, budget=None, free_mem_ratio=0, **kwargs):
-        if "is_retrain" in kwargs:
-            kwargs.pop("is_retrain")
+        kwargs.pop("is_retrain", None)
+        kwargs.pop("groups", None)
         start_time = time.time()
         deadline = start_time + budget if budget else np.inf
         train_dir = f"catboost_{str(start_time)}"
@@ -2090,7 +2118,8 @@ class CatBoostEstimator(BaseEstimator):
         if weight is not None:
             kwargs["sample_weight"] = weight
         self._model = model
-        self.params[self.ITER_HP] = self._model.tree_count_
+        # Commented-out line below incorrectly assigned n_estimators - see https://github.com/microsoft/FLAML/pull/1364
+        # self.params[self.ITER_HP] = self._model.tree_count_
         train_time = time.time() - start_time
         return train_time

@@ -2184,6 +2213,11 @@ class SVCEstimator(SKLearnEstimator):

     def __init__(self, task="binary", **config):
         super().__init__(task, **config)
+        self.params.update(
+            {
+                "random_state": config.get("random_seed", 10242048),
+            }
+        )
         assert self._task.is_classification(), "LinearSVC for classification task only"
         self.estimator_class = LinearSVC

@@ -2315,7 +2349,7 @@ class SGDEstimator(SKLearnEstimator):
         params["loss"] = params.get("loss", None)
         if params["loss"] is None and self._task.is_classification():
             params["loss"] = "log_loss" if SKLEARN_VERSION >= "1.1" else "log"
-        if not self._task.is_classification():
+        if not self._task.is_classification() and "n_jobs" in params:
             params.pop("n_jobs")

         if params.get("penalty") != "elasticnet":
@@ -2428,6 +2462,11 @@ class ElasticNetEstimator(SKLearnEstimator):

     def __init__(self, task="regression", **config):
         super().__init__(task, **config)
+        self.params.update(
+            {
+                "random_state": config.get("random_seed", 10242048),
+            }
+        )
         assert self._task.is_regression(), "ElasticNet for regression task only"
         self.estimator_class = ElasticNet

@@ -2752,7 +2791,7 @@ class BaseResourceLimit:
     def check_resource_limits(self, current_time, current_iteration, mllib):
         if (mllib == "xgb" and current_iteration == 0) or (mllib == "cat" and current_iteration == 1):
             self._time_per_iter = current_time - self.start_time
-        if current_time + self._time_per_iter > self.deadline:
+        if mllib != "cat" and current_time + self._time_per_iter > self.deadline:
             return False
         if psutil is not None and self.free_mem_ratio is not None:
             mem = psutil.virtual_memory()
@@ -1,3 +1,4 @@
+import json
 from typing import Union

 import numpy as np
@@ -9,7 +10,7 @@ from pyspark.ml.evaluation import (
     RegressionEvaluator,
 )

-from flaml.automl.spark import F, psSeries
+from flaml.automl.spark import F, T, psDataFrame, psSeries, sparkDataFrame


 def ps_group_counts(groups: Union[psSeries, np.ndarray]) -> np.ndarray:
@@ -36,6 +37,16 @@ def _compute_label_from_probability(df, probability_col, prediction_col):
     return df


+def string_to_array(s):
+    try:
+        return json.loads(s)
+    except json.JSONDecodeError:
+        return []
+
+
+string_to_array_udf = F.udf(string_to_array, T.ArrayType(T.DoubleType()))
+
+
 def spark_metric_loss_score(
     metric_name: str,
     y_predict: psSeries,
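A self-contained sketch of the cast-then-parse trick the UDF enables (assumes a local pyspark installation): a column whose string form is a JSON array is parsed back into `array<double>` so array functions can operate on it.

```python
import json

from pyspark.sql import SparkSession, functions as F, types as T

def string_to_array(s):
    try:
        return json.loads(s)
    except (json.JSONDecodeError, TypeError):
        return []

string_to_array_udf = F.udf(string_to_array, T.ArrayType(T.DoubleType()))

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([("[0.1, 0.9]",), ("[0.7, 0.3]",)], ["prediction"])
# array_max would fail on the raw string; it works after the UDF conversion.
df.select(F.array_max(string_to_array_udf("prediction")).alias("p_max")).show()
```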
@@ -135,6 +146,11 @@ def spark_metric_loss_score(
         )
     elif metric_name == "log_loss":
         # For log_loss, prediction_col should be probability, and we need to convert it to label
+        # handle data like "{'type': '1', 'values': '[1, 2, 3]'}"
+        # Fix cannot resolve "array_max(prediction)" due to data type mismatch: Parameter 1 requires the "ARRAY" type,
+        # however "prediction" has the type "STRUCT<type: TINYINT, size: INT, indices: ARRAY<INT>, values: ARRAY<DOUBLE>>"
+        df = df.withColumn(prediction_col, df[prediction_col].cast(T.StringType()))
+        df = df.withColumn(prediction_col, string_to_array_udf(df[prediction_col]))
         df = _compute_label_from_probability(df, prediction_col, prediction_col + "_label")
         evaluator = MulticlassClassificationEvaluator(
             metricName="logLoss",

@@ -87,7 +87,6 @@ class GenericTask(Task):
             "transformer": TransformersEstimator,
             "transformer_ms": TransformersEstimatorModelSelection,
             "histgb": HistGradientBoostingEstimator,
-            # Above are open-source, below are internal
             "svc": SVCEstimator,
             "sgd": SGDEstimator,
             "nb_spark": SparkNaiveBayesEstimator,
@@ -443,8 +442,8 @@ class GenericTask(Task):
             X_train_all, y_train_all = shuffle(X_train_all, y_train_all, random_state=RANDOM_SEED)
             if data_is_df:
                 X_train_all.reset_index(drop=True, inplace=True)
-                if isinstance(y_train_all, pd.Series):
-                    y_train_all.reset_index(drop=True, inplace=True)
+            if isinstance(y_train_all, pd.Series):
+                y_train_all.reset_index(drop=True, inplace=True)

         X_train, y_train = X_train_all, y_train_all
         state.groups_all = state.groups
@@ -706,7 +705,6 @@ class GenericTask(Task):
             fit_kwargs = {}
         if cv_score_agg_func is None:
             cv_score_agg_func = default_cv_score_agg_func
-        start_time = time.time()
         val_loss_folds = []
         log_metric_folds = []
         metric = None
@@ -748,7 +746,10 @@ class GenericTask(Task):
         elif isinstance(kf, TimeSeriesSplit):
             kf = kf.split(X_train_split, y_train_split)
         else:
-            kf = kf.split(X_train_split)
+            try:
+                kf = kf.split(X_train_split)
+            except TypeError:
+                kf = kf.split(X_train_split, y_train_split)

         for train_index, val_index in kf:
             if shuffle:
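A standalone sketch of the fallback this hunk introduces: some scikit-learn splitters (for example `StratifiedKFold`) require `y` in `split()`, others do not, so a `TypeError` from the one-argument call triggers the two-argument retry.

```python
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold

X = np.arange(20).reshape(10, 2)
y = np.array([0, 1] * 5)

for splitter in (KFold(n_splits=2), StratifiedKFold(n_splits=2)):
    try:
        folds = list(splitter.split(X))        # StratifiedKFold raises TypeError here
    except TypeError:
        folds = list(splitter.split(X, y))     # retry with labels
    print(type(splitter).__name__, [len(test) for _, test in folds])
```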
@@ -771,10 +772,10 @@ class GenericTask(Task):
             if not is_spark_dataframe:
                 y_train, y_val = y_train_split[train_index], y_train_split[val_index]
                 if weight is not None:
-                    fit_kwargs["sample_weight"], weight_val = (
-                        weight[train_index],
-                        weight[val_index],
+                    fit_kwargs["sample_weight"] = (
+                        weight[train_index] if isinstance(weight, np.ndarray) else weight.iloc[train_index]
                     )
+                    weight_val = weight[val_index] if isinstance(weight, np.ndarray) else weight.iloc[val_index]
                 if groups is not None:
                     fit_kwargs["groups"] = (
                         groups[train_index] if isinstance(groups, np.ndarray) else groups.iloc[train_index]
@@ -813,8 +814,6 @@ class GenericTask(Task):
             if is_spark_dataframe:
                 X_train.spark.unpersist()  # uncache data to free memory
                 X_val.spark.unpersist()  # uncache data to free memory
-            if budget and time.time() - start_time >= budget:
-                break
         val_loss, metric = cv_score_agg_func(val_loss_folds, log_metric_folds)
         n = total_fold_num
         pred_time /= n

@@ -192,7 +192,7 @@ class Task(ABC):
             * Valid str options depend on different tasks.
             For classification tasks, valid choices are
             ["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
-            For regression tasks, valid choices are ["auto", 'uniform', 'time'].
+            For regression tasks, valid choices are ["auto", 'uniform', 'time', 'group'].
             "auto" -> uniform.
             For time series forecast tasks, must be "auto" or 'time'.
             For ranking task, must be "auto" or 'group'.
@@ -393,7 +393,7 @@ class DataTransformerTS:

         for column in X.columns:
             # sklearn/utils/validation.py needs int/float values
-            if X[column].dtype.name in ("object", "category"):
+            if X[column].dtype.name in ("object", "category", "string"):
                 if (
                     # drop columns where all values are the same
                     X[column].nunique() == 1
@@ -1,10 +1,14 @@
+import atexit
+import functools
 import json
 import logging
 import os
 import pickle
+import random
 import sys
 import tempfile
 import time
 import warnings
+from concurrent.futures import ThreadPoolExecutor, wait
 from typing import MutableMapping

 import mlflow
@@ -12,14 +16,15 @@ import pandas as pd
 from mlflow.entities import Metric, Param, RunTag
 from mlflow.exceptions import MlflowException
 from mlflow.utils.autologging_utils import AUTOLOGGING_INTEGRATIONS, autologging_is_disabled
+from packaging.requirements import Requirement
 from scipy.sparse import issparse
 from sklearn import tree

 try:
     from pyspark.ml import Pipeline as SparkPipeline
     from pyspark.ml import PipelineModel as SparkPipelineModel
 except ImportError:

-    class SparkPipeline:
+    class SparkPipelineModel:
         pass

@@ -32,6 +37,84 @@ from flaml.version import __version__

 SEARCH_MAX_RESULTS = 5000  # Each train should not have more than 5000 trials
+IS_RENAME_CHILD_RUN = os.environ.get("FLAML_IS_RENAME_CHILD_RUN", "false").lower() == "true"
+REMOVE_REQUIREMENT_LIST = [
+    "synapseml-cognitive",
+    "synapseml-core",
+    "synapseml-deep-learning",
+    "synapseml-internal",
+    "synapseml-mlflow",
+    "synapseml-opencv",
+    "synapseml-vw",
+    "synapseml-lightgbm",
+    "synapseml-utils",
+    "nni",
+    "optuna",
+]
+OPTIONAL_REMOVE_REQUIREMENT_LIST = ["pytorch-lightning", "transformers"]
+
+os.environ["MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR"] = os.environ.get("MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR", "false")
+
+MLFLOW_NUM_WORKERS = int(os.environ.get("FLAML_MLFLOW_NUM_WORKERS", os.cpu_count() * 4 if os.cpu_count() else 2))
+executor = ThreadPoolExecutor(max_workers=MLFLOW_NUM_WORKERS)
+atexit.register(lambda: executor.shutdown(wait=True))
+
 IS_CLEAN_LOGS = os.environ.get("FLAML_IS_CLEAN_LOGS", "1")
 if IS_CLEAN_LOGS == "1":
     logging.getLogger("synapse.ml").setLevel(logging.CRITICAL)
+    logging.getLogger("mlflow.utils").setLevel(logging.CRITICAL)
+    logging.getLogger("mlflow.utils.environment").setLevel(logging.CRITICAL)
+    logging.getLogger("mlflow.models.model").setLevel(logging.CRITICAL)
     warnings.simplefilter("ignore", category=FutureWarning)
     warnings.simplefilter("ignore", category=UserWarning)
+
+
+def convert_requirement(requirement_list: list[str]):
+    ret = (
+        [Requirement(s.strip().lower()) for s in requirement_list]
+        if mlflow.__version__ <= "2.17.0"
+        else requirement_list
+    )
+    return ret
+
+
+def time_it(func_or_code=None):
+    """
+    Decorator or function that measures execution time.
+
+    Can be used in three ways:
+    1. As a decorator with no arguments: @time_it
+    2. As a decorator with arguments: @time_it()
+    3. As a function call with a string of code to execute and time: time_it("some_code()")
+
+    Args:
+        func_or_code (callable or str, optional): Either a function to decorate or
+            a string of code to execute and time.
+
+    Returns:
+        callable or None: Returns a decorated function if used as a decorator,
+        or None if used to execute a string of code.
+    """
+
+    def decorator(func):
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            start_time = time.time()
+            result = func(*args, **kwargs)
+            end_time = time.time()
+            logger.debug(f"Execution of {func.__name__} took {end_time - start_time:.4f} seconds")
+            return result
+
+        return wrapper
+
+    if callable(func_or_code):
+        return decorator(func_or_code)
+    elif func_or_code is None:
+        return decorator
+    else:
+        start_time = time.time()
+        exec(func_or_code)
+        end_time = time.time()
+        logger.debug(f"Execution\n```\n{func_or_code}\n```\ntook {end_time - start_time:.4f} seconds")


 def flatten_dict(d: MutableMapping, sep: str = ".") -> MutableMapping:
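The three call forms in practice; a minimal sketch (the helper logs at DEBUG, so the root logger is lowered first; `flaml.fabric.mlflow` is my assumed module path for this file):

```python
import logging
import time

logging.basicConfig(level=logging.DEBUG)

from flaml.fabric.mlflow import time_it  # assumed module path

@time_it          # form 1: bare decorator
def train():
    time.sleep(0.2)

@time_it()        # form 2: decorator with parentheses
def evaluate():
    time.sleep(0.1)

train()
evaluate()
time_it("sum(range(1_000_000))")  # form 3: time a code string via exec()
```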
@@ -49,23 +132,28 @@ def is_autolog_enabled():
     return not all(autologging_is_disabled(k) for k in AUTOLOGGING_INTEGRATIONS.keys())


-def get_mlflow_log_latency(model_history=False):
+def get_mlflow_log_latency(model_history=False, delete_run=True):
+    try:
+        FLAML_MLFLOW_LOG_LATENCY = float(os.getenv("FLAML_MLFLOW_LOG_LATENCY", 0))
+    except ValueError:
+        FLAML_MLFLOW_LOG_LATENCY = 0
+    if FLAML_MLFLOW_LOG_LATENCY >= 0.1:
+        return FLAML_MLFLOW_LOG_LATENCY
     st = time.time()
     with mlflow.start_run(nested=True, run_name="get_mlflow_log_latency") as run:
         if model_history:
             sk_model = tree.DecisionTreeClassifier()
-            mlflow.sklearn.log_model(sk_model, "sk_models")
-            mlflow.sklearn.log_model(Pipeline([("estimator", sk_model)]), "sk_pipeline")
+            mlflow.sklearn.log_model(sk_model, "model")
         with tempfile.TemporaryDirectory() as tmpdir:
-            pickle_fpath = os.path.join(tmpdir, f"tmp_{int(time.time()*1000)}")
+            pickle_fpath = os.path.join(tmpdir, f"tmp_{int(time.time() * 1000)}")
             with open(pickle_fpath, "wb") as f:
                 pickle.dump(sk_model, f)
-            mlflow.log_artifact(pickle_fpath, "sk_model1")
-            mlflow.log_artifact(pickle_fpath, "sk_model2")
+            mlflow.log_artifact(pickle_fpath, "sk_model")
         mlflow.set_tag("synapseml.ui.visible", "false")  # not shown inline in fabric
-        mlflow.delete_run(run.info.run_id)
+    if delete_run:
+        mlflow.delete_run(run.info.run_id)
     et = time.time()
-    return et - st
+    return 3 * (et - st)


 def infer_signature(X_train=None, y_train=None, dataframe=None, label=None):
@@ -98,12 +186,76 @@ def infer_signature(X_train=None, y_train=None, dataframe=None, label=None):
     )


+def update_and_install_requirements(
+    run_id=None,
+    model_name=None,
+    model_version=None,
+    remove_list=None,
+    artifact_path="model",
+    dst_path=None,
+    install_with_ipython=False,
+):
+    if not (run_id or (model_name and model_version)):
+        raise ValueError(
+            "Please provide `run_id` or both `model_name` and `model_version`. If all three are provided, `run_id` will be used."
+        )
+
+    if install_with_ipython:
+        from IPython import get_ipython
+
+    if not remove_list:
+        remove_list = [
+            "synapseml-cognitive",
+            "synapseml-core",
+            "synapseml-deep-learning",
+            "synapseml-internal",
+            "synapseml-mlflow",
+            "synapseml-opencv",
+            "synapseml-vw",
+            "synapseml-lightgbm",
+            "synapseml-utils",
+            "flaml",  # flaml is needed for AutoML models, should be pre-installed in the runtime
+            "pyspark",  # fabric internal pyspark should be pre-installed in the runtime
+        ]
+
+    # Download model artifacts
+    client = mlflow.MlflowClient()
+    if not run_id:
+        run_id = client.get_model_version(model_name, model_version).run_id
+    if not dst_path:
+        dst_path = os.path.join(tempfile.gettempdir(), "model_artifacts")
+    os.makedirs(dst_path, exist_ok=True)
+    client.download_artifacts(run_id, artifact_path, dst_path)
+    requirements_path = os.path.join(dst_path, artifact_path, "requirements.txt")
+    with open(requirements_path) as f:
+        reqs = f.read().splitlines()
+    old_reqs = [Requirement(req) for req in reqs if req]
+    old_reqs_dict = {req.name: str(req) for req in old_reqs}
+    for req in remove_list:
+        req = Requirement(req)
+        if req.name in old_reqs_dict:
+            old_reqs_dict.pop(req.name, None)
+    new_reqs_list = list(old_reqs_dict.values())
+
+    with open(requirements_path, "w") as f:
+        f.write("\n".join(new_reqs_list))
+
+    if install_with_ipython:
+        get_ipython().run_line_magic("pip", f"install -r {requirements_path} -q")
+    else:
+        logger.info(f"You can run `pip install -r {requirements_path}` to install dependencies.")
+    return requirements_path
+
+
 def _mlflow_wrapper(evaluation_func, mlflow_exp_id, mlflow_config=None, extra_tags=None, autolog=False):
     def wrapped(*args, **kwargs):
         if mlflow_config is not None:
-            from synapse.ml.mlflow import set_mlflow_env_config
-
-            set_mlflow_env_config(mlflow_config)
+            try:
+                from synapse.ml.mlflow import set_mlflow_env_config
+
+                set_mlflow_env_config(mlflow_config)
+            except Exception:
+                pass
         import mlflow

         if mlflow_exp_id is not None:
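A hypothetical invocation of the helper added above (the run id is a placeholder, and `flaml.fabric.mlflow` is my assumed module path):

```python
from flaml.fabric.mlflow import update_and_install_requirements  # assumed path

# Rewrites the model's requirements.txt without the synapseml/flaml/pyspark
# pins, then reports where the cleaned file lives.
reqs_path = update_and_install_requirements(
    run_id="<mlflow-run-id>",   # placeholder; or model_name=... + model_version=...
    artifact_path="model",
    install_with_ipython=False,  # just rewrite requirements.txt and log the hint
)
print(reqs_path)
```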
@@ -124,7 +276,27 @@ def _mlflow_wrapper(evaluation_func, mlflow_exp_id, mlflow_config=None, extra_ta


 def _get_notebook_name():
-    return None
+    try:
+        import re
+
+        from synapse.ml.mlflow import get_mlflow_env_config
+        from synapse.ml.mlflow.shared_platform_utils import get_artifact
+
+        notebook_id = get_mlflow_env_config(False).artifact_id
+        current_notebook = get_artifact(notebook_id)
+        notebook_name = re.sub("\\W+", "-", current_notebook.displayName).strip()
+
+        return notebook_name
+    except Exception as e:
+        logger.debug(f"Failed to get notebook name: {e}")
+        return None
+
+
+def safe_json_dumps(obj):
+    def default(o):
+        return str(o)
+
+    return json.dumps(obj, default=default)


 class MLflowIntegration:
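The fallback serializer stringifies anything the stdlib encoder rejects; a standalone copy to illustrate:

```python
import json
from datetime import datetime

def safe_json_dumps(obj):
    def default(o):
        return str(o)  # last-resort conversion for non-JSON-native values

    return json.dumps(obj, default=default)

print(safe_json_dumps({"when": datetime(2024, 1, 1), "tags": {"a", "b"}}))
# -> {"when": "2024-01-01 00:00:00", "tags": "{'a', 'b'}"}  (set order may vary)
```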
@@ -156,6 +328,8 @@ class MLflowIntegration:
|
||||
self.has_model = False
|
||||
self.only_history = False
|
||||
self._do_log_model = True
|
||||
self.futures = {}
|
||||
self.futures_log_model = {}
|
||||
|
||||
self.extra_tag = (
|
||||
extra_tag
|
||||
@@ -163,6 +337,9 @@ class MLflowIntegration:
|
||||
else {"extra_tag.sid": f"flaml_{__version__}_{int(time.time())}_{random.randint(1001, 9999)}"}
|
||||
)
|
||||
self.start_time = time.time()
|
||||
self.experiment_type = experiment_type
|
||||
self.update_autolog_state()
|
||||
|
||||
self.mlflow_client = mlflow.tracking.MlflowClient()
|
||||
parent_run_info = mlflow.active_run().info if mlflow.active_run() is not None else None
|
||||
if parent_run_info:
|
||||
@@ -181,8 +358,6 @@ class MLflowIntegration:
|
||||
mlflow.set_experiment(experiment_name=mlflow_exp_name)
|
||||
self.experiment_id = mlflow.tracking.fluent._active_experiment_id
|
||||
self.experiment_name = mlflow.get_experiment(self.experiment_id).name
|
||||
self.experiment_type = experiment_type
|
||||
self.update_autolog_state()
|
||||
|
||||
if self.autolog:
|
||||
# only end user created parent run in autolog scenario
|
||||
@@ -190,9 +365,12 @@ class MLflowIntegration:
|
||||
|
||||
    def set_mlflow_config(self):
        if self.driver_mlflow_env_config is not None:
-           from synapse.ml.mlflow import set_mlflow_env_config
-
-           set_mlflow_env_config(self.driver_mlflow_env_config)
+           try:
+               from synapse.ml.mlflow import set_mlflow_env_config
+
+               set_mlflow_env_config(self.driver_mlflow_env_config)
+           except Exception:
+               pass

    def wrap_evaluation_function(self, evaluation_function):
        wrapped_evaluation_function = _mlflow_wrapper(
@@ -260,6 +438,7 @@ class MLflowIntegration:
        else:
            _tags = []
        self.mlflow_client.log_batch(run_id=target_id, metrics=_metrics, params=[], tags=_tags)
+       return f"Successfully copy_mlflow_run run_id {src_id} to run_id {target_id}"

    def record_trial(self, result, trial, metric):
        if isinstance(result, dict):
@@ -327,12 +506,31 @@ class MLflowIntegration:
            self.copy_mlflow_run(best_mlflow_run_id, self.parent_run_id)
            self.has_summary = True

-   def log_model(self, model, estimator, signature=None):
+   def log_model(self, model, estimator, signature=None, run_id=None):
        if not self._do_log_model:
            return
        logger.debug(f"logging model {estimator}")
+       ret_message = f"Successfully log_model {estimator} to run_id {run_id}"
+       optional_remove_list = (
+           [] if estimator in ["transformer", "transformer_ms", "tcn", "tft"] else OPTIONAL_REMOVE_REQUIREMENT_LIST
+       )
+       run = mlflow.active_run()
+       if run and run.info.run_id == self.parent_run_id:
+           logger.debug(
+               f"Current active run_id {run.info.run_id} == parent_run_id {self.parent_run_id}, Starting run_id {run_id}"
+           )
+           mlflow.start_run(run_id=run_id, nested=True)
+       elif run and run.info.run_id != run_id:
+           ret_message = (
+               f"Error: Should log_model {estimator} to run_id {run_id}, but logged to run_id {run.info.run_id}"
+           )
+           logger.error(ret_message)
+       else:
+           logger.debug(f"No active run, start run_id {run_id}")
+           mlflow.start_run(run_id=run_id)
+       logger.debug(f"logged model {estimator} to run_id {mlflow.active_run().info.run_id}")
        if estimator.endswith("_spark"):
-           mlflow.spark.log_model(model, estimator, signature=signature)
+           # mlflow.spark.log_model(model, estimator, signature=signature)
+           mlflow.spark.log_model(model, "model", signature=signature)
        elif estimator in ["lgbm"]:
            mlflow.lightgbm.log_model(model, estimator, signature=signature)
@@ -345,42 +543,93 @@
elif estimator in ["prophet"]:
|
||||
mlflow.prophet.log_model(model, estimator, signature=signature)
|
||||
elif estimator in ["orbit"]:
|
||||
pass
|
||||
logger.warning(f"Unsupported model: {estimator}. No model logged.")
|
||||
else:
|
||||
mlflow.sklearn.log_model(model, estimator, signature=signature)
|
||||
future = executor.submit(
|
||||
lambda: mlflow.models.model.update_model_requirements(
|
||||
model_uri=f"runs:/{run_id}/{'model' if estimator.endswith('_spark') else estimator}",
|
||||
operation="remove",
|
||||
requirement_list=convert_requirement(REMOVE_REQUIREMENT_LIST + optional_remove_list),
|
||||
)
|
||||
)
|
||||
self.futures[future] = f"run_{run_id}_requirements_updated"
|
||||
if not run or run.info.run_id == self.parent_run_id:
|
||||
logger.debug(f"Ending current run_id {mlflow.active_run().info.run_id}")
|
||||
mlflow.end_run()
|
||||
return ret_message
|
||||
|
||||
def _pickle_and_log_artifact(self, obj, artifact_name, pickle_fname="temp_.pkl"):
|
||||
def _pickle_and_log_artifact(self, obj, artifact_name, pickle_fname="temp_.pkl", run_id=None):
|
||||
if not self._do_log_model:
|
||||
return
|
||||
return True
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
pickle_fpath = os.path.join(tmpdir, pickle_fname)
|
||||
try:
|
||||
with open(pickle_fpath, "wb") as f:
|
||||
pickle.dump(obj, f)
|
||||
mlflow.log_artifact(pickle_fpath, artifact_name)
|
||||
mlflow.log_artifact(pickle_fpath, artifact_name, run_id)
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.debug(f"Failed to pickle and log artifact {artifact_name}, error: {e}")
|
||||
logger.debug(f"Failed to pickle and log {artifact_name}, error: {e}")
|
||||
return False
|
||||
|
||||
-   def pickle_and_log_automl_artifacts(self, automl, model, estimator, signature=None):
+   def _log_pipeline(self, pipeline, flavor_name, pipeline_name, signature, run_id, estimator=None):
+       logger.debug(f"logging pipeline {flavor_name}:{pipeline_name}:{estimator}")
+       ret_message = f"Successfully _log_pipeline {flavor_name}:{pipeline_name}:{estimator} to run_id {run_id}"
+       optional_remove_list = (
+           [] if estimator in ["transformer", "transformer_ms", "tcn", "tft"] else OPTIONAL_REMOVE_REQUIREMENT_LIST
+       )
+       run = mlflow.active_run()
+       if run and run.info.run_id == self.parent_run_id:
+           logger.debug(
+               f"Current active run_id {run.info.run_id} == parent_run_id {self.parent_run_id}, Starting run_id {run_id}"
+           )
+           mlflow.start_run(run_id=run_id, nested=True)
+       elif run and run.info.run_id != run_id:
+           ret_message = f"Error: Should _log_pipeline {flavor_name}:{pipeline_name}:{estimator} model to run_id {run_id}, but logged to run_id {run.info.run_id}"
+           logger.error(ret_message)
+       else:
+           logger.debug(f"No active run, start run_id {run_id}")
+           mlflow.start_run(run_id=run_id)
+       logger.debug(
+           f"logging pipeline {flavor_name}:{pipeline_name}:{estimator} to run_id {mlflow.active_run().info.run_id}"
+       )
+       if flavor_name == "sklearn":
+           mlflow.sklearn.log_model(pipeline, pipeline_name, signature=signature)
+       elif flavor_name == "spark":
+           mlflow.spark.log_model(pipeline, pipeline_name, signature=signature)
+       else:
+           logger.warning(f"Unsupported pipeline flavor: {flavor_name}. No model logged.")
+       future = executor.submit(
+           lambda: mlflow.models.model.update_model_requirements(
+               model_uri=f"runs:/{run_id}/{pipeline_name}",
+               operation="remove",
+               requirement_list=convert_requirement(REMOVE_REQUIREMENT_LIST + optional_remove_list),
+           )
+       )
+       self.futures[future] = f"run_{run_id}_requirements_updated"
+       if not run or run.info.run_id == self.parent_run_id:
+           logger.debug(f"Ending current run_id {mlflow.active_run().info.run_id}")
+           mlflow.end_run()
+       return ret_message

+   def pickle_and_log_automl_artifacts(self, automl, model, estimator, signature=None, run_id=None):
        """log automl artifacts to mlflow
        load back with `automl = mlflow.pyfunc.load_model(model_run_id_or_uri)`, then do prediction with `automl.predict(X)`
        """
-       logger.debug(f"logging automl artifacts {estimator}")
-       self._pickle_and_log_artifact(automl.feature_transformer, "feature_transformer", "feature_transformer.pkl")
-       self._pickle_and_log_artifact(automl.label_transformer, "label_transformer", "label_transformer.pkl")
-       # Test test_mlflow 1 and 4 will get error: TypeError: cannot pickle '_io.TextIOWrapper' object
-       # try:
-       #     self._pickle_and_log_artifact(automl, "automl", "automl.pkl")
-       # except TypeError:
-       #     pass
+       logger.debug(f"logging automl estimator {estimator}")
+       # self._pickle_and_log_artifact(
+       #     automl.feature_transformer, "feature_transformer", "feature_transformer.pkl", run_id
+       # )
+       # self._pickle_and_log_artifact(automl.label_transformer, "label_transformer", "label_transformer.pkl", run_id)
-       if estimator.endswith("_spark"):
-           # spark pipeline is not supported yet
-           return
        feature_transformer = automl.feature_transformer
-       if isinstance(feature_transformer, Pipeline):
+       if isinstance(feature_transformer, Pipeline) and not estimator.endswith("_spark"):
            pipeline = feature_transformer
            pipeline.steps.append(("estimator", model))
-       elif isinstance(feature_transformer, SparkPipeline):
+       elif isinstance(feature_transformer, SparkPipelineModel) and estimator.endswith("_spark"):
            pipeline = feature_transformer
            pipeline.stages.append(model)
        elif not estimator.endswith("_spark"):
@@ -388,24 +637,26 @@ class MLflowIntegration:
            steps.append(("estimator", model))
            pipeline = Pipeline(steps)
        else:
-           stages = [feature_transformer]
+           stages = []
+           if feature_transformer is not None:
+               stages.append(feature_transformer)
            stages.append(model)
-           pipeline = SparkPipeline(stages=stages)
-       if isinstance(pipeline, SparkPipeline):
+           pipeline = SparkPipelineModel(stages=stages)
+       if isinstance(pipeline, SparkPipelineModel):
            logger.debug(f"logging spark pipeline {estimator}")
-           mlflow.spark.log_model(pipeline, "automl_pipeline", signature=signature)
+           self._log_pipeline(pipeline, "spark", "model", signature, run_id, estimator)
        else:
-           # Add a log named "model" to fit default settings
            logger.debug(f"logging sklearn pipeline {estimator}")
-           mlflow.sklearn.log_model(pipeline, "automl_pipeline", signature=signature)
-           mlflow.sklearn.log_model(pipeline, "model", signature=signature)
+           self._log_pipeline(pipeline, "sklearn", "model", signature, run_id, estimator)
+       return f"Successfully pickle_and_log_automl_artifacts {estimator} to run_id {run_id}"
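Not part of the diff: a hypothetical illustration of the docstring above — loading the pipeline that `pickle_and_log_automl_artifacts` logs under the "model" artifact and predicting with it. `child_run_id` and `X_test` are placeholders for a real run and dataset.

    import mlflow

    child_run_id = "<run id of the child run that logged the pipeline>"
    automl_pipeline = mlflow.pyfunc.load_model(f"runs:/{child_run_id}/model")
    # predictions = automl_pipeline.predict(X_test)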

    @time_it
    def record_state(self, automl, search_state, estimator):
        _st = time.time()
        automl_metric_name = (
            automl._state.metric if isinstance(automl._state.metric, str) else automl._state.error_metric
        )

        if automl._state.error_metric.startswith("1-"):
            automl_metric_value = 1 - search_state.val_loss
        elif automl._state.error_metric.startswith("-"):
@@ -418,6 +669,8 @@
        else:
            config = search_state.config

+       self.automl_user_configurations = safe_json_dumps(automl._automl_user_configurations)
+
        info = {
            "metrics": {
                "iter_counter": automl._track_iter,
@@ -438,7 +691,7 @@
                "flaml.meric": automl_metric_name,
                "flaml.run_source": "flaml-automl",
                "flaml.log_type": self.log_type,
-               "flaml.automl_user_configurations": json.dumps(automl._automl_user_configurations),
+               "flaml.automl_user_configurations": self.automl_user_configurations,
            },
            "params": {
                "sample_size": search_state.sample_size,
@@ -465,33 +718,70 @@
            run_name = f"{self.parent_run_name}_child_{self.child_counter}"
        else:
            run_name = None
+       _t1 = time.time()
+       wait(self.futures_log_model)
+       _t2 = time.time() - _t1
+       logger.debug(f"wait futures_log_model in record_state took {_t2} seconds")
        with mlflow.start_run(nested=True, run_name=run_name) as child_run:
-           self._log_info_to_run(info, child_run.info.run_id, log_params=True)
+           future = executor.submit(lambda: self._log_info_to_run(info, child_run.info.run_id, log_params=True))
+           self.futures[future] = f"iter_{automl._track_iter}_log_info_to_run"
+           future = executor.submit(lambda: self._log_automl_configurations(child_run.info.run_id))
+           self.futures[future] = f"iter_{automl._track_iter}_log_automl_configurations"
            if automl._state.model_history:
-               self.log_model(
-                   search_state.trained_estimator._model, estimator, signature=automl.estimator_signature
-               )
-               self.pickle_and_log_automl_artifacts(
-                   automl, search_state.trained_estimator, estimator, signature=automl.pipeline_signature
-               )
+               if estimator.endswith("_spark"):
+                   future = executor.submit(
+                       lambda: self.log_model(
+                           search_state.trained_estimator._model,
+                           estimator,
+                           automl.estimator_signature,
+                           child_run.info.run_id,
+                       )
+                   )
+                   self.futures_log_model[future] = f"record_state-log_model_{estimator}"
+               else:
+                   future = executor.submit(
+                       lambda: self.pickle_and_log_automl_artifacts(
+                           automl,
+                           search_state.trained_estimator,
+                           estimator,
+                           automl.pipeline_signature,
+                           child_run.info.run_id,
+                       )
+                   )
+                   self.futures_log_model[future] = f"record_state-pickle_and_log_automl_artifacts_{estimator}"
            self.manual_run_ids.append(child_run.info.run_id)
            self.child_counter += 1
+       return f"Successfully record_state iteration {automl._track_iter}"
    @time_it
    def log_automl(self, automl):
        self.set_best_iter(automl)
        if self.autolog:
            if self.parent_run_id is not None:
                mlflow.start_run(run_id=self.parent_run_id, experiment_id=self.experiment_id)
-               mlflow.log_metric("best_validation_loss", automl._state.best_loss)
-               mlflow.log_metric("best_iteration", automl._best_iteration)
-               mlflow.log_metric("num_child_runs", len(self.infos))
-           if automl._trained_estimator is not None and not self.has_model:
-               self.log_model(
-                   automl._trained_estimator._model, automl.best_estimator, signature=automl.estimator_signature
-               )
-               self.pickle_and_log_automl_artifacts(
-                   automl, automl.model, automl.best_estimator, signature=automl.pipeline_signature
-               )
+               mlflow.log_metrics(
+                   {
+                       "best_validation_loss": automl._state.best_loss,
+                       "best_iteration": automl._best_iteration,
+                       "num_child_runs": len(self.infos),
+                   }
+               )
+           if (
+               automl._trained_estimator is not None
+               and not self.has_model
+               and automl._trained_estimator._model is not None
+           ):
+               if automl.best_estimator.endswith("_spark"):
+                   self.log_model(
+                       automl._trained_estimator._model,
+                       automl.best_estimator,
+                       automl.estimator_signature,
+                       self.parent_run_id,
+                   )
+               else:
+                   self.pickle_and_log_automl_artifacts(
+                       automl, automl.model, automl.best_estimator, automl.pipeline_signature, self.parent_run_id
+                   )
                self.has_model = True

            self.adopt_children(automl)
@@ -508,30 +798,65 @@ class MLflowIntegration:
            if "ml" in conf.keys():
                conf = conf["ml"]

-           mlflow.log_params(conf)
-           mlflow.log_param("best_learner", automl._best_estimator)
+           mlflow.log_params({**conf, "best_learner": automl._best_estimator}, run_id=self.parent_run_id)
            if not self.has_summary:
                logger.info(f"logging best model {automl.best_estimator}")
-               self.copy_mlflow_run(best_mlflow_run_id, self.parent_run_id)
+               future = executor.submit(lambda: self.copy_mlflow_run(best_mlflow_run_id, self.parent_run_id))
+               self.futures[future] = "log_automl_copy_mlflow_run"
+               future = executor.submit(lambda: self._log_automl_configurations(self.parent_run_id))
+               self.futures[future] = "log_automl_log_automl_configurations"
                self.has_summary = True
-           if automl._trained_estimator is not None and not self.has_model:
-               self.log_model(
-                   automl._trained_estimator._model,
-                   automl.best_estimator,
-                   signature=automl.estimator_signature,
-               )
-               self.pickle_and_log_automl_artifacts(
-                   automl, automl.model, automl.best_estimator, signature=automl.pipeline_signature
-               )
+           _t1 = time.time()
+           wait(self.futures_log_model)
+           _t2 = time.time() - _t1
+           logger.debug(f"wait futures_log_model in log_automl took {_t2} seconds")
+           if (
+               automl._trained_estimator is not None
+               and not self.has_model
+               and automl._trained_estimator._model is not None
+           ):
+               if automl.best_estimator.endswith("_spark"):
+                   future = executor.submit(
+                       lambda: self.log_model(
+                           automl._trained_estimator._model,
+                           automl.best_estimator,
+                           signature=automl.estimator_signature,
+                           run_id=self.parent_run_id,
+                       )
+                   )
+                   self.futures_log_model[future] = f"log_automl-log_model_{automl.best_estimator}"
+               else:
+                   future = executor.submit(
+                       lambda: self.pickle_and_log_automl_artifacts(
+                           automl,
+                           automl.model,
+                           automl.best_estimator,
+                           signature=automl.pipeline_signature,
+                           run_id=self.parent_run_id,
+                       )
+                   )
+                   self.futures_log_model[
+                       future
+                   ] = f"log_automl-pickle_and_log_automl_artifacts_{automl.best_estimator}"
                self.has_model = True

    def resume_mlflow(self):
        if len(self.resume_params) > 0:
            mlflow.autolog(**self.resume_params)

+   def _log_automl_configurations(self, run_id):
+       self.mlflow_client.log_text(
+           run_id=run_id,
+           text=self.automl_user_configurations,
+           artifact_file="automl_configurations/automl_user_configurations.json",
+       )
+       return f"Successfully _log_automl_configurations to run_id {run_id}"

    def _log_info_to_run(self, info, run_id, log_params=False):
        _metrics = [Metric(key, value, int(time.time() * 1000), 0) for key, value in info["metrics"].items()]
-       _tags = [RunTag(key, str(value)) for key, value in info["tags"].items()]
+       _tags = [
+           RunTag(key, str(value)[:5000]) for key, value in info["tags"].items()
+       ]  # AML will raise error if value length > 5000
        _params = [
            Param(key, str(value))
            for key, value in info["params"].items()
@@ -547,6 +872,7 @@ class MLflowIntegration:
            _tags = [RunTag("mlflow.parentRunId", run_id)]
            self.mlflow_client.log_batch(run_id=run.info.run_id, metrics=_metrics, params=[], tags=_tags)
        del info["submetrics"]["values"]
+       return f"Successfully _log_info_to_run to run_id {run_id}"

    def adopt_children(self, result=None):
        """
flaml/tune/logger.py (new file, 37 lines)
@@ -0,0 +1,37 @@
import logging
import os


class ColoredFormatter(logging.Formatter):
    # ANSI escape codes for colors
    COLORS = {
        # logging.DEBUG: "\033[36m",  # Cyan
        # logging.INFO: "\033[32m",  # Green
        logging.WARNING: "\033[33m",  # Yellow
        logging.ERROR: "\033[31m",  # Red
        logging.CRITICAL: "\033[1;31m",  # Bright Red
    }
    RESET = "\033[0m"  # Reset to default

    def __init__(self, fmt, datefmt, use_color=True):
        super().__init__(fmt, datefmt)
        self.use_color = use_color

    def format(self, record):
        formatted = super().format(record)
        if self.use_color:
            color = self.COLORS.get(record.levelno, "")
            if color:
                return f"{color}{formatted}{self.RESET}"
        return formatted


logger = logging.getLogger(__name__)
use_color = True
if os.getenv("FLAML_LOG_NO_COLOR"):
    use_color = False

logger_formatter = ColoredFormatter(
    "[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s", "%m-%d %H:%M:%S", use_color
)
logger.propagate = False
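A minimal sketch (not part of the diff) of how this formatter gets wired up: attach it to a stream handler so WARNING and above come out colored; setting FLAML_LOG_NO_COLOR disables the escape codes, which matters when logs go to a file or a CI console.

    import logging
    import sys

    from flaml.tune.logger import logger, logger_formatter

    handler = logging.StreamHandler(stream=sys.stdout)
    handler.setFormatter(logger_formatter)
    logger.addHandler(handler)
    logger.warning("shown in yellow on a color-capable terminal")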
@@ -162,6 +162,10 @@ def broadcast_code(custom_code="", file_name="mylearner"):
    assert isinstance(MyLargeLGBM(), LGBMEstimator)
    ```
    """
+   # Check if Spark is available
+   spark_available, _ = check_spark()
+
+   # Write to local driver file system
    flaml_path = os.path.dirname(os.path.abspath(__file__))
    custom_code = textwrap.dedent(custom_code)
    custom_path = os.path.join(flaml_path, file_name + ".py")
@@ -169,6 +173,24 @@ def broadcast_code(custom_code="", file_name="mylearner"):
    with open(custom_path, "w") as f:
        f.write(custom_code)

+   # If using Spark, broadcast the code content to executors
+   if spark_available:
+       spark = SparkSession.builder.getOrCreate()
+       bc_code = spark.sparkContext.broadcast(custom_code)
+
+       # Execute a job to ensure the code is distributed to all executors
+       def _write_code(bc):
+           code = bc.value
+           import os
+
+           module_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), file_name + ".py")
+           os.makedirs(os.path.dirname(module_path), exist_ok=True)
+           with open(module_path, "w") as f:
+               f.write(code)
+           return True
+
+       spark.sparkContext.parallelize(range(1)).map(lambda _: _write_code(bc_code)).collect()
+
    return custom_path
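Illustrative usage of the broadcast behavior above (assumptions: broadcast_code lives in flaml.tune.spark.utils alongside check_spark, and the mylearner import path follows the function's own docstring):

    from flaml.tune.spark.utils import broadcast_code

    custom_code = """
    from flaml.automl.model import LGBMEstimator

    class MyLargeLGBM(LGBMEstimator):
        pass
    """

    # Writes mylearner.py on the driver and, when Spark is up, on every executor.
    broadcast_code(custom_code=custom_code)
    from flaml.tune.spark.mylearner import MyLargeLGBM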
@@ -21,11 +21,11 @@ except (ImportError, AssertionError):
        from .analysis import ExperimentAnalysis as EA
else:
    ray_available = True

-import logging
-
from flaml.tune.spark.utils import PySparkOvertimeMonitor, check_spark

+from .logger import logger, logger_formatter
from .result import DEFAULT_METRIC
from .trial import Trial

@@ -41,8 +41,6 @@ except ImportError:
    internal_mlflow = False


-logger = logging.getLogger(__name__)
-logger.propagate = False
_use_ray = True
_runner = None
_verbose = 0

@@ -197,9 +195,16 @@ def report(_metric=None, **kwargs):
    global _training_iteration
    if _use_ray:
        try:
-           from ray import tune
+           from ray import __version__ as ray_version

-           return tune.report(_metric, **kwargs)
+           if ray_version.startswith("1."):
+               from ray import tune
+
+               return tune.report(_metric, **kwargs)
+           else:  # ray>=2
+               from ray.air import session
+
+               return session.report(metrics={"metric": _metric, **kwargs})
        except ImportError:
            # calling tune.report() outside tune.run()
            return
@@ -260,6 +265,8 @@ def run(
    mlflow_exp_name: Optional[str] = None,
    automl_info: Optional[Tuple[float]] = None,
    extra_tag: Optional[dict] = None,
+   cost_attr: Optional[str] = "auto",
+   cost_budget: Optional[float] = None,
    **ray_args,
):
    """The function-based way of performing HPO.
@@ -462,6 +469,12 @@ def run(
            overwritten by the value of `n_concurrent_trials` in AutoML. When <= 0, the concurrent trials
            will be set to the number of executors.
        extra_tag: dict, default=None | Extra tags to be added to the mlflow runs created by autologging.
+       cost_attr: None or str to specify the attribute used to evaluate the cost of different trials.
+           Default is "auto", which means the cost attribute is chosen automatically (depending on the
+           nature of the resource budget). When cost_attr is None, cost differences between trials are
+           omitted in the search algorithm. When cost_attr is a str other than "auto" and "time_total_s",
+           that attribute must be available in the result dict of each trial.
+       cost_budget: A float of the cost budget. Only valid when cost_attr is a str other than "auto" and "time_total_s".
        **ray_args: keyword arguments to pass to ray.tune.run().
            Only valid when use_ray=True.
    """
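To make the new knobs concrete, a hypothetical sketch (toy objective, made-up search space) of driving the search by a custom cost attribute; per the docstring above, the "cost" key must appear in each trial's result dict, and cost_budget caps the cumulative cost.

    from flaml import tune

    def evaluate(config):
        metric = (config["x"] - 2) ** 2
        # report a custom cost alongside the metric
        tune.report(metric=metric, cost=config["x"])

    analysis = tune.run(
        evaluate,
        config={"x": tune.uniform(0, 10)},
        metric="metric",
        mode="min",
        num_samples=20,
        cost_attr="cost",   # a str other than "auto"/"time_total_s"
        cost_budget=100.0,  # budget for the custom cost
    )
    print(analysis.best_config)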
@@ -506,10 +519,6 @@ def run(
    elif not logger.hasHandlers():
        # Add the console handler.
        _ch = logging.StreamHandler(stream=sys.stdout)
-       logger_formatter = logging.Formatter(
-           "[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s",
-           "%m-%d %H:%M:%S",
-       )
        _ch.setFormatter(logger_formatter)
        logger.addHandler(_ch)
    if verbose <= 2:
@@ -600,6 +609,8 @@ def run(
            metric_constraints=metric_constraints,
            use_incumbent_result_in_evaluation=use_incumbent_result_in_evaluation,
            lexico_objectives=lexico_objectives,
+           cost_attr=cost_attr,
+           cost_budget=cost_budget,
        )
    else:
        if metric is None or mode is None:
@@ -735,10 +746,16 @@ def run(
            max_concurrent = max(1, search_alg.max_concurrent)
        else:
            max_concurrent = max(1, max_spark_parallelism)
+       passed_in_n_concurrent_trials = max(n_concurrent_trials, max_concurrent)
        n_concurrent_trials = min(
            n_concurrent_trials if n_concurrent_trials > 0 else num_executors,
            max_concurrent,
        )
+       if n_concurrent_trials < passed_in_n_concurrent_trials:
+           logger.warning(
+               f"The actual concurrent trials is {n_concurrent_trials}. You can set the environment "
+               f"variable `FLAML_MAX_CONCURRENT` to '{passed_in_n_concurrent_trials}' to override the detected num of executors."
+           )
        with parallel_backend("spark"):
            with Parallel(n_jobs=n_concurrent_trials, verbose=max(0, (verbose - 1) * 50)) as parallel:
                try:
@@ -760,7 +777,7 @@ def run(
                    and num_failures < upperbound_num_failures
                ):
                    if automl_info and automl_info[0] > 0 and time_budget_s < np.inf:
-                       time_budget_s -= automl_info[0]
+                       time_budget_s -= automl_info[0] * n_concurrent_trials
                        logger.debug(f"Remaining time budget with mlflow log latency: {time_budget_s} seconds.")
                    while len(_runner.running_trials) < n_concurrent_trials:
                        # suggest trials for spark
@@ -1 +1 @@
-__version__ = "2.3.1"
+__version__ = "2.3.6"

pytest.ini (new file, 3 lines)
@@ -0,0 +1,3 @@
[pytest]
markers =
    spark: mark a test as requiring Spark
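With the marker registered, Spark-dependent tests can be selected or skipped wholesale; a small illustrative invocation (paths assumed):

    import pytest

    # equivalent to `pytest -m spark test/spark` on the command line;
    # use "-m", "not spark" to exclude Spark-dependent tests instead
    pytest.main(["-m", "spark", "test/spark"])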
setup.py
@@ -73,7 +73,7 @@ setuptools.setup(
            "psutil==5.8.0",
            "dataclasses",
            "transformers[torch]==4.26",
-           "datasets",
+           "datasets<=3.5.0",
            "nltk<=3.8.1",  # 3.8.2 doesn't work with mlflow
            "rouge_score",
            "hcrystalball==0.1.10",
@@ -170,10 +170,9 @@ setuptools.setup(
        "Operating System :: OS Independent",
        # Specify the Python versions you support here.
        "Programming Language :: Python :: 3",
-       "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
    ],
-   python_requires=">=3.8",
+   python_requires=">=3.9",
)
@@ -1,11 +1,15 @@
import unittest
from datetime import datetime
+from test.conftest import evaluate_cv_folds_with_underlying_model

import numpy as np
import pandas as pd
+import pytest
import scipy.sparse
from sklearn.datasets import load_breast_cancer
-from sklearn.model_selection import train_test_split
+from sklearn.model_selection import (
+    train_test_split,
+)

from flaml import AutoML, tune
from flaml.automl.model import LGBMEstimator
@@ -420,6 +424,122 @@ class TestClassification(unittest.TestCase):
        print(automl_experiment.best_estimator)
@pytest.mark.parametrize(
    "estimator",
    [
        "catboost",
        "extra_tree",
        "histgb",
        "kneighbor",
        "lgbm",
        # "lrl1",
        "lrl2",
        "rf",
        "svc",
        "xgboost",
        "xgb_limitdepth",
    ],
)
def test_reproducibility_of_classification_models(estimator: str):
    """FLAML finds the best model for a given dataset, which it then provides to users.

    However, there are reported issues where FLAML was providing an incorrect model - see here:
    https://github.com/microsoft/FLAML/issues/1317
    In this test we take the best model which FLAML provided us, and then retrain and test it on the
    same folds, to verify that the result is reproducible.
    """
    automl = AutoML()
    automl_settings = {
        "max_iter": 5,
        "time_budget": -1,
        "task": "classification",
        "n_jobs": 1,
        "estimator_list": [estimator],
        "eval_method": "cv",
        "n_splits": 10,
        "metric": "f1",
        "keep_search_state": True,
        "skip_transform": True,
    }
    X, y = load_breast_cancer(return_X_y=True, as_frame=True)
    automl.fit(X_train=X, y_train=y, **automl_settings)
    best_model = automl.model
    assert best_model is not None
    config = best_model.get_params()
    val_loss_flaml = automl.best_result["val_loss"]

    # Take the best model, and see if we can reproduce the best result
    reproduced_val_loss, metric_for_logging, train_time, pred_time = automl._state.task.evaluate_model_CV(
        config=config,
        estimator=best_model,
        X_train_all=automl._state.X_train_all,
        y_train_all=automl._state.y_train_all,
        budget=None,
        kf=automl._state.kf,
        eval_metric="f1",
        best_val_loss=None,
        cv_score_agg_func=None,
        log_training_metric=False,
        fit_kwargs=None,
        free_mem_ratio=0,
    )
    assert pytest.approx(val_loss_flaml) == reproduced_val_loss
@pytest.mark.parametrize(
    "estimator",
    [
        "catboost",
        "extra_tree",
        "histgb",
        "kneighbor",
        "lgbm",
        # "lrl1",
        "lrl2",
        "svc",
        "rf",
        "xgboost",
        "xgb_limitdepth",
    ],
)
def test_reproducibility_of_underlying_classification_models(estimator: str):
    """FLAML finds the best model for a given dataset, which it then provides to users.

    However, there are reported issues where FLAML was providing an incorrect model - see here:
    https://github.com/microsoft/FLAML/issues/1317
    FLAML defines FLAMLised models, which wrap around the underlying (SKLearn/XGBoost/CatBoost) model.
    Ideally, FLAMLised models should perform identically to the underlying model, when fitted
    to the same data, with no budget. This verifies that this is the case for classification models.
    In this test we take the best model which FLAML provided us, extract the underlying model,
    before retraining and testing it on the same folds - to verify that the result is reproducible.
    """
    automl = AutoML()
    automl_settings = {
        "max_iter": 5,
        "time_budget": -1,
        "task": "classification",
        "n_jobs": 1,
        "estimator_list": [estimator],
        "eval_method": "cv",
        "n_splits": 10,
        "metric": "f1",
        "keep_search_state": True,
        "skip_transform": True,
    }
    X, y = load_breast_cancer(return_X_y=True, as_frame=True)
    automl.fit(X_train=X, y_train=y, **automl_settings)
    best_model = automl.model
    assert best_model is not None
    val_loss_flaml = automl.best_result["val_loss"]
    reproduced_val_loss_underlying_model = np.mean(
        evaluate_cv_folds_with_underlying_model(
            automl._state.X_train_all, automl._state.y_train_all, automl._state.kf, best_model.model, "classification"
        )
    )

    assert pytest.approx(val_loss_flaml) == reproduced_val_loss_underlying_model


if __name__ == "__main__":
    test = TestClassification()
    test.test_preprocess()
@@ -17,6 +17,8 @@ from flaml import AutoML
from flaml.automl.ml import sklearn_metric_loss_score
from flaml.tune.spark.utils import check_spark

+pytestmark = pytest.mark.spark
+
leaderboard = defaultdict(dict)

warnings.simplefilter(action="ignore")

@@ -477,7 +477,10 @@ def test_forecast_classification(budget=5):
def get_stalliion_data():
    from pytorch_forecasting.data.examples import get_stallion_data

-   data = get_stallion_data()
+   # data = get_stallion_data()
+   data = pd.read_parquet(
+       "https://raw.githubusercontent.com/sktime/pytorch-forecasting/refs/heads/main/examples/data/stallion.parquet"
+   )
    # add time index - For datasets with no missing values, FLAML will automate this process
    data["time_idx"] = data["date"].dt.year * 12 + data["date"].dt.month
    data["time_idx"] -= data["time_idx"].min()
test/automl/test_max_iter_1.py (new file, 51 lines)
@@ -0,0 +1,51 @@
import mlflow
import numpy as np
import pandas as pd

from flaml import AutoML


def test_max_iter_1():
    date_rng = pd.date_range(start="2024-01-01", periods=100, freq="H")
    X = pd.DataFrame({"ds": date_rng})
    y_train_24h = np.random.rand(len(X)) * 100

    # AutoML
    settings = {
        "max_iter": 1,
        "estimator_list": ["xgboost", "lgbm"],
        "starting_points": {"xgboost": {}, "lgbm": {}},
        "task": "ts_forecast",
        "log_file_name": "test_max_iter_1.log",
        "seed": 41,
        "mlflow_exp_name": "TestExp-max_iter-1",
        "use_spark": False,
        "n_concurrent_trials": 1,
        "verbose": 1,
        "featurization": "off",
        "metric": "rmse",
        "mlflow_logging": True,
    }

    automl = AutoML(**settings)

    with mlflow.start_run(run_name="AutoMLModel-XGBoost-and-LGBM-max_iter_1"):
        automl.fit(
            X_train=X,
            y_train=y_train_24h,
            period=24,
            X_val=X,
            y_val=y_train_24h,
            split_ratio=0,
            force_cancel=False,
        )

    assert automl.model is not None, "AutoML failed to return a model"
    assert automl.best_run_id is not None, "Best run ID should not be None with mlflow logging"

    print("Best model:", automl.model)
    print("Best run ID:", automl.best_run_id)


if __name__ == "__main__":
    test_max_iter_1()
@@ -10,6 +10,18 @@ from flaml import AutoML


class TestMLFlowLoggingParam:
+   def test_update_and_install_requirements(self):
+       import mlflow
+       from sklearn import tree
+
+       from flaml.fabric.mlflow import update_and_install_requirements
+
+       with mlflow.start_run(run_name="test") as run:
+           sk_model = tree.DecisionTreeClassifier()
+           mlflow.sklearn.log_model(sk_model, "model", registered_model_name="test")
+
+       update_and_install_requirements(run_id=run.info.run_id)
+
    def test_should_start_new_run_by_default(self, automl_settings):
        with mlflow.start_run() as parent_run:
            automl = AutoML()

@@ -143,4 +143,5 @@ def test_prep():


if __name__ == "__main__":
+   test_lrl2()
    test_prep()
@@ -187,7 +187,6 @@ class TestMultiClass(unittest.TestCase):
    def test_custom_metric(self):
        df, y = load_iris(return_X_y=True, as_frame=True)
        df["label"] = y
-       automl = AutoML()
        settings = {
            "dataframe": df,
            "label": "label",
@@ -204,7 +203,8 @@ class TestMultiClass(unittest.TestCase):
            "pred_time_limit": 1e-5,
            "ensemble": True,
        }
-       automl.fit(**settings)
+       automl = AutoML(**settings)  # test safe_json_dumps
+       automl.fit(dataframe=df, label="label")
        print(automl.classes_)
        print(automl.model)
        print(automl.config_history)
@@ -1,9 +1,12 @@
import unittest
+from test.conftest import evaluate_cv_folds_with_underlying_model

+import numpy as np
import pytest
import scipy.sparse
from sklearn.datasets import (
    fetch_california_housing,
    make_regression,
)

from flaml import AutoML
@@ -205,7 +208,6 @@ class TestRegression(unittest.TestCase):


def test_multioutput():
-   from sklearn.datasets import make_regression
    from sklearn.model_selection import train_test_split
    from sklearn.multioutput import MultiOutputRegressor, RegressorChain

@@ -230,5 +232,210 @@ def test_multioutput():
    print(model.predict(X_test))
@pytest.mark.parametrize(
    "estimator",
    [
        "catboost",
        "enet",
        "extra_tree",
        "histgb",
        "kneighbor",
        "lgbm",
        "rf",
        "xgboost",
        "xgb_limitdepth",
    ],
)
def test_reproducibility_of_regression_models(estimator: str):
    """FLAML finds the best model for a given dataset, which it then provides to users.

    However, there are reported issues where FLAML was providing an incorrect model - see here:
    https://github.com/microsoft/FLAML/issues/1317
    In this test we take the best regression model which FLAML provided us, and then retrain and test it on the
    same folds, to verify that the result is reproducible.
    """
    automl = AutoML()
    automl_settings = {
        "max_iter": 2,
        "time_budget": -1,
        "task": "regression",
        "n_jobs": 1,
        "estimator_list": [estimator],
        "eval_method": "cv",
        "n_splits": 3,
        "metric": "r2",
        "keep_search_state": True,
        "skip_transform": True,
        "retrain_full": True,
    }
    X, y = fetch_california_housing(return_X_y=True, as_frame=True)
    automl.fit(X_train=X, y_train=y, **automl_settings)
    best_model = automl.model
    assert best_model is not None
    config = best_model.get_params()
    val_loss_flaml = automl.best_result["val_loss"]

    # Take the best model, and see if we can reproduce the best result
    reproduced_val_loss, metric_for_logging, train_time, pred_time = automl._state.task.evaluate_model_CV(
        config=config,
        estimator=best_model,
        X_train_all=automl._state.X_train_all,
        y_train_all=automl._state.y_train_all,
        budget=None,
        kf=automl._state.kf,
        eval_metric="r2",
        best_val_loss=None,
        cv_score_agg_func=None,
        log_training_metric=False,
        fit_kwargs=None,
        free_mem_ratio=0,
    )
    assert pytest.approx(val_loss_flaml) == reproduced_val_loss
def test_reproducibility_of_catboost_regression_model():
    """FLAML finds the best model for a given dataset, which it then provides to users.

    However, there are reported issues around the catboost model - see here:
    https://github.com/microsoft/FLAML/issues/1317
    In this test we take the best catboost regression model which FLAML provided us, and then retrain and test it on the
    same folds, to verify that the result is reproducible.
    """
    automl = AutoML()
    automl_settings = {
        "time_budget": 7,
        "task": "regression",
        "n_jobs": 1,
        "estimator_list": ["catboost"],
        "eval_method": "cv",
        "n_splits": 10,
        "metric": "r2",
        "keep_search_state": True,
        "skip_transform": True,
        "retrain_full": True,
    }
    X, y = fetch_california_housing(return_X_y=True, as_frame=True)
    automl.fit(X_train=X, y_train=y, **automl_settings)
    best_model = automl.model
    assert best_model is not None
    config = best_model.get_params()
    val_loss_flaml = automl.best_result["val_loss"]

    # Take the best model, and see if we can reproduce the best result
    reproduced_val_loss, metric_for_logging, train_time, pred_time = automl._state.task.evaluate_model_CV(
        config=config,
        estimator=best_model,
        X_train_all=automl._state.X_train_all,
        y_train_all=automl._state.y_train_all,
        budget=None,
        kf=automl._state.kf,
        eval_metric="r2",
        best_val_loss=None,
        cv_score_agg_func=None,
        log_training_metric=False,
        fit_kwargs=None,
        free_mem_ratio=0,
    )
    assert pytest.approx(val_loss_flaml) == reproduced_val_loss
def test_reproducibility_of_lgbm_regression_model():
    """FLAML finds the best model for a given dataset, which it then provides to users.

    However, there are reported issues around LGBMs - see here:
    https://github.com/microsoft/FLAML/issues/1368
    In this test we take the best LGBM regression model which FLAML provided us, and then retrain and test it on the
    same folds, to verify that the result is reproducible.
    """
    automl = AutoML()
    automl_settings = {
        "time_budget": 3,
        "task": "regression",
        "n_jobs": 1,
        "estimator_list": ["lgbm"],
        "eval_method": "cv",
        "n_splits": 9,
        "metric": "r2",
        "keep_search_state": True,
        "skip_transform": True,
        "retrain_full": True,
    }
    X, y = fetch_california_housing(return_X_y=True, as_frame=True)
    automl.fit(X_train=X, y_train=y, **automl_settings)
    best_model = automl.model
    assert best_model is not None
    config = best_model.get_params()
    val_loss_flaml = automl.best_result["val_loss"]

    # Take the best model, and see if we can reproduce the best result
    reproduced_val_loss, metric_for_logging, train_time, pred_time = automl._state.task.evaluate_model_CV(
        config=config,
        estimator=best_model,
        X_train_all=automl._state.X_train_all,
        y_train_all=automl._state.y_train_all,
        budget=None,
        kf=automl._state.kf,
        eval_metric="r2",
        best_val_loss=None,
        cv_score_agg_func=None,
        log_training_metric=False,
        fit_kwargs=None,
        free_mem_ratio=0,
    )
    assert pytest.approx(val_loss_flaml) == reproduced_val_loss or val_loss_flaml > reproduced_val_loss
@pytest.mark.parametrize(
    "estimator",
    [
        "catboost",
        "enet",
        "extra_tree",
        "histgb",
        "kneighbor",
        "lgbm",
        "rf",
        "xgboost",
        "xgb_limitdepth",
    ],
)
def test_reproducibility_of_underlying_regression_models(estimator: str):
    """FLAML finds the best model for a given dataset, which it then provides to users.

    However, there are reported issues where FLAML was providing an incorrect model - see here:
    https://github.com/microsoft/FLAML/issues/1317
    FLAML defines FLAMLised models, which wrap around the underlying (SKLearn/XGBoost/CatBoost) model.
    Ideally, FLAMLised models should perform identically to the underlying model, when fitted
    to the same data, with no budget. This verifies that this is the case for regression models.
    In this test we take the best model which FLAML provided us, extract the underlying model,
    before retraining and testing it on the same folds - to verify that the result is reproducible.
    """
    automl = AutoML()
    automl_settings = {
        "max_iter": 5,
        "time_budget": -1,
        "task": "regression",
        "n_jobs": 1,
        "estimator_list": [estimator],
        "eval_method": "cv",
        "n_splits": 10,
        "metric": "r2",
        "keep_search_state": True,
        "skip_transform": True,
        "retrain_full": False,
    }
    X, y = fetch_california_housing(return_X_y=True, as_frame=True)
    automl.fit(X_train=X, y_train=y, **automl_settings)
    best_model = automl.model
    assert best_model is not None
    val_loss_flaml = automl.best_result["val_loss"]
    reproduced_val_loss_underlying_model = np.mean(
        evaluate_cv_folds_with_underlying_model(
            automl._state.X_train_all, automl._state.y_train_all, automl._state.kf, best_model.model, "regression"
        )
    )
    assert pytest.approx(val_loss_flaml) == reproduced_val_loss_underlying_model


if __name__ == "__main__":
    unittest.main()
@@ -1,4 +1,6 @@
-from sklearn.datasets import fetch_openml
+import numpy as np
+import pandas as pd
+from sklearn.datasets import fetch_openml, load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GroupKFold, KFold, train_test_split

@@ -48,7 +50,7 @@ def test_time():
    _test(split_type="time")


-def test_groups():
+def test_groups_for_classification_task():
    from sklearn.externals._arff import ArffException

    try:
@@ -58,8 +60,6 @@ def test_groups():

    X, y = load_wine(return_X_y=True)

-   import numpy as np
-
    automl = AutoML()
    automl_settings = {
        "time_budget": 2,
@@ -68,7 +68,7 @@ def test_groups():
        "model_history": True,
        "eval_method": "cv",
        "groups": np.random.randint(low=0, high=10, size=len(y)),
-       "estimator_list": ["lgbm", "rf", "xgboost", "kneighbor"],
+       "estimator_list": ["catboost", "lgbm", "rf", "xgboost", "kneighbor"],
        "learner_selector": "roundrobin",
    }
    automl.fit(X, y, **automl_settings)
@@ -88,6 +88,72 @@ def test_groups():
    automl.fit(X, y, **automl_settings)
def test_groups_for_regression_task():
    """Append nonsensical groups to iris dataset and use it to test that GroupKFold works for regression tasks"""
    iris_dict_data = load_iris(as_frame=True)  # numpy arrays
    iris_data = iris_dict_data["frame"]  # pandas dataframe data + target

    rng = np.random.default_rng(42)
    iris_data["cluster"] = rng.integers(
        low=0, high=5, size=iris_data.shape[0]
    )  # np.random.randint(0, 5, iris_data.shape[0])

    automl = AutoML()
    X = iris_data[["sepal length (cm)", "sepal width (cm)", "petal length (cm)"]].to_numpy()
    y = iris_data["petal width (cm)"]
    X_train, X_test, y_train, y_test, groups_train, groups_test = train_test_split(
        X, y, iris_data["cluster"], random_state=42
    )
    automl_settings = {
        "max_iter": 5,
        "time_budget": -1,
        "metric": "r2",
        "task": "regression",
        "estimator_list": ["lgbm", "rf", "xgboost", "kneighbor"],
        "eval_method": "cv",
        "split_type": "uniform",
        "groups": groups_train,
    }
    automl.fit(X_train, y_train, **automl_settings)


def test_groups_with_sample_weights():
    """Verifies that sample weights can be used with group splits i.e. that https://github.com/microsoft/FLAML/issues/1396 remains fixed"""
    iris_dict_data = load_iris(as_frame=True)  # numpy arrays
    iris_data = iris_dict_data["frame"]  # pandas dataframe data + target
    iris_data["cluster"] = np.random.randint(0, 5, iris_data.shape[0])
    automl = AutoML()

    X = iris_data[["sepal length (cm)", "sepal width (cm)", "petal length (cm)"]].to_numpy()
    y = iris_data["petal width (cm)"]
    sample_weight = pd.Series(np.random.rand(X.shape[0]))
    (
        X_train,
        X_test,
        y_train,
        y_test,
        groups_train,
        groups_test,
        sample_weight_train,
        sample_weight_test,
    ) = train_test_split(X, y, iris_data["cluster"], sample_weight, random_state=42)
    automl_settings = {
        "max_iter": 5,
        "time_budget": -1,
        "metric": "r2",
        "task": "regression",
        "log_file_name": "error.log",
        "log_type": "all",
        "estimator_list": ["lgbm"],
        "eval_method": "cv",
        "split_type": "group",
        "groups": groups_train,
        "sample_weight": sample_weight_train,
    }
    automl.fit(X_train, y_train, **automl_settings)
    assert automl.model is not None
def test_stratified_groupkfold():
    from minio.error import ServerError
    from sklearn.model_selection import StratifiedGroupKFold

@@ -108,6 +174,7 @@ def test_stratified_groupkfold():
        "split_type": splitter,
        "groups": X_train["Airline"],
        "estimator_list": [
+           "catboost",
            "lgbm",
            "rf",
            "xgboost",
@@ -203,4 +270,4 @@ def test_object():


if __name__ == "__main__":
-   test_groups()
+   test_groups_for_classification_task()
test/conftest.py (new file, 42 lines)
@@ -0,0 +1,42 @@
from typing import Any, List

import numpy as np
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.metrics import f1_score, r2_score


def evaluate_cv_folds_with_underlying_model(X_train_all, y_train_all, kf, model: Any, task: str) -> List[float]:
    """Mimic the FLAML CV process to calculate the metrics across each fold.

    :param X_train_all: X training data
    :param y_train_all: y training data
    :param kf: The splitter object to use to generate the folds
    :param model: The estimator to fit to the data during the CV process
    :param task: classification or regression
    :return: A list containing the loss metric for each fold
    """
    rng = np.random.RandomState(2020)
    all_fold_metrics: List[float] = []
    for train_index, val_index in kf.split(X_train_all, y_train_all):
        X_train_split, y_train_split = X_train_all, y_train_all
        train_index = rng.permutation(train_index)
        X_train = X_train_split.iloc[train_index]
        X_val = X_train_split.iloc[val_index]
        y_train, y_val = y_train_split[train_index], y_train_split[val_index]
        model_type = type(model)
        if model_type is not CatBoostClassifier and model_type is not CatBoostRegressor:
            model.fit(X_train, y_train)
        else:
            # CatBoost mirrors FLAML's internal split: hold out the tail for eval
            use_best_model = True
            n = max(int(len(y_train) * 0.9), len(y_train) - 1000) if use_best_model else len(y_train)
            X_tr, y_tr = X_train[:n], y_train[:n]
            eval_set = Pool(data=X_train[n:], label=y_train[n:], cat_features=[]) if use_best_model else None
            model.fit(X_tr, y_tr, eval_set=eval_set, use_best_model=True)
        y_pred_classes = model.predict(X_val)
        if task == "classification":
            reproduced_metric = 1 - f1_score(y_val, y_pred_classes)
        else:
            reproduced_metric = 1 - r2_score(y_val, y_pred_classes)
        all_fold_metrics.append(reproduced_metric)
    return all_fold_metrics
@@ -24,6 +24,8 @@ model_path_list = [
if sys.platform.startswith("darwin") and sys.version_info[0] == 3 and sys.version_info[1] == 11:
    pytest.skip("skipping Python 3.11 on MacOS", allow_module_level=True)

+pytestmark = pytest.mark.spark  # set to spark as parallel testing raised RuntimeError
+

def test_switch_1_1():
    data_idx, model_path_idx = 0, 0

@@ -5,6 +5,8 @@ import sys
import pytest
from utils import get_automl_settings, get_toy_data_seqclassification

+pytestmark = pytest.mark.spark  # set to spark as parallel testing raised MlflowException of changing parameter
+

@pytest.mark.skipif(sys.platform in ["darwin", "win32"], reason="do not run on mac os or windows")
def test_cv():

@@ -10,6 +10,10 @@ from flaml.default import portfolio
if sys.platform.startswith("darwin") and sys.version_info[0] == 3 and sys.version_info[1] == 11:
    pytest.skip("skipping Python 3.11 on MacOS", allow_module_level=True)

+pytestmark = (
+    pytest.mark.spark
+)  # set to spark as parallel testing raised ValueError: Feature NonExisting not implemented.
+

def pop_args(fit_kwargs):
    fit_kwargs.pop("max_iter", None)

@@ -3,11 +3,13 @@ import sys
import warnings

+import mlflow
import numpy as np
import pytest
import sklearn.datasets as skds
+from packaging.version import Version

from flaml import AutoML
from flaml.automl.data import auto_convert_dtypes_pandas, auto_convert_dtypes_spark, get_random_dataframe
from flaml.tune.spark.utils import check_spark

warnings.simplefilter(action="ignore")
@@ -58,7 +60,7 @@ if sys.version_info >= (3, 11):
else:
    skip_py311 = False

-pytestmark = pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
+pytestmark = [pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests."), pytest.mark.spark]


def _test_spark_synapseml_lightgbm(spark=None, task="classification"):
@@ -296,11 +298,88 @@ def _test_spark_large_df():
    print("time cost in minutes: ", (end_time - start_time) / 60)
def test_get_random_dataframe():
    # Test with non-default parameters
    df = get_random_dataframe(n_rows=50, ratio_none=0.2, seed=123)
    assert df.shape == (50, 14)  # 50 rows requested; 14 columns by default

    # Test column types
    assert "timestamp" in df.columns and np.issubdtype(df["timestamp"].dtype, np.datetime64)
    assert "id" in df.columns and np.issubdtype(df["id"].dtype, np.integer)
    assert "score" in df.columns and np.issubdtype(df["score"].dtype, np.floating)
    assert "category" in df.columns and df["category"].dtype.name == "category"


def test_auto_convert_dtypes_pandas():
    # Create a test DataFrame with various types
    import pandas as pd

    test_df = pd.DataFrame(
        {
            "int_col": ["1", "2", "3", "4", "5", "6", "6"],
            "float_col": ["1.1", "2.2", "3.3", "NULL", "5.5", "6.6", "6.6"],
            "date_col": ["2021-01-01", "2021-02-01", "NA", "2021-04-01", "2021-05-01", "2021-06-01", "2021-06-01"],
            "cat_col": ["A", "B", "A", "A", "B", "A", "B"],
            "string_col": ["text1", "text2", "text3", "text4", "text5", "text6", "text7"],
        }
    )

    # Convert dtypes
    converted_df, schema = auto_convert_dtypes_pandas(test_df)

    # Check conversions
    assert schema["int_col"] == "int"
    assert schema["float_col"] == "double"
    assert schema["date_col"] == "timestamp"
    assert schema["cat_col"] == "category"
    assert schema["string_col"] == "string"


def test_auto_convert_dtypes_spark():
    """Test auto_convert_dtypes_spark function with various data types."""
    import pandas as pd

    # Create a test DataFrame with various types
    test_pdf = pd.DataFrame(
        {
            "int_col": ["1", "2", "3", "4", "NA"],
            "float_col": ["1.1", "2.2", "3.3", "NULL", "5.5"],
            "date_col": ["2021-01-01", "2021-02-01", "NA", "2021-04-01", "2021-05-01"],
            "cat_col": ["A", "B", "A", "C", "B"],
            "string_col": ["text1", "text2", "text3", "text4", "text5"],
        }
    )

    # Convert pandas DataFrame to Spark DataFrame
    test_df = spark.createDataFrame(test_pdf)

    # Convert dtypes
    converted_df, schema = auto_convert_dtypes_spark(test_df)

    # Check conversions
    assert schema["int_col"] == "int"
    assert schema["float_col"] == "double"
    assert schema["date_col"] == "timestamp"
    assert schema["cat_col"] == "string"  # Conceptual category in schema
    assert schema["string_col"] == "string"

    # Verify the actual data types from the Spark DataFrame
    spark_dtypes = dict(converted_df.dtypes)
    assert spark_dtypes["int_col"] == "int"
    assert spark_dtypes["float_col"] == "double"
    assert spark_dtypes["date_col"] == "timestamp"
    assert spark_dtypes["cat_col"] == "string"  # In Spark, categories are still strings
    assert spark_dtypes["string_col"] == "string"


if __name__ == "__main__":
    test_spark_synapseml_classification()
    test_spark_synapseml_regression()
    test_spark_synapseml_rank()
    test_spark_input_df()
    test_get_random_dataframe()
    test_auto_convert_dtypes_pandas()
    test_auto_convert_dtypes_spark()

    # import cProfile
    # import pstats
@@ -25,7 +25,7 @@ os.environ["FLAML_MAX_CONCURRENT"] = "2"
|
||||
spark_available, _ = check_spark()
|
||||
skip_spark = not spark_available
|
||||
|
||||
pytestmark = pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
|
||||
pytestmark = [pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests."), pytest.mark.spark]
|
||||
|
||||
|
||||
def test_parallel_xgboost(hpo_method=None, data_size=1000):
|
||||
|
||||
@@ -1,6 +1,7 @@
 import os
 import unittest

+import pytest
 from sklearn.datasets import load_wine

 from flaml import AutoML
@@ -24,6 +25,8 @@ if os.path.exists(os.path.join(os.getcwd(), "test", "spark", "custom_mylearner.p
 else:
     skip_my_learner = True

+pytestmark = pytest.mark.spark
+

 class TestEnsemble(unittest.TestCase):
     def setUp(self) -> None:
@@ -9,7 +9,7 @@ from flaml.tune.spark.utils import check_spark
 spark_available, _ = check_spark()
 skip_spark = not spark_available

-pytestmark = pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
+pytestmark = [pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests."), pytest.mark.spark]

 os.environ["FLAML_MAX_CONCURRENT"] = "2"
@@ -21,6 +21,7 @@ try:
     from pyspark.ml.feature import VectorAssembler
 except ImportError:
     pass
+pytestmark = pytest.mark.spark
 warnings.filterwarnings("ignore")

 skip_spark = importlib.util.find_spec("pyspark") is None
@@ -2,6 +2,7 @@ import os
 import unittest

 import numpy as np
+import pytest
 import scipy.sparse
 from sklearn.datasets import load_iris, load_wine

@@ -12,6 +13,7 @@ from flaml.tune.spark.utils import check_spark

 spark_available, _ = check_spark()
 skip_spark = not spark_available
+pytestmark = pytest.mark.spark

 os.environ["FLAML_MAX_CONCURRENT"] = "2"
@@ -9,7 +9,7 @@ from flaml.tune.spark.utils import check_spark
 spark_available, _ = check_spark()
 skip_spark = not spark_available

-pytestmark = pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
+pytestmark = [pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests."), pytest.mark.spark]

 here = os.path.abspath(os.path.dirname(__file__))
 os.environ["FLAML_MAX_CONCURRENT"] = "2"
@@ -25,7 +25,7 @@ try:
 except ImportError:
     skip_spark = True

-pytestmark = pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
+pytestmark = [pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests."), pytest.mark.spark]


 def test_overtime():
@@ -11,7 +11,7 @@ from flaml.tune.spark.utils import check_spark
 spark_available, _ = check_spark()
 skip_spark = not spark_available

-pytestmark = pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
+pytestmark = [pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests."), pytest.mark.spark]

 os.environ["FLAML_MAX_CONCURRENT"] = "2"
@@ -14,7 +14,7 @@ from flaml.tune.spark.utils import check_spark
 spark_available, _ = check_spark()
 skip_spark = not spark_available

-pytestmark = pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
+pytestmark = [pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests."), pytest.mark.spark]

 os.environ["FLAML_MAX_CONCURRENT"] = "2"
 X, y = load_breast_cancer(return_X_y=True)
@@ -36,7 +36,7 @@ except ImportError:
     print("Spark is not installed. Skip all spark tests.")
     skip_spark = True

-pytestmark = pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")
+pytestmark = [pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests."), pytest.mark.spark]


 def test_with_parameters_spark():
@@ -59,6 +59,17 @@ def _test_hf_data():
     except requests.exceptions.ConnectionError:
         return

+    # Tests will only run if there is a GPU available
+    try:
+        import ray
+
+        pg = ray.util.placement_group([{"CPU": 1, "GPU": 1}])
+
+        if not pg.wait(timeout_seconds=10):  # Wait 10 seconds for resources
+            raise RuntimeError("No available node types can fulfill resource request!")
+    except RuntimeError:
+        return
+
     custom_sent_keys = ["sentence1", "sentence2"]
     label_key = "label"
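For background on the probe above: a Ray placement group reserves the requested bundles until it is removed, so `pg.wait` doubles as a capacity check. A minimal standalone sketch (assuming Ray is installed; `gpu_available` is an illustrative name) that also releases the reservation:

```python
import ray

ray.init(ignore_reinit_error=True)

# Reserve one CPU + one GPU; wait() returns False if no node can host the bundle.
pg = ray.util.placement_group([{"CPU": 1, "GPU": 1}])
try:
    gpu_available = pg.wait(timeout_seconds=10)
finally:
    # Release the reservation either way so later tasks can use the resources.
    ray.util.remove_placement_group(pg)

print(f"GPU available: {gpu_available}")
```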
@@ -32,6 +32,8 @@ print(automl.predict_proba(X_train))
 print(automl.model.estimator)
 ```

+**Note**: You can access the best model's estimator using `automl.model.estimator`.
+
 #### Sample of output

 ```
@@ -47,6 +47,8 @@ if os.path.exists("data/output/"):
     shutil.rmtree("data/output/")
 ```

+**Note**: You can access the best model's estimator using `automl.model.estimator`.
+
 #### Sample output

 ```
@@ -28,6 +28,8 @@ automl.fit(
 )
 ```

+**Note**: You can access the best model's estimator using `automl.model.estimator`.
+
 #### Sample output

 ```
@@ -32,6 +32,8 @@ print(automl.predict(X_train))
 print(automl.model.estimator)
 ```

+**Note**: You can access the best model's estimator using `automl.model.estimator`.
+
 #### Sample output

 ```
@@ -29,6 +29,8 @@ automl.fit(
 print(automl.predict(X_train[84:]))
 ```

+**Note**: You can access the best model's estimator using `automl.model.estimator`.
+
 #### Sample output

 ```
@@ -29,6 +29,8 @@ settings = {
 automl.fit(X_train=X_train, y_train=y_train, **settings)
 ```

+**Note**: You can access the best model's estimator using `automl.model.estimator`.
+
 #### Sample output

 ```
@@ -31,6 +31,8 @@ settings = {
 automl.fit(X_train=X_train, y_train=y_train, **settings)
 ```

+**Note**: You can access the best model's estimator using `automl.model.estimator`.
+
 #### Sample output

 ```
@@ -80,11 +80,38 @@ from flaml import AutoML
 from sklearn.datasets import load_iris

 X, y = load_iris(return_X_y=True)

-automl = AutoML(settings={"time_budget": 3})
+settings = {"time_budget": 3}
+automl = AutoML(**settings)
 automl.fit(X, y)

 print(f"{automl.best_estimator=}")
 print(f"{automl.best_config=}")
 print(f"params for best estimator: {automl.model.config2params(automl.best_config)}")
 ```

+If the `automl` instance is not accessible and you only have the `best_config`, you can convert it with the code below:
+
+```python
+from flaml.automl.task.factory import task_factory
+
+task = "classification"
+best_estimator = "rf"
+best_config = {
+    "n_estimators": 15,
+    "max_features": 0.35807183923834934,
+    "max_leaves": 12,
+    "criterion": "gini",
+}
+
+model_class = task_factory(task).estimator_class_from_str(best_estimator)(task=task)
+best_params = model_class.config2params(best_config)
+```
+
+Then you can use the parameters to train the sklearn estimator directly:
+
+```python
+from sklearn.ensemble import RandomForestClassifier
+
+model = RandomForestClassifier(**best_params)
+model.fit(X, y)
+```
@@ -393,7 +393,7 @@ For holdout, you can also set:

 - `split_ratio`: the fraction for validation data, 0.1 by default.
 - `X_val`, `y_val`: a separate validation dataset. When they are passed, the validation metrics will be computed against this given validation dataset. If they are not passed, then a validation dataset will be split from the training data and held out from training during the model search. After the model search, flaml will retrain the model with the best configuration on the full training data.
-You can set`retrain_full` to be `False` to skip the final retraining or "budget" to ask flaml to do its best to retrain within the time budget.
+You can set `retrain_full` to be `False` to skip the final retraining, or "budget" to ask flaml to do its best to retrain within the time budget. When `retrain_full` is set to `True`, the user-provided validation data is not used in the final retraining of the model.

 For cross validation, you can also set `n_splits`, the number of folds. By default it is 5.
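To make these options concrete, here is a minimal sketch of passing a user-provided validation set and `retrain_full` to `AutoML.fit` (dataset, split, and budget chosen arbitrarily):

```python
from flaml import AutoML
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1)

automl = AutoML()
automl.fit(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,  # validation metrics are computed on this set during search
    y_val=y_val,
    eval_method="holdout",
    retrain_full="budget",  # retrain on the full data only within the remaining budget
    task="classification",
    time_budget=10,
)
```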
@@ -2,9 +2,9 @@
   "name": "website",
   "version": "0.0.0",
   "private": true,
-  "resolutions" :{
-    "nth-check":"2.0.1",
-    "trim":"0.0.3",
+  "resolutions": {
+    "nth-check": "2.0.1",
+    "trim": "0.0.3",
     "got": "11.8.5",
     "node-forge": "1.3.0",
     "minimatch": "3.0.5",
@@ -12,7 +12,7 @@
     "eta": "2.0.0",
     "@sideway/formula": "3.0.1",
     "http-cache-semantics": "4.1.1"
-  },
+  },
   "scripts": {
     "docusaurus": "docusaurus",
     "start": "docusaurus start",
@@ -33,13 +33,13 @@
     "clsx": "^1.1.1",
     "file-loader": "^6.2.0",
     "hast-util-is-element": "1.1.0",
+    "minimatch": "3.0.5",
     "react": "^17.0.1",
     "react-dom": "^17.0.1",
     "rehype-katex": "4",
     "remark-math": "3",
-    "trim": "^0.0.3",
-    "url-loader": "^4.1.1",
-    "minimatch": "3.0.5"
+    "url-loader": "^4.1.1"
   },
   "browserslist": {
     "production": [
@@ -153,6 +153,15 @@
     "@babel/highlight" "^7.23.4"
     chalk "^2.4.2"

+"@babel/code-frame@^7.26.2":
+  version "7.26.2"
+  resolved "https://registry.yarnpkg.com/@babel/code-frame/-/code-frame-7.26.2.tgz#4b5fab97d33338eff916235055f0ebc21e573a85"
+  integrity sha512-RJlIHRueQgwWitWgF8OdFYGZX328Ax5BCemNGlqHfplnRT9ESi8JkFlvaVYbS+UubVY6dpv87Fs2u5M29iNFVQ==
+  dependencies:
+    "@babel/helper-validator-identifier" "^7.25.9"
+    js-tokens "^4.0.0"
+    picocolors "^1.0.0"
+
 "@babel/compat-data@^7.17.7", "@babel/compat-data@^7.20.0", "@babel/compat-data@^7.20.1":
   version "7.20.1"
   resolved "https://registry.npmmirror.com/@babel/compat-data/-/compat-data-7.20.1.tgz#f2e6ef7790d8c8dbf03d379502dcc246dcce0b30"
@@ -429,6 +438,11 @@
   resolved "https://registry.yarnpkg.com/@babel/helper-string-parser/-/helper-string-parser-7.23.4.tgz#9478c707febcbbe1ddb38a3d91a2e054ae622d83"
   integrity sha512-803gmbQdqwdf4olxrX4AJyFBV/RTr3rSmOj0rKwesmzlfhYNDEs+/iOcznzpNWlJlIlTJC2QfPFcHB6DlzdVLQ==

+"@babel/helper-string-parser@^7.25.9":
+  version "7.25.9"
+  resolved "https://registry.yarnpkg.com/@babel/helper-string-parser/-/helper-string-parser-7.25.9.tgz#1aabb72ee72ed35789b4bbcad3ca2862ce614e8c"
+  integrity sha512-4A/SCr/2KLd5jrtOMFzaKjVtAei3+2r/NChoBNoZ3EyP/+GlhoaEGoWOZUmFmoITP7zOJyHIMm+DYRd8o3PvHA==
+
 "@babel/helper-validator-identifier@^7.18.6", "@babel/helper-validator-identifier@^7.19.1":
   version "7.19.1"
   resolved "https://registry.npmmirror.com/@babel/helper-validator-identifier/-/helper-validator-identifier-7.19.1.tgz#7eea834cf32901ffdc1a7ee555e2f9c27e249ca2"
@@ -439,6 +453,11 @@
   resolved "https://registry.yarnpkg.com/@babel/helper-validator-identifier/-/helper-validator-identifier-7.22.20.tgz#c4ae002c61d2879e724581d96665583dbc1dc0e0"
   integrity sha512-Y4OZ+ytlatR8AI+8KZfKuL5urKp7qey08ha31L8b3BwewJAoJamTzyvxPR/5D+KkdJCGPq/+8TukHBlY10FX9A==

+"@babel/helper-validator-identifier@^7.25.9":
+  version "7.25.9"
+  resolved "https://registry.yarnpkg.com/@babel/helper-validator-identifier/-/helper-validator-identifier-7.25.9.tgz#24b64e2c3ec7cd3b3c547729b8d16871f22cbdc7"
+  integrity sha512-Ed61U6XJc3CVRfkERJWDz4dJwKe7iLmmJsbOGu9wSloNSFttHV0I8g6UAgb7qnK5ly5bGLPd4oXZlxCdANBOWQ==
+
 "@babel/helper-validator-option@^7.18.6":
   version "7.18.6"
   resolved "https://registry.npmmirror.com/@babel/helper-validator-option/-/helper-validator-option-7.18.6.tgz#bf0d2b5a509b1f336099e4ff36e1a63aa5db4db8"
@@ -455,13 +474,12 @@
     "@babel/types" "^7.19.0"

 "@babel/helpers@^7.12.5", "@babel/helpers@^7.20.1":
-  version "7.20.1"
-  resolved "https://registry.npmmirror.com/@babel/helpers/-/helpers-7.20.1.tgz#2ab7a0fcb0a03b5bf76629196ed63c2d7311f4c9"
-  integrity sha512-J77mUVaDTUJFZ5BpP6mMn6OIl3rEWymk2ZxDBQJUG3P+PbmyMcF3bYWvz0ma69Af1oobDqT/iAsvzhB58xhQUg==
+  version "7.26.10"
+  resolved "https://registry.yarnpkg.com/@babel/helpers/-/helpers-7.26.10.tgz#6baea3cd62ec2d0c1068778d63cb1314f6637384"
+  integrity sha512-UPYc3SauzZ3JGgj87GgZ89JVdC5dj0AoetR5Bw6wj4niittNyFh6+eOGonYvJ1ao6B8lEa3Q3klS7ADZ53bc5g==
   dependencies:
-    "@babel/template" "^7.18.10"
-    "@babel/traverse" "^7.20.1"
-    "@babel/types" "^7.20.0"
+    "@babel/template" "^7.26.9"
+    "@babel/types" "^7.26.10"

 "@babel/highlight@^7.18.6":
   version "7.18.6"
@@ -491,6 +509,13 @@
   resolved "https://registry.yarnpkg.com/@babel/parser/-/parser-7.23.6.tgz#ba1c9e512bda72a47e285ae42aff9d2a635a9e3b"
   integrity sha512-Z2uID7YJ7oNvAI20O9X0bblw7Qqs8Q2hFy0R9tAfnfLkp5MW0UH9eUvnDSnFwKZ0AvgS1ucqR4KzvVHgnke1VQ==

+"@babel/parser@^7.26.9":
+  version "7.26.10"
+  resolved "https://registry.yarnpkg.com/@babel/parser/-/parser-7.26.10.tgz#e9bdb82f14b97df6569b0b038edd436839c57749"
+  integrity sha512-6aQR2zGE/QFi8JpDLjUZEPYOs7+mhKXm86VaKFiLP35JQwQb6bwUE+XbvkH0EptsYhbNBSUGaUBLKqxH1xSgsA==
+  dependencies:
+    "@babel/types" "^7.26.10"
+
 "@babel/plugin-bugfix-safari-id-destructuring-collision-in-function-expression@^7.18.6":
   version "7.18.6"
   resolved "https://registry.npmmirror.com/@babel/plugin-bugfix-safari-id-destructuring-collision-in-function-expression/-/plugin-bugfix-safari-id-destructuring-collision-in-function-expression-7.18.6.tgz#da5b8f9a580acdfbe53494dba45ea389fb09a4d2"
@@ -1196,19 +1221,19 @@
     "@babel/plugin-transform-typescript" "^7.18.6"

 "@babel/runtime-corejs3@^7.15.4":
-  version "7.20.1"
-  resolved "https://registry.npmmirror.com/@babel/runtime-corejs3/-/runtime-corejs3-7.20.1.tgz#d0775a49bb5fba77e42cbb7276c9955c7b05af8d"
-  integrity sha512-CGulbEDcg/ND1Im7fUNRZdGXmX2MTWVVZacQi/6DiKE5HNwZ3aVTm5PV4lO8HHz0B2h8WQyvKKjbX5XgTtydsg==
+  version "7.26.10"
+  resolved "https://registry.yarnpkg.com/@babel/runtime-corejs3/-/runtime-corejs3-7.26.10.tgz#5a3185ca2813f8de8ae68622572086edf5cf51f2"
+  integrity sha512-uITFQYO68pMEYR46AHgQoyBg7KPPJDAbGn4jUTIRgCFJIp88MIBUianVOplhZDEec07bp9zIyr4Kp0FCyQzmWg==
   dependencies:
-    core-js-pure "^3.25.1"
-    regenerator-runtime "^0.13.10"
+    core-js-pure "^3.30.2"
+    regenerator-runtime "^0.14.0"

 "@babel/runtime@^7.1.2", "@babel/runtime@^7.10.2", "@babel/runtime@^7.10.3", "@babel/runtime@^7.12.13", "@babel/runtime@^7.15.4", "@babel/runtime@^7.8.4":
-  version "7.20.1"
-  resolved "https://registry.npmmirror.com/@babel/runtime/-/runtime-7.20.1.tgz#1148bb33ab252b165a06698fde7576092a78b4a9"
-  integrity sha512-mrzLkl6U9YLF8qpqI7TB82PESyEGjm/0Ly91jG575eVxMMlb8fYfOXFZIJ8XfLrJZQbm7dlKry2bJmXBUEkdFg==
+  version "7.26.10"
+  resolved "https://registry.yarnpkg.com/@babel/runtime/-/runtime-7.26.10.tgz#a07b4d8fa27af131a633d7b3524db803eb4764c2"
+  integrity sha512-2WJMeRQPHKSPemqk/awGrAiuFfzBmOIPXKizAsVhWH9YJqLZ0H+HS4c8loHGgW6utJ3E/ejXQUsiGaQy2NZ9Fw==
   dependencies:
-    regenerator-runtime "^0.13.10"
+    regenerator-runtime "^0.14.0"

 "@babel/template@^7.12.7", "@babel/template@^7.18.10":
   version "7.18.10"
@@ -1228,6 +1253,15 @@
     "@babel/parser" "^7.22.15"
     "@babel/types" "^7.22.15"

+"@babel/template@^7.26.9":
+  version "7.26.9"
+  resolved "https://registry.yarnpkg.com/@babel/template/-/template-7.26.9.tgz#4577ad3ddf43d194528cff4e1fa6b232fa609bb2"
+  integrity sha512-qyRplbeIpNZhmzOysF/wFMuP9sctmh2cFzRAZOn1YapxBsE1i9bJIY586R/WBLfLcmcBlM8ROBiQURnnNy+zfA==
+  dependencies:
+    "@babel/code-frame" "^7.26.2"
+    "@babel/parser" "^7.26.9"
+    "@babel/types" "^7.26.9"
+
 "@babel/traverse@^7.12.13", "@babel/traverse@^7.12.9", "@babel/traverse@^7.19.0", "@babel/traverse@^7.19.1", "@babel/traverse@^7.20.1":
   version "7.23.6"
   resolved "https://registry.yarnpkg.com/@babel/traverse/-/traverse-7.23.6.tgz#b53526a2367a0dd6edc423637f3d2d0f2521abc5"
@@ -1262,6 +1296,14 @@
     "@babel/helper-validator-identifier" "^7.22.20"
     to-fast-properties "^2.0.0"

+"@babel/types@^7.26.10", "@babel/types@^7.26.9":
+  version "7.26.10"
+  resolved "https://registry.yarnpkg.com/@babel/types/-/types-7.26.10.tgz#396382f6335bd4feb65741eacfc808218f859259"
+  integrity sha512-emqcG3vHrpxUKTrxcblR36dcrcoRDvKmnL/dCL6ZsHaShW80qxCAcNhzQZrpeM765VzEos+xOi4s+r4IXzTwdQ==
+  dependencies:
+    "@babel/helper-string-parser" "^7.25.9"
+    "@babel/helper-validator-identifier" "^7.25.9"
+
 "@docsearch/css@3.3.0":
   version "3.3.0"
   resolved "https://registry.npmmirror.com/@docsearch/css/-/css-3.3.0.tgz#d698e48302d12240d7c2f7452ccb2d2239a8cd80"
@@ -2863,9 +2905,9 @@ boxen@^5.0.0, boxen@^5.0.1:
     wrap-ansi "^7.0.0"

 brace-expansion@^1.1.7:
-  version "1.1.11"
-  resolved "https://registry.npmmirror.com/brace-expansion/-/brace-expansion-1.1.11.tgz#3c7fcbf529d87226f3d2f52b966ff5271eb441dd"
-  integrity sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==
+  version "1.1.12"
+  resolved "https://registry.yarnpkg.com/brace-expansion/-/brace-expansion-1.1.12.tgz#ab9b454466e5a8cc3a187beaad580412a9c5b843"
+  integrity sha512-9T9UjW3r0UW5c1Q7GTwllptXwhvYmEzFhzMfZ9H7FQWt+uZePjZPjBP/W1ZEyZ1twGWom5/56TF4lPcqjnDHcg==
   dependencies:
     balanced-match "^1.0.0"
     concat-map "0.0.1"
@@ -2995,15 +3037,10 @@ caniuse-api@^3.0.0:
     lodash.memoize "^4.1.2"
     lodash.uniq "^4.5.0"

-caniuse-lite@^1.0.0, caniuse-lite@^1.0.30001400, caniuse-lite@^1.0.30001426:
-  version "1.0.30001430"
-  resolved "https://registry.npmmirror.com/caniuse-lite/-/caniuse-lite-1.0.30001430.tgz#638a8ae00b5a8a97e66ff43733b2701f81b101fa"
-  integrity sha512-IB1BXTZKPDVPM7cnV4iaKaHxckvdr/3xtctB3f7Hmenx3qYBhGtTZ//7EllK66aKXW98Lx0+7Yr0kxBtIt3tzg==
-
-caniuse-lite@^1.0.30001646:
-  version "1.0.30001657"
-  resolved "https://registry.yarnpkg.com/caniuse-lite/-/caniuse-lite-1.0.30001657.tgz#29fd504bffca719d1c6b63a1f6f840be1973a660"
-  integrity sha512-DPbJAlP8/BAXy3IgiWmZKItubb3TYGP0WscQQlVGIfT4s/YlFYVuJgyOsQNP7rJRChx/qdMeLJQJP0Sgg2yjNA==
+caniuse-lite@^1.0.0, caniuse-lite@^1.0.30001400, caniuse-lite@^1.0.30001426, caniuse-lite@^1.0.30001646:
+  version "1.0.30001718"
+  resolved "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001718.tgz"
+  integrity sha512-AflseV1ahcSunK53NfEs9gFWgOEmzr0f+kaMFA4xiLZlr9Hzt7HxcSpIFcnNCUkz6R6dWKa54rUz3HUmI3nVcw==

 ccount@^1.0.0, ccount@^1.0.3:
   version "1.1.0"
@@ -3326,10 +3363,10 @@ core-js-compat@^3.25.1:
   dependencies:
     browserslist "^4.21.4"

-core-js-pure@^3.25.1:
-  version "3.26.0"
-  resolved "https://registry.npmmirror.com/core-js-pure/-/core-js-pure-3.26.0.tgz#7ad8a5dd7d910756f3124374b50026e23265ca9a"
-  integrity sha512-LiN6fylpVBVwT8twhhluD9TzXmZQQsr2I2eIKtWNbZI1XMfBT7CV18itaN6RA7EtQd/SDdRx/wzvAShX2HvhQA==
+core-js-pure@^3.30.2:
+  version "3.41.0"
+  resolved "https://registry.yarnpkg.com/core-js-pure/-/core-js-pure-3.41.0.tgz#349fecad168d60807a31e83c99d73d786fe80811"
+  integrity sha512-71Gzp96T9YPk63aUvE5Q5qP+DryB4ZloUZPSOebGM88VNw8VNfvdA7z6kGA8iGOTEzAomsRidp4jXSmUIJsL+Q==

 core-js@^3.18.0:
   version "3.26.0"
@@ -3371,9 +3408,9 @@ cross-fetch@^3.1.5:
     node-fetch "2.6.7"

 cross-spawn@^7.0.3:
-  version "7.0.3"
-  resolved "https://registry.npmmirror.com/cross-spawn/-/cross-spawn-7.0.3.tgz#f73a85b9d5d41d045551c177e2882d4ac85728a6"
-  integrity sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w==
+  version "7.0.6"
+  resolved "https://registry.yarnpkg.com/cross-spawn/-/cross-spawn-7.0.6.tgz#8a58fe78f00dcd70c370451759dfbfaf03e8ee9f"
+  integrity sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==
   dependencies:
     path-key "^3.1.0"
     shebang-command "^2.0.0"
@@ -4830,9 +4867,9 @@ http-parser-js@>=0.5.1:
   integrity sha512-SGeBX54F94Wgu5RH3X5jsDtf4eHyRogWX1XGT3b4HuW3tQPM4AaBzoUji/4AAJNXCEOWZ5O0DgZmJw1947gD5Q==

 http-proxy-middleware@^2.0.3:
-  version "2.0.6"
-  resolved "https://registry.npmmirror.com/http-proxy-middleware/-/http-proxy-middleware-2.0.6.tgz#e1a4dd6979572c7ab5a4e4b55095d1f32a74963f"
-  integrity sha512-ya/UeJ6HVBYxrgYotAZo1KvPWlgB48kUJLDePFeneHsVujFaW5WNj2NgWCAE//B1Dl02BIfYlpNgBy8Kf8Rjmw==
+  version "2.0.9"
+  resolved "https://registry.yarnpkg.com/http-proxy-middleware/-/http-proxy-middleware-2.0.9.tgz#e9e63d68afaa4eee3d147f39149ab84c0c2815ef"
+  integrity sha512-c1IyJYLYppU574+YI7R4QyX2ystMtVXZwIdzazUIPIJsHuWNd+mho2j+bKoHftndicGj9yh+xjd+l0yj7VeT1Q==
   dependencies:
     "@types/http-proxy" "^1.17.8"
     http-proxy "^1.18.1"
@@ -5709,9 +5746,9 @@ multicast-dns@^7.2.5:
     thunky "^1.0.2"

 nanoid@^3.3.6:
-  version "3.3.6"
-  resolved "https://registry.yarnpkg.com/nanoid/-/nanoid-3.3.6.tgz#443380c856d6e9f9824267d960b4236ad583ea4c"
-  integrity sha512-BGcqMMJuToF7i1rt+2PWSNVnWIkGCU78jBG3RxO/bZlnZPK2Cmi2QaffxGO/2RvWi9sL+FAiRiXMgsyxQ1DIDA==
+  version "3.3.8"
+  resolved "https://registry.yarnpkg.com/nanoid/-/nanoid-3.3.8.tgz#b1be3030bee36aaff18bacb375e5cce521684baf"
+  integrity sha512-WNLf5Sd8oZxOm+TzppcYk8gVOgP+l58xNy58D0nbUnOxOWRWvlcCV4kUF7ltmI6PsrLl/BgKEyS4mqsGChFN0w==

 negotiator@0.6.3:
   version "0.6.3"
@@ -6441,9 +6478,9 @@ prism-react-renderer@^1.2.1:
   integrity sha512-IJ+MSwBWKG+SM3b2SUfdrhC+gu01QkV2KmRQgREThBfSQRoufqRfxfHUxpG1WcaFjP+kojcFyO9Qqtpgt3qLCg==

 prismjs@^1.23.0:
-  version "1.29.0"
-  resolved "https://registry.npmmirror.com/prismjs/-/prismjs-1.29.0.tgz#f113555a8fa9b57c35e637bba27509dcf802dd12"
-  integrity sha512-Kx/1w86q/epKcmte75LNrEoT+lX8pBpavuAbvJWRXar7Hz8jrtF+e3vY751p0R8H9HdArwaCTNDDzHg/ScJK1Q==
+  version "1.30.0"
+  resolved "https://registry.yarnpkg.com/prismjs/-/prismjs-1.30.0.tgz#d9709969d9d4e16403f6f348c63553b19f0975a9"
+  integrity sha512-DEvV2ZF2r2/63V+tK8hQvrR2ZGn10srHbXviTlcv7Kpzw8jWiNTqbVgjO3IY8RxrrOUF8VPMQQFysYYYv0YZxw==

 process-nextick-args@~2.0.0:
   version "2.0.1"
@@ -6816,10 +6853,10 @@ regenerate@^1.4.2:
   resolved "https://registry.npmmirror.com/regenerate/-/regenerate-1.4.2.tgz#b9346d8827e8f5a32f7ba29637d398b69014848a"
   integrity sha512-zrceR/XhGYU/d/opr2EKO7aRHUeiBI8qjtfHqADTwZd6Szfy16la6kqD0MIUs5z5hx6AaKa+PixpPrR289+I0A==

-regenerator-runtime@^0.13.10:
-  version "0.13.10"
-  resolved "https://registry.npmmirror.com/regenerator-runtime/-/regenerator-runtime-0.13.10.tgz#ed07b19616bcbec5da6274ebc75ae95634bfc2ee"
-  integrity sha512-KepLsg4dU12hryUO7bp/axHAKvwGOCV0sGloQtpagJ12ai+ojVDqkeGSiRX1zlq+kjIMZ1t7gpze+26QqtdGqw==
+regenerator-runtime@^0.14.0:
+  version "0.14.1"
+  resolved "https://registry.yarnpkg.com/regenerator-runtime/-/regenerator-runtime-0.14.1.tgz#356ade10263f685dda125100cd862c1db895327f"
+  integrity sha512-dYnhHh0nJoMfnkZs6GmmhFknAGRrLznOu5nc9ML+EJxGvrx6H7teuevqVqCuPcPK//3eDrrjQhehXVx9cnkGdw==

 regenerator-transform@^0.15.0:
   version "0.15.0"
@@ -7272,14 +7309,7 @@ send@0.19.0:
     range-parser "~1.2.1"
     statuses "2.0.1"

-serialize-javascript@^6.0.0:
-  version "6.0.0"
-  resolved "https://registry.npmmirror.com/serialize-javascript/-/serialize-javascript-6.0.0.tgz#efae5d88f45d7924141da8b5c3a7a7e663fefeb8"
-  integrity sha512-Qr3TosvguFt8ePWqsvRfrKyQXIiW+nGbYpy8XK24NQHE83caxWt+mIymTT19DGFbNWNLfEwsrkSmN64lVWB9ag==
-  dependencies:
-    randombytes "^2.1.0"
-
-serialize-javascript@^6.0.1:
+serialize-javascript@^6.0.0, serialize-javascript@^6.0.1:
   version "6.0.2"
   resolved "https://registry.yarnpkg.com/serialize-javascript/-/serialize-javascript-6.0.2.tgz#defa1e055c83bf6d59ea805d8da862254eb6a6c2"
   integrity sha512-Saa1xPByTTq2gdeFZYLLo+RFE35NHZkAbqZeWNd3BpzppeVisAqpDjcp8dyf6uIvEqJRd46jemmyA4iFIeVk8g==