Mirror of https://github.com/microsoft/FLAML.git (synced 2026-02-09 02:09:16 +08:00)
apidoc (#116)
* fixing apidoc errors

Co-authored-by: Chi Wang (MSR) <wang.chi@microsoft.com>
Co-authored-by: liususan091219 <Xqq630517>
@@ -2,21 +2,18 @@ import json
import os
import numpy as np
import time
import logging

try:
    import ray
    import transformers
    from transformers import TrainingArguments
    import datasets
    import torch
    from .dataset.task_auto import get_default_task
    from .result_analysis.azure_utils import JobID
    from .huggingface.trainer import TrainerForAutoTransformers
except ImportError:
    print("To use the nlp component in flaml, run pip install flaml[nlp]")

from .dataset.task_auto import get_default_task
from .result_analysis.azure_utils import JobID
from .huggingface.trainer import TrainerForAutoTransformers

task_list = [
    "seq-classification",
    "regression",
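This hunk moves FLAML's own submodule imports out of the try/except guard, so a missing optional dependency only disables the transformers-backed code path instead of breaking the whole module import. A minimal sketch of the same optional-dependency pattern, with illustrative names (nothing below is FLAML code):

# Minimal sketch of the guarded-import pattern; _HAS_TORCH and fine_tune
# are illustrative names, not part of FLAML.
try:
    import torch  # heavyweight optional dependency
    _HAS_TORCH = True
except ImportError:
    _HAS_TORCH = False

def fine_tune(model_name):
    """Fail with an actionable message only when the optional path is used."""
    if not _HAS_TORCH:
        raise ImportError("this feature needs the nlp extras: pip install flaml[nlp]")
    ...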
@@ -116,20 +113,18 @@ class AutoTransformers:
                     fold_name=None,
                     resplit_portion=None,
                     **custom_data_args):
        '''Prepare data
        """Prepare data

        An example:
        Example:

            preparedata_setting = {
                "server_name": "tmdev",
                "data_root_path": "data/",
                "max_seq_length": 128,
                "jobid_config": jobid_config,
                "wandb_utils": wandb_utils,
                "resplit_portion": {"source": ["train", "validation"],
                    "train": [0, 0.8], "validation": [0.8, 0.9], "test": [0.9, 1.0]}
            }
            autohf.prepare_data(**preparedata_setting)

        .. code-block:: python

            preparedata_setting = {"server_name": "tmdev", "data_root_path": "data/", "max_seq_length": 128,
                "jobid_config": jobid_config, "wandb_utils": wandb_utils,
                "resplit_portion": {"source": ["train", "validation"],
                "train": [0, 0.8], "validation": [0.8, 0.9], "test": [0.9, 1.0]}}

            autohf.prepare_data(**preparedata_setting)

        Args:
            server_name:
@@ -148,7 +143,7 @@ class AutoTransformers:
                If args.resplit_mode = "rspt", resplit_portion is required
            is_wandb_on:
                A boolean variable indicating whether wandb is used
        '''
        """
        from .dataset.dataprocess_auto import AutoEncodeText
        from transformers import AutoTokenizer
        from datasets import load_dataset
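For context, resplit_portion maps each target fold to a fractional interval of the concatenated source folds. A hedged sketch of how such fractions could translate into example indices (the helper below is hypothetical, not FLAML's implementation):

# Hypothetical helper showing how fractional portions could map to example
# indices; FLAML's actual resplitting logic may differ.
def slice_by_portion(num_examples, portion):
    start, end = portion
    return range(int(start * num_examples), int(end * num_examples))

resplit_portion = {"train": [0, 0.8], "validation": [0.8, 0.9], "test": [0.9, 1.0]}
folds = {name: slice_by_portion(1000, p) for name, p in resplit_portion.items()}
# folds["train"] -> range(0, 800), folds["test"] -> range(900, 1000)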
@@ -682,16 +677,20 @@ class AutoTransformers:
            resources_per_trial=None,
            ray_local_mode=False,
            **custom_hpo_args):
        '''Fine tuning the huggingface using the hpo setting
        """Fine tuning the huggingface using the hpo setting

        An example:

            autohf_settings = {"resources_per_trial": {"cpu": 1},
                               "num_samples": 1,
                               "time_budget": 100000,
                               "ckpt_per_epoch": 1,
                               "fp16": False,
                              }
            validation_metric, analysis = autohf.fit(**autohf_settings)

        Example:

        .. code-block:: python

            autohf_settings = {"resources_per_trial": {"cpu": 1},
                               "num_samples": 1,
                               "time_budget": 100000,
                               "ckpt_per_epoch": 1,
                               "fp16": False,
                              }

            validation_metric, analysis = autohf.fit(**autohf_settings)

        Args:
            resources_per_trial:
@@ -710,28 +709,25 @@ class AutoTransformers:
            ckpt_per_epoch:
                An integer value of number of checkpoints per epoch, default = 1
            ray_verbose:
                int, default=1 | verbosity of ray,
                An integer, default=1 | verbosity of ray,
            transformers_verbose:
                int, default=transformers.logging.INFO | verbosity of transformers, must be chosen from one of
                An integer, default=transformers.logging.INFO | verbosity of transformers, must be chosen from one of
                transformers.logging.ERROR, transformers.logging.INFO, transformers.logging.WARNING,
                or transformers.logging.DEBUG
            fp16:
                boolean, default = True | whether to use fp16
                A boolean, default = True | whether to use fp16
            ray_local_mode:
                boolean, default = False | whether to use the local mode (debugging mode) for ray tune.run
                A boolean, default = False | whether to use the local mode (debugging mode) for ray tune.run
            custom_hpo_args:
                The additional keyword arguments, e.g.,
                custom_hpo_args = {"points_to_evaluate": [{
                    "num_train_epochs": 1,
                    "per_device_train_batch_size": 128, }]}
                The additional keyword arguments, e.g., custom_hpo_args = {"points_to_evaluate": [{
                "num_train_epochs": 1, "per_device_train_batch_size": 128, }]}

        Returns:
            validation_metric:
                a dict storing the validation score
            analysis:
                a ray.tune.analysis.Analysis object storing the analysis results from tune.run

        '''
            validation_metric: A dict storing the validation score

            analysis: A ray.tune.analysis.Analysis object storing the analysis results from tune.run
        """
        from .hpo.scheduler_auto import AutoScheduler
        self._transformers_verbose = transformers_verbose
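Since **custom_hpo_args forwards extra search settings such as points_to_evaluate, a warm-started fit call could look like the following hedged sketch (the budget values are illustrative, and autohf/jobid_config are assumed to be set up as in the docstring examples above):

# Illustrative warm start: points_to_evaluate is collected through
# **custom_hpo_args as documented above.
autohf_settings = {
    "resources_per_trial": {"cpu": 1},
    "num_samples": 4,
    "time_budget": 3600,      # seconds
    "ckpt_per_epoch": 1,
    "points_to_evaluate": [{  # evaluated before any sampled configuration
        "num_train_epochs": 1,
        "per_device_train_batch_size": 128,
    }],
}
validation_metric, analysis = autohf.fit(**autohf_settings)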
@@ -854,14 +850,14 @@ class AutoTransformers:

        Args:
            predictions:
                a list of predictions, which is the output of AutoTransformers.predict()
                A list of predictions, which is the output of AutoTransformers.predict()
            output_prediction_path:
                output path for the prediction
                Output path for the prediction
            output_zip_file_name:
                a string, which is the name of the output zip file
                A string, which is the name of the output zip file

        Returns:
            the path of the output .zip file
            The path of the output .zip file
        """
        from .dataset.submission_auto import auto_output_prediction
        return auto_output_prediction(self.jobid_config.dat,
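A hedged sketch of what producing the output .zip can look like with the standard library (the helper and file names are hypothetical; FLAML's auto_output_prediction handles the dataset-specific submission format):

# Hypothetical sketch: write predictions to a CSV and bundle it into a .zip.
import csv
import zipfile

def zip_predictions(predictions, csv_path="predictions.csv", zip_path="submission.zip"):
    with open(csv_path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["index", "prediction"])
        writer.writerows(enumerate(predictions))
    with zipfile.ZipFile(zip_path, "w") as zf:
        zf.write(csv_path)
    return zip_path  # the path of the output .zip file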
@@ -1,14 +1,12 @@
import os
import transformers

try:
    from transformers import Trainer as TFTrainer
except ImportError:
    TFTrainer = object


class TrainerForAutoTransformers(transformers.Trainer):
    """
    Overriding transformers.Trainer.

    Args:
        huggingface (:class:`~transformers.PreTrainedModel` or :obj:`torch.nn.Module`, `optional`):
    """
class TrainerForAutoTransformers(TFTrainer):

    def evaluate(self,
                 eval_dataset=None):
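The fallback assignment TFTrainer = object keeps the class definition importable when transformers is missing; only using the trainer then fails. A sketch of the same pattern with a call-time guard (the class name and guard are illustrative, not FLAML's code):

# Sketch of the fallback-base pattern with a call-time guard (illustrative).
try:
    from transformers import Trainer as TFTrainer
except ImportError:
    TFTrainer = object  # degenerate base keeps the class statement valid

class MyTrainer(TFTrainer):
    def evaluate(self, eval_dataset=None):
        if TFTrainer is object:
            raise ImportError("transformers is required: pip install flaml[nlp]")
        return super().evaluate(eval_dataset)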
@@ -132,28 +132,23 @@ class JobID:
        self.sdhf = 42

    def is_match(self, partial_jobid):
        """
        return a boolean variable whether the current object matches the partial jobid defined
        in partial_jobid. For example,
            self = JobID(dat = ['glue'],
                         subdat = 'cola',
                         mod = 'bestnn',
                         spa = 'buni',
                         arg = 'cus',
                         alg = 'bs',
                         pru = 'None',
                         pre = 'funnel',
                         presz = 'xlarge',
                         spt = 'rspt',
                         rep = 0,
                         sddt = 43,
                         sdhf = 42)
            partial_jobid1 = JobID(dat = ['glue'],
                                   subdat = 'cola',
                                   mod = 'hpo')
            partial_jobid2 = JobID(dat = ['glue'],
                                   subdat = 'cola',
                                   mod = 'bestnn')
        """Return a boolean variable whether the current object matches the partial jobid defined in partial_jobid.

        Example:

        .. code-block:: python

            self = JobID(dat = ['glue'], subdat = 'cola', mod = 'bestnn', spa = 'buni', arg = 'cus', alg = 'bs',
                pru = 'None', pre = 'funnel', presz = 'xlarge', spt = 'rspt', rep = 0, sddt = 43, sdhf = 42)

            partial_jobid1 = JobID(dat = ['glue'],
                                   subdat = 'cola',
                                   mod = 'hpo')

            partial_jobid2 = JobID(dat = ['glue'],
                                   subdat = 'cola',
                                   mod = 'bestnn')

        return False for partial_jobid1 and True for partial_jobid2
        """
        is_not_match = False
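The matching rule illustrated above is that every field set in the partial jobid must agree with the full one. A hedged sketch of that rule (not FLAML's exact implementation, which tracks an is_not_match flag instead):

# Hedged sketch: every non-None field of partial_jobid must equal
# the same field on self.
def is_match(self, partial_jobid):
    for key, partial_value in partial_jobid.__dict__.items():
        if partial_value is None:
            continue  # unset fields in the partial jobid match anything
        if getattr(self, key) != partial_value:
            return False
    return True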
@@ -166,7 +161,7 @@ class JobID:

    def to_wandb_string(self):
        """
        preparing for the job ID for wandb
        Preparing the job ID for wandb
        """
        field_dict = self.__dict__
        keytoval_str = "_".join([JobID.dataset_list_to_str(field_dict[key])
@@ -177,7 +172,7 @@ class JobID:

    def to_jobid_string(self):
        """
        convert the current JobID into a blob name string which contains all the fields
        Convert the current JobID into a blob name string which contains all the fields
        """
        list_keys = list(JobID.__dataclass_fields__.keys())
        field_dict = self.__dict__
@@ -189,7 +184,7 @@ class JobID:

    def to_partial_jobid_string(self):
        """
        convert the current JobID into a blob name string which only contains the fields whose values are not "None"
        Convert the current JobID into a blob name string which only contains the fields whose values are not "None"
        """
        list_keys = list(JobID.__dataclass_fields__.keys())
        field_dict = self.__dict__  # field_dict contains fields whose values are not None
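Both serializers join key=value pairs with underscores, with the partial variant skipping unset fields. A hedged sketch of that encoding (illustrative, not FLAML's exact code):

# Illustrative encoder: join non-None fields as "key=value" pairs
# separated by underscores.
def to_partial_jobid_string(jobid):
    return "_".join(f"{key}={value}"
                    for key, value in jobid.__dict__.items()
                    if value is not None)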
@@ -202,9 +197,10 @@ class JobID:
    @staticmethod
    def blobname_to_jobid_dict(keytoval_str):
        """
        converting an azure blobname to a JobID config,
        Converting an azure blobname to a JobID config,
        e.g., blobname = "dat=glue_subdat=cola_mod=bestnn_spa=buni_arg=cus_
        alg=bs_pru=None_pre=funnel_presz=xlarge_spt=rspt_rep=0.json"

        the converted jobid dict = {dat = ['glue'], subdat = 'cola', mod = 'bestnn',
                                    spa = 'buni', arg = 'cus', alg = 'bs', pru = 'None',
                                    pre = 'funnel', presz = 'xlarge', spt = 'rspt',
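Parsing reverses that encoding by splitting on the separators. A hedged sketch (the real parser also restores field types, e.g. turning dat back into a list such as ['glue']):

# Illustrative decoder for the blob-name format shown above.
def blobname_to_jobid_dict(keytoval_str):
    if keytoval_str.endswith(".json"):
        keytoval_str = keytoval_str[: -len(".json")]
    jobid_dict = {}
    for pair in keytoval_str.split("_"):
        key, _, value = pair.partition("=")
        jobid_dict[key] = value
    return jobid_dict

blobname_to_jobid_dict("dat=glue_subdat=cola_mod=bestnn_rep=0.json")
# -> {'dat': 'glue', 'subdat': 'cola', 'mod': 'bestnn', 'rep': '0'}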
@@ -257,7 +253,7 @@ class JobID:
                            **jobid_list
                            ):
        """
        set the jobid from a dict object
        Set the jobid from a dict object
        """
        for key in jobid_list.keys():
            assert key in JobID.__dataclass_fields__.keys()
@@ -268,7 +264,7 @@ class JobID:
    @staticmethod
    def convert_blobname_to_jobid(blobname):
        """
        converting a blobname string to a JobID object
        Converting a blobname string to a JobID object
        """
        jobconfig_dict = JobID.blobname_to_jobid_dict(blobname)
        if jobconfig_dict:
@@ -281,7 +277,7 @@ class JobID:
    @staticmethod
    def get_full_data_name(dataset_name: Union[list, str], subdataset_name=None):
        """
        convert a dataset name and sub dataset name to a full dataset name
        Convert a dataset name and sub dataset name to a full dataset name
        """
        if isinstance(dataset_name, list):
            full_dataset_name = JobID.dataset_list_to_str(dataset_name)
@@ -293,7 +289,7 @@ class JobID:

    def get_jobid_full_data_name(self):
        """
        get the full dataset name of the current JobID object
        Get the full dataset name of the current JobID object
        """
        return JobID.get_full_data_name(JobID.dataset_list_to_str(self.dat), self.subdat)

@@ -573,7 +569,7 @@ class AzureUtils:
                               predictions=None,
                               duration=None):
        """
        write the key info from a job and upload to azure blob storage
        Write the key info from a job and upload to azure blob storage
        """
        local_file_path = self.generate_local_json_path()
        output_json = {}
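A hedged sketch of the upload step, assuming the azure-storage-blob package and an already configured ContainerClient (the helper name and paths are illustrative, not FLAML's code):

# Hypothetical sketch: serialize the job summary and upload it as a blob.
import json
from azure.storage.blob import ContainerClient

def upload_job_json(container_client, local_file_path, blob_name, output_json):
    with open(local_file_path, "w") as f:
        json.dump(output_json, f)
    with open(local_file_path, "rb") as f:
        container_client.upload_blob(name=blob_name, data=f, overwrite=True)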
@@ -590,7 +586,7 @@ class AzureUtils:

    def generate_local_json_path(self):
        """
        return a path string for storing the json file locally
        Return a path string for storing the json file locally
        """
        full_dataset_name = self.jobid.get_jobid_full_data_name()
        jobid_str = self.jobid.to_jobid_string()
@@ -608,7 +604,7 @@ class AzureUtils:
                                      local_json_file,
                                      predictions):
        """
        store predictions (a .zip file) locally and upload
        Store predictions (a .zip file) locally and upload
        """
        azure_save_file_name = local_json_file.split("/")[-1][:-5]
        if self.data_root_dir is None:
@@ -637,7 +633,7 @@ class AzureUtils:
                                    partial_jobid,
                                    earliest_time: Tuple[int, int, int] = None):
        """
        get all blobs whose jobid configs match the partial_jobid
        Get all blobs whose jobid configs match the partial_jobid
        """
        blob_list = []
        container_client = self._init_azure_clients()

@@ -11,12 +11,8 @@ logger = logging.getLogger(__name__)

class AutoVW:
    """The AutoML class

    Methods:
        predict(data_sample)
        learn(data_sample)
    """

    WARMSTART_NUM = 100
    AUTOMATIC = '_auto'
    VW_INTERACTION_ARG_NAME = 'interactions'
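AutoVW is used online: predict on each incoming example, then learn from it once the label is revealed. A hedged usage sketch (the constructor arguments are illustrative, and data_stream is assumed to yield examples in vowpalwabbit text format):

# Hedged usage sketch of the online loop.
from flaml import AutoVW

autovw = AutoVW(max_live_model_num=5,
                search_space={'interactions': AutoVW.AUTOMATIC})
for data_sample in data_stream:
    y_pred = autovw.predict(data_sample)  # predict with the current best model
    autovw.learn(data_sample)             # then update the live models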
@@ -134,7 +130,7 @@ class AutoVW:
            self._y_predict = self._best_trial.predict(data_sample)
        # code for debugging purpose
        if self._prediction_trial_id is None or \
                self._prediction_trial_id != self._best_trial.trial_id:
            self._prediction_trial_id = self._best_trial.trial_id
            logger.info('prediction trial id changed to %s at iter %s, resource used: %s',
                        self._prediction_trial_id, self._iter,
@@ -160,7 +156,7 @@ class AutoVW:
                    or trial.result.resource_used >= self.WARMSTART_NUM):
                score = trial.result.get_score(self._model_select_policy)
                if ('min' == self._model_selection_mode and score < best_score) or \
                        ('max' == self._model_selection_mode and score > best_score):
                    best_score = score
                    new_best_trial = trial
        if new_best_trial is not None:

@@ -189,11 +189,15 @@ class BlendSearch(Searcher):
        self._metric_constraint_penalty = None

    def save(self, checkpoint_path: str):
        ''' save states to a checkpoint path
        '''
        save_object = self
        with open(checkpoint_path, "wb") as outputFile:
            pickle.dump(save_object, outputFile)

    def restore(self, checkpoint_path: str):
        ''' restore states from checkpoint
        '''
        with open(checkpoint_path, "rb") as inputFile:
            state = pickle.load(inputFile)
        self._metric_target = state._metric_target
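save and restore round-trip the searcher state through pickle, so a tuning run can be checkpointed and resumed. A hedged usage sketch (constructor arguments abbreviated; file name illustrative):

# Hedged usage sketch: checkpoint a searcher and resume it later.
from flaml import BlendSearch

searcher = BlendSearch(metric="loss", mode="min")
searcher.save("searcher_checkpoint.pkl")    # pickle the searcher state

resumed = BlendSearch(metric="loss", mode="min")
resumed.restore("searcher_checkpoint.pkl")  # reload state fields in place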
@@ -220,9 +224,6 @@ class BlendSearch(Searcher):
    def metric_target(self):
        return self._metric_target

    def restore_from_dir(self, checkpoint_dir: str):
        super.restore_from_dir(checkpoint_dir)

    def on_trial_complete(self, trial_id: str, result: Optional[Dict] = None,
                          error: bool = False):
        ''' search thread updater and cleaner
@@ -353,6 +354,8 @@ class BlendSearch(Searcher):
        return False

    def on_trial_result(self, trial_id: str, result: Dict):
        ''' receive intermediate result
        '''
        if trial_id not in self._trial_proposed_by:
            return
        thread_id = self._trial_proposed_by[trial_id]

@@ -180,10 +180,10 @@ def run(training_function,
        prune_attr: A string of the attribute used for pruning.
            Not necessarily in space.
            When prune_attr is in space, it is a hyperparameter, e.g.,
            'n_iters', and the best value is unknown.
            When prune_attr is not in space, it is a resource dimension,
            e.g., 'sample_size', and the peak performance is assumed
            to be at the max_resource.
        min_resource: A float of the minimal resource to use for the
            prune_attr; only valid if prune_attr is not in space.
        max_resource: A float of the maximal resource to use for the
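When prune_attr names a resource dimension outside the search space, the tuner injects growing resource values up to max_resource. A hedged sketch of such a call (the objective is illustrative; the parameter names follow the docstring above):

# Hedged sketch of tuning with a resource dimension.
from flaml import tune

def training_function(config):
    # 'sample_size' is injected by the tuner: it is the prune_attr and is
    # not part of the search space below.
    size = config["sample_size"]
    loss = (config["lr"] - 0.1) ** 2 + 1.0 / size
    tune.report(loss=loss)

analysis = tune.run(
    training_function,
    config={"lr": tune.loguniform(1e-4, 1e-1)},
    metric="loss", mode="min",
    prune_attr="sample_size",
    min_resource=1000, max_resource=100000,
    num_samples=16)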