* fixing apidoc errors

Co-authored-by: Chi Wang (MSR) <wang.chi@microsoft.com>
Co-authored-by: liususan091219 <Xqq630517>
Xueqing Liu
2021-06-19 22:09:49 -04:00
committed by GitHub
parent 6133db84e8
commit d40993d920
6 changed files with 90 additions and 101 deletions

View File

@@ -2,21 +2,18 @@ import json
import os
import numpy as np
import time
import logging
try:
import ray
import transformers
from transformers import TrainingArguments
import datasets
import torch
from .dataset.task_auto import get_default_task
from .result_analysis.azure_utils import JobID
from .huggingface.trainer import TrainerForAutoTransformers
except ImportError:
print("To use the nlp component in flaml, run pip install flaml[nlp]")
from .dataset.task_auto import get_default_task
from .result_analysis.azure_utils import JobID
from .huggingface.trainer import TrainerForAutoTransformers
task_list = [
"seq-classification",
"regression",
@@ -116,20 +113,18 @@ class AutoTransformers:
fold_name=None,
resplit_portion=None,
**custom_data_args):
'''Prepare data
"""Prepare data
An example:
Example:
preparedata_setting = {
"server_name": "tmdev",
"data_root_path": "data/",
"max_seq_length": 128,
"jobid_config": jobid_config,
"wandb_utils": wandb_utils,
"resplit_portion": {"source": ["train", "validation"],
"train": [0, 0.8], "validation": [0.8, 0.9], "test": [0.9, 1.0]}
}
autohf.prepare_data(**preparedata_setting)
.. code-block:: python
preparedata_setting = {"server_name": "tmdev", "data_root_path": "data/", "max_seq_length": 128,
"jobid_config": jobid_config, "wandb_utils": wandb_utils,
"resplit_portion": {"source": ["train", "validation"],
"train": [0, 0.8], "validation": [0.8, 0.9], "test": [0.9, 1.0]}}
autohf.prepare_data(**preparedata_setting)
Args:
server_name:
@@ -148,7 +143,7 @@ class AutoTransformers:
If args.resplit_mode = "rspt", resplit_portion is required
is_wandb_on:
A boolean variable indicating whether wandb is used
'''
"""
from .dataset.dataprocess_auto import AutoEncodeText
from transformers import AutoTokenizer
from datasets import load_dataset
@@ -682,16 +677,20 @@ class AutoTransformers:
resources_per_trial=None,
ray_local_mode=False,
**custom_hpo_args):
'''Fine-tuning the huggingface model using the hpo setting
"""Fine-tuning the huggingface model using the hpo setting
An example:
autohf_settings = {"resources_per_trial": {"cpu": 1},
"num_samples": 1,
"time_budget": 100000,
"ckpt_per_epoch": 1,
"fp16": False,
}
validation_metric, analysis = autohf.fit(**autohf_settings)
Example:
.. code-block:: python
autohf_settings = {"resources_per_trial": {"cpu": 1},
"num_samples": 1,
"time_budget": 100000,
"ckpt_per_epoch": 1,
"fp16": False,
}
validation_metric, analysis = autohf.fit(**autohf_settings)
Args:
resources_per_trial:
@@ -710,28 +709,25 @@ class AutoTransformers:
ckpt_per_epoch:
An integer value of number of checkpoints per epoch, default = 1
ray_verbose:
int, default=1 | verbosity of ray,
An integer, default=1 | verbosity of ray,
transformers_verbose:
int, default=transformers.logging.INFO | verbosity of transformers, must be chosen from one of
An integer, default=transformers.logging.INFO | verbosity of transformers, must be chosen from one of
transformers.logging.ERROR, transformers.logging.INFO, transformers.logging.WARNING,
or transformers.logging.DEBUG
fp16:
boolean, default = True | whether to use fp16
A boolean, default = True | whether to use fp16
ray_local_mode:
boolean, default = False | whether to use the local mode (debugging mode) for ray tune.run
A boolean, default = False | whether to use the local mode (debugging mode) for ray tune.run
custom_hpo_args:
The additional keyword arguments, e.g.,
custom_hpo_args = {"points_to_evaluate": [{
"num_train_epochs": 1,
"per_device_train_batch_size": 128, }]}
The additional keyword arguments, e.g., custom_hpo_args = {"points_to_evaluate": [{
"num_train_epochs": 1, "per_device_train_batch_size": 128, }]}
Returns:
validation_metric:
a dict storing the validation score
analysis:
a ray.tune.analysis.Analysis object storing the analysis results from tune.run
'''
validation_metric: A dict storing the validation score
analysis: A ray.tune.analysis.Analysis object storing the analysis results from tune.run
"""
from .hpo.scheduler_auto import AutoScheduler
self._transformers_verbose = transformers_verbose
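As a reading aid for the new docstring, a hedged sketch of how the documented settings and the custom_hpo_args pass-through might be combined in one fit call; the autohf object and the concrete values are taken from the docstring examples above, not from a verified run.
# Hedged sketch: assumes `autohf` is an AutoTransformers instance whose data
# has already been prepared via prepare_data(), as in the earlier example.
autohf_settings = {"resources_per_trial": {"cpu": 1},
                   "num_samples": 1,
                   "time_budget": 100000,
                   "ckpt_per_epoch": 1,
                   "fp16": False,
                   # forwarded through **custom_hpo_args per the docstring
                   "points_to_evaluate": [{"num_train_epochs": 1,
                                           "per_device_train_batch_size": 128}]}
validation_metric, analysis = autohf.fit(**autohf_settings)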
@@ -854,14 +850,14 @@ class AutoTransformers:
Args:
predictions:
a list of predictions, which is the output of AutoTransformers.predict()
A list of predictions, which is the output of AutoTransformers.predict()
output_prediction_path:
output path for the prediction
Output path for the prediction
output_zip_file_name:
a string, which is the name of the output zip file
A string, which is the name of the output zip file
Returns:
the path of the output .zip file
The path of the output .zip file
"""
from .dataset.submission_auto import auto_output_prediction
return auto_output_prediction(self.jobid_config.dat,
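A hedged usage sketch based only on the argument names documented above; the enclosing method name is assumed to be output_prediction (it delegates to auto_output_prediction), and the path and file name are placeholders.
# Hypothetical call; `predictions` is assumed to come from autohf.predict(...).
zip_path = autohf.output_prediction(
    predictions=predictions,
    output_prediction_path="data/output/",        # placeholder output directory
    output_zip_file_name="cola_submission")       # placeholder zip file name
print(zip_path)  # per the docstring, the path of the generated .zip file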

View File

@@ -1,14 +1,12 @@
import os
import transformers
try:
from transformers import Trainer as TFTrainer
except ImportError:
TFTrainer = object
class TrainerForAutoTransformers(transformers.Trainer):
"""
Overriding transformers.Trainer.
Args:
huggingface (:class:`~transformers.PreTrainedModel` or :obj:`torch.nn.Module`, `optional`):
"""
class TrainerForAutoTransformers(TFTrainer):
def evaluate(self,
eval_dataset=None):

View File

@@ -132,28 +132,23 @@ class JobID:
self.sdhf = 42
def is_match(self, partial_jobid):
"""
return a boolean indicating whether the current object matches the partial jobid defined
in partial_jobid. For example,
self = JobID(dat = ['glue'],
subdat = 'cola',
mod = 'bestnn',
spa = 'buni',
arg = 'cus',
alg = 'bs',
pru = 'None',
pre = 'funnel',
presz = 'xlarge',
spt = 'rspt',
rep = 0,
sddt = 43,
sdhf = 42)
partial_jobid1 = JobID(dat = ['glue'],
subdat = 'cola',
mod = 'hpo')
partial_jobid2 = JobID(dat = ['glue'],
subdat = 'cola',
mod = 'bestnn')
"""Return a boolean variable whether the current object matches the partial jobid defined in partial_jobid.
Example:
.. code-block:: python
self = JobID(dat = ['glue'], subdat = 'cola', mod = 'bestnn', spa = 'buni', arg = 'cus', alg = 'bs',
pru = 'None', pre = 'funnel', presz = 'xlarge', spt = 'rspt', rep = 0, sddt = 43, sdhf = 42)
partial_jobid1 = JobID(dat = ['glue'],
subdat = 'cola',
mod = 'hpo')
partial_jobid2 = JobID(dat = ['glue'],
subdat = 'cola',
mod = 'bestnn')
return False for partial_jobid1 and True for partial_jobid2
"""
is_not_match = False
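To make the docstring example concrete, a short hedged illustration of the expected results; it restates the docstring's claim rather than an executed test.
# Using the objects defined in the docstring example above:
# partial_jobid1 sets mod='hpo', which conflicts with mod='bestnn' in `self`;
# every field set in partial_jobid2 agrees with `self`.
self.is_match(partial_jobid1)   # -> False
self.is_match(partial_jobid2)   # -> True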
@@ -166,7 +161,7 @@ class JobID:
def to_wandb_string(self):
"""
preparing the job ID for wandb
Preparing the job ID for wandb
"""
field_dict = self.__dict__
keytoval_str = "_".join([JobID.dataset_list_to_str(field_dict[key])
@@ -177,7 +172,7 @@ class JobID:
def to_jobid_string(self):
"""
convert the current JobID into a blob name string which contains all the fields
Convert the current JobID into a blob name string which contains all the fields
"""
list_keys = list(JobID.__dataclass_fields__.keys())
field_dict = self.__dict__
@@ -189,7 +184,7 @@ class JobID:
def to_partial_jobid_string(self):
"""
convert the current JobID into a blob name string which only contains the fields whose values are not "None"
Convert the current JobID into a blob name string which only contains the fields whose values are not "None"
"""
list_keys = list(JobID.__dataclass_fields__.keys())
field_dict = self.__dict__ # field_dict contains fields whose values are not None
@@ -202,9 +197,10 @@ class JobID:
@staticmethod
def blobname_to_jobid_dict(keytoval_str):
"""
converting an azure blobname to a JobID config,
Converting an azure blobname to a JobID config,
e.g., blobname = "dat=glue_subdat=cola_mod=bestnn_spa=buni_arg=cus_
alg=bs_pru=None_pre=funnel_presz=xlarge_spt=rspt_rep=0.json"
alg=bs_pru=None_pre=funnel_presz=xlarge_spt=rspt_rep=0.json"
the converted jobid dict = {dat = ['glue'], subdat = 'cola', mod = 'bestnn',
spa = 'buni', arg = 'cus', alg = 'bs', pru = 'None',
pre = 'funnel', presz = 'xlarge', spt = 'rspt',
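To illustrate the blobname format described above, a small standalone sketch of how such a name could be split into a key/value dict; parse_blobname is a hypothetical helper for illustration, not the library's blobname_to_jobid_dict implementation (which also converts field types such as the dat list and the rep integer).
# Hypothetical parser for the "key=value_key=value_....json" naming scheme.
def parse_blobname(blobname: str) -> dict:
    stem = blobname[:-len(".json")] if blobname.endswith(".json") else blobname
    fields = {}
    for chunk in stem.split("_"):
        key, _, value = chunk.partition("=")
        fields[key] = value
    return fields

parse_blobname("dat=glue_subdat=cola_mod=bestnn_spa=buni_arg=cus_"
               "alg=bs_pru=None_pre=funnel_presz=xlarge_spt=rspt_rep=0.json")
# -> {'dat': 'glue', 'subdat': 'cola', 'mod': 'bestnn', ..., 'rep': '0'}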
@@ -257,7 +253,7 @@ class JobID:
**jobid_list
):
"""
set the jobid from a dict object
Set the jobid from a dict object
"""
for key in jobid_list.keys():
assert key in JobID.__dataclass_fields__.keys()
@@ -268,7 +264,7 @@ class JobID:
@staticmethod
def convert_blobname_to_jobid(blobname):
"""
converting a blobname string to a JobID object
Converting a blobname string to a JobID object
"""
jobconfig_dict = JobID.blobname_to_jobid_dict(blobname)
if jobconfig_dict:
@@ -281,7 +277,7 @@ class JobID:
@staticmethod
def get_full_data_name(dataset_name: Union[list, str], subdataset_name=None):
"""
convert a dataset name and sub dataset name to a full dataset name
Convert a dataset name and sub dataset name to a full dataset name
"""
if isinstance(dataset_name, list):
full_dataset_name = JobID.dataset_list_to_str(dataset_name)
@@ -293,7 +289,7 @@ class JobID:
def get_jobid_full_data_name(self):
"""
get the full dataset name of the current JobID object
Get the full dataset name of the current JobID object
"""
return JobID.get_full_data_name(JobID.dataset_list_to_str(self.dat), self.subdat)
@@ -573,7 +569,7 @@ class AzureUtils:
predictions=None,
duration=None):
"""
write the key info from a job and upload to azure blob storage
Write the key info from a job and upload to azure blob storage
"""
local_file_path = self.generate_local_json_path()
output_json = {}
@@ -590,7 +586,7 @@ class AzureUtils:
def generate_local_json_path(self):
"""
return a path string for storing the json file locally
Return a path string for storing the json file locally
"""
full_dataset_name = self.jobid.get_jobid_full_data_name()
jobid_str = self.jobid.to_jobid_string()
@@ -608,7 +604,7 @@ class AzureUtils:
local_json_file,
predictions):
"""
store predictions (a .zip file) locally and upload
Store predictions (a .zip file) locally and upload
"""
azure_save_file_name = local_json_file.split("/")[-1][:-5]
if self.data_root_dir is None:
@@ -637,7 +633,7 @@ class AzureUtils:
partial_jobid,
earliest_time: Tuple[int, int, int] = None):
"""
get all blobs whose jobid configs match the partial_jobid
Get all blobs whose jobid configs match the partial_jobid
"""
blob_list = []
container_client = self._init_azure_clients()

View File

@@ -11,12 +11,8 @@ logger = logging.getLogger(__name__)
class AutoVW:
"""The AutoML class
Methods:
predict(data_sample)
learn(data_sample)
"""
WARMSTART_NUM = 100
AUTOMATIC = '_auto'
VW_INTERACTION_ARG_NAME = 'interactions'
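A hedged sketch of the online loop implied by the removed "Methods" note (predict, then learn, on each sample); the construction of autovw and the shape of data_sample are assumptions, not part of this diff.
# Assumes `autovw` is an already constructed AutoVW instance and `stream`
# yields VW-format data samples; both are hypothetical placeholders.
for data_sample in stream:
    y_pred = autovw.predict(data_sample)   # predict with the current best trial
    autovw.learn(data_sample)              # then update the live models online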
@@ -134,7 +130,7 @@ class AutoVW:
self._y_predict = self._best_trial.predict(data_sample)
# code for debugging purpose
if self._prediction_trial_id is None or \
self._prediction_trial_id != self._best_trial.trial_id:
self._prediction_trial_id != self._best_trial.trial_id:
self._prediction_trial_id = self._best_trial.trial_id
logger.info('prediction trial id changed to %s at iter %s, resource used: %s',
self._prediction_trial_id, self._iter,
@@ -160,7 +156,7 @@ class AutoVW:
or trial.result.resource_used >= self.WARMSTART_NUM):
score = trial.result.get_score(self._model_select_policy)
if ('min' == self._model_selection_mode and score < best_score) or \
('max' == self._model_selection_mode and score > best_score):
('max' == self._model_selection_mode and score > best_score):
best_score = score
new_best_trial = trial
if new_best_trial is not None:

View File

@@ -189,11 +189,15 @@ class BlendSearch(Searcher):
self._metric_constraint_penalty = None
def save(self, checkpoint_path: str):
''' save states to a checkpoint path
'''
save_object = self
with open(checkpoint_path, "wb") as outputFile:
pickle.dump(save_object, outputFile)
def restore(self, checkpoint_path: str):
''' restore states from checkpoint
'''
with open(checkpoint_path, "rb") as inputFile:
state = pickle.load(inputFile)
self._metric_target = state._metric_target
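A brief hedged usage sketch for the save/restore pair shown above; the searcher construction and checkpoint file name are placeholders.
# Hypothetical usage: `searcher` is a BlendSearch instance created elsewhere.
searcher.save("blendsearch_checkpoint.pkl")     # pickle the searcher's state
# ... later, possibly in a fresh process with an equivalent searcher ...
searcher.restore("blendsearch_checkpoint.pkl")  # reload state from the pickle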
@@ -220,9 +224,6 @@ class BlendSearch(Searcher):
def metric_target(self):
return self._metric_target
def restore_from_dir(self, checkpoint_dir: str):
super.restore_from_dir(checkpoint_dir)
def on_trial_complete(self, trial_id: str, result: Optional[Dict] = None,
error: bool = False):
''' search thread updater and cleaner
@@ -353,6 +354,8 @@ class BlendSearch(Searcher):
return False
def on_trial_result(self, trial_id: str, result: Dict):
''' receive intermediate result
'''
if trial_id not in self._trial_proposed_by:
return
thread_id = self._trial_proposed_by[trial_id]

View File

@@ -180,10 +180,10 @@ def run(training_function,
prune_attr: A string of the attribute used for pruning.
Not necessarily in space.
When prune_attr is in space, it is a hyperparameter, e.g.,
'n_iters', and the best value is unknown.
'n_iters', and the best value is unknown.
When prune_attr is not in space, it is a resource dimension,
e.g., 'sample_size', and the peak performance is assumed
to be at the max_resource.
e.g., 'sample_size', and the peak performance is assumed
to be at the max_resource.
min_resource: A float of the minimal resource to use for the
prune_attr; only valid if prune_attr is not in space.
max_resource: A float of the maximal resource to use for the