diff --git a/flaml/nlp/autotransformers.py b/flaml/nlp/autotransformers.py
index 29f979edc..803290e40 100644
--- a/flaml/nlp/autotransformers.py
+++ b/flaml/nlp/autotransformers.py
@@ -2,21 +2,18 @@ import json
 import os
 import numpy as np
 import time
-import logging
 
 try:
     import ray
     import transformers
     from transformers import TrainingArguments
     import datasets
-    import torch
+    from .dataset.task_auto import get_default_task
+    from .result_analysis.azure_utils import JobID
+    from .huggingface.trainer import TrainerForAutoTransformers
 except ImportError:
     print("To use the nlp component in flaml, run pip install flaml[nlp]")
 
-from .dataset.task_auto import get_default_task
-from .result_analysis.azure_utils import JobID
-from .huggingface.trainer import TrainerForAutoTransformers
-
 task_list = [
     "seq-classification",
     "regression",
@@ -116,20 +113,18 @@ class AutoTransformers:
                      fold_name=None,
                      resplit_portion=None,
                      **custom_data_args):
-        '''Prepare data
+        """Prepare data
 
-        An example:
+        Example:
 
-            preparedata_setting = {
-                "server_name": "tmdev",
-                "data_root_path": "data/",
-                "max_seq_length": 128,
-                "jobid_config": jobid_config,
-                "wandb_utils": wandb_utils,
-                "resplit_portion": {"source": ["train", "validation"],
-                "train": [0, 0.8], "validation": [0.8, 0.9], "test": [0.9, 1.0]}
-            }
-            autohf.prepare_data(**preparedata_setting)
+            .. code-block:: python
+
+                preparedata_setting = {"server_name": "tmdev", "data_root_path": "data/", "max_seq_length": 128,
+                    "jobid_config": jobid_config, "wandb_utils": wandb_utils,
+                    "resplit_portion": {"source": ["train", "validation"],
+                    "train": [0, 0.8], "validation": [0.8, 0.9], "test": [0.9, 1.0]}}
+
+                autohf.prepare_data(**preparedata_setting)
 
         Args:
             server_name:
@@ -148,7 +143,7 @@ class AutoTransformers:
                 If args.resplit_mode = "rspt", resplit_portion is required
             is_wandb_on:
                 A boolean variable indicating whether wandb is used
-        '''
+        """
         from .dataset.dataprocess_auto import AutoEncodeText
         from transformers import AutoTokenizer
         from datasets import load_dataset
@@ -682,16 +677,20 @@ class AutoTransformers:
             resources_per_trial=None,
             ray_local_mode=False,
             **custom_hpo_args):
-        '''Fine tuning the huggingface using the hpo setting
+        """Fine tuning the huggingface model using the hpo setting
 
-        An example:
-            autohf_settings = {"resources_per_trial": {"cpu": 1},
-                               "num_samples": 1,
-                               "time_budget": 100000,
-                               "ckpt_per_epoch": 1,
-                               "fp16": False,
-                              }
-            validation_metric, analysis = autohf.fit(**autohf_settings)
+        Example:
+
+            .. code-block:: python
+
+                autohf_settings = {"resources_per_trial": {"cpu": 1},
+                                   "num_samples": 1,
+                                   "time_budget": 100000,
+                                   "ckpt_per_epoch": 1,
+                                   "fp16": False,
+                                   }
+
+                validation_metric, analysis = autohf.fit(**autohf_settings)
 
         Args:
             resources_per_trial:
@@ -710,28 +709,25 @@ class AutoTransformers:
             ckpt_per_epoch:
                 An integer value of number of checkpoints per epoch, default = 1
             ray_verbose:
-                int, default=1 | verbosit of ray,
+                An integer, default=1 | verbosity of ray,
             transformers_verbose:
-                int, default=transformers.logging.INFO | verbosity of transformers, must be chosen from one of
+                An integer, default=transformers.logging.INFO | verbosity of transformers, must be chosen from one of
                 transformers.logging.ERROR, transformers.logging.INFO, transformers.logging.WARNING,
                 or transformers.logging.DEBUG
             fp16:
-                boolean, default = True | whether to use fp16
+                A boolean, default = True | whether to use fp16
             ray_local_mode:
-                boolean, default = False | whether to use the local mode (debugging mode) for ray tune.run
+                A boolean, default = False | whether to use the local mode (debugging mode) for ray tune.run
             custom_hpo_args:
-                The additional keyword arguments, e.g.,
-                custom_hpo_args = {"points_to_evaluate": [{
-                "num_train_epochs": 1,
-                "per_device_train_batch_size": 128, }]}
+                The additional keyword arguments, e.g., custom_hpo_args = {"points_to_evaluate": [{
+                "num_train_epochs": 1, "per_device_train_batch_size": 128, }]}
 
         Returns:
-            validation_metric:
-                a dict storing the validation score
-            analysis:
-                a ray.tune.analysis.Analysis object storing the analysis results from tune.run
-        '''
+            validation_metric: A dict storing the validation score
+
+            analysis: A ray.tune.analysis.Analysis object storing the analysis results from tune.run
+        """
         from .hpo.scheduler_auto import AutoScheduler
 
         self._transformers_verbose = transformers_verbose
@@ -854,14 +850,14 @@ class AutoTransformers:
 
         Args:
             predictions:
-                a list of predictions, which is the output of AutoTransformers.predict()
+                A list of predictions, which is the output of AutoTransformers.predict()
             output_prediction_path:
-                output path for the prediction
+                Output path for the prediction
             output_zip_file_name:
-                an string, which is the name of the output zip file
+                A string, which is the name of the output zip file
 
         Returns:
-            the path of the output .zip file
+            The path of the output .zip file
         """
         from .dataset.submission_auto import auto_output_prediction
         return auto_output_prediction(self.jobid_config.dat,
diff --git a/flaml/nlp/huggingface/trainer.py b/flaml/nlp/huggingface/trainer.py
index ce2c2c438..53ac6a323 100644
--- a/flaml/nlp/huggingface/trainer.py
+++ b/flaml/nlp/huggingface/trainer.py
@@ -1,14 +1,12 @@
 import os
-import transformers
+
+try:
+    from transformers import Trainer as TFTrainer
+except ImportError:
+    TFTrainer = object
 
 
-class TrainerForAutoTransformers(transformers.Trainer):
-    """
-    Overriding transformers.Trainer.
-
-    Args:
-        huggingface (:class:`~transformers.PreTrainedModel` or :obj:`torch.nn.Module`, `optional`):
-    """
+class TrainerForAutoTransformers(TFTrainer):
 
     def evaluate(self, eval_dataset=None):
diff --git a/flaml/nlp/result_analysis/azure_utils.py b/flaml/nlp/result_analysis/azure_utils.py
index b87de3d7d..850cf3cfd 100644
--- a/flaml/nlp/result_analysis/azure_utils.py
+++ b/flaml/nlp/result_analysis/azure_utils.py
@@ -132,28 +132,23 @@ class JobID:
             self.sdhf = 42
 
     def is_match(self, partial_jobid):
-        """
-            return a boolean variable whether the current object matches the partial jobid defined
-            in partial_jobid. For example,
-                self = JobID(dat = ['glue'],
-                             subdat = 'cola',
-                             mod = 'bestnn',
-                             spa = 'buni',
-                             arg = 'cus',
-                             alg = 'bs',
-                             pru = 'None',
-                             pre = 'funnel',
-                             presz = 'xlarge',
-                             spt = 'rspt',
-                             rep = 0,
-                             sddt = 43,
-                             sdhf = 42)
-                partial_jobid1 = JobID(dat = ['glue'],
-                                       subdat = 'cola',
-                                       mod = 'hpo')
-                partial_jobid2 = JobID(dat = ['glue'],
-                                       subdat = 'cola',
-                                       mod = 'bestnn')
+        """Return a boolean variable whether the current object matches the partial jobid defined in partial_jobid.
+
+        Example:
+
+            .. code-block:: python
+
+                self = JobID(dat = ['glue'], subdat = 'cola', mod = 'bestnn', spa = 'buni', arg = 'cus', alg = 'bs',
+                    pru = 'None', pre = 'funnel', presz = 'xlarge', spt = 'rspt', rep = 0, sddt = 43, sdhf = 42)
+
+                partial_jobid1 = JobID(dat = ['glue'],
+                                       subdat = 'cola',
+                                       mod = 'hpo')
+
+                partial_jobid2 = JobID(dat = ['glue'],
+                                       subdat = 'cola',
+                                       mod = 'bestnn')
+
         return False for partial_jobid1 and True for partial_jobid2
         """
         is_not_match = False
@@ -166,7 +161,7 @@ class JobID:
 
     def to_wandb_string(self):
         """
-            preparing for the job ID for wandb
+            Preparing the job ID for wandb
         """
         field_dict = self.__dict__
         keytoval_str = "_".join([JobID.dataset_list_to_str(field_dict[key])
@@ -177,7 +172,7 @@ class JobID:
 
     def to_jobid_string(self):
         """
-            convert the current JobID into a blob name string which contains all the fields
+            Convert the current JobID into a blob name string which contains all the fields
         """
         list_keys = list(JobID.__dataclass_fields__.keys())
         field_dict = self.__dict__
@@ -189,7 +184,7 @@ class JobID:
 
     def to_partial_jobid_string(self):
         """
-            convert the current JobID into a blob name string which only contains the fields whose values are not "None"
+            Convert the current JobID into a blob name string which only contains the fields whose values are not "None"
         """
         list_keys = list(JobID.__dataclass_fields__.keys())
         field_dict = self.__dict__  # field_dict contains fields whose values are not None
@@ -202,9 +197,10 @@ class JobID:
     @staticmethod
     def blobname_to_jobid_dict(keytoval_str):
         """
-            converting an azure blobname to a JobID config,
+            Converting an azure blobname to a JobID config,
             e.g., blobname = "dat=glue_subdat=cola_mod=bestnn_spa=buni_arg=cus_
-            alg=bs_pru=None_pre=funnel_presz=xlarge_spt=rspt_rep=0.json"
+            alg=bs_pru=None_pre=funnel_presz=xlarge_spt=rspt_rep=0.json"
+
             the converted jobid dict = {dat = ['glue'], subdat = 'cola', mod = 'bestnn',
             spa = 'buni', arg = 'cus', alg = 'bs', pru = 'None', pre = 'funnel',
             presz = 'xlarge', spt = 'rspt',
@@ -257,7 +253,7 @@ class JobID:
                               **jobid_list
                               ):
         """
-            set the jobid from a dict object
+            Set the jobid from a dict object
         """
         for key in jobid_list.keys():
             assert key in JobID.__dataclass_fields__.keys()
@@ -268,7 +264,7 @@ class JobID:
     @staticmethod
     def convert_blobname_to_jobid(blobname):
         """
-            converting a blobname string to a JobID object
+            Converting a blobname string to a JobID object
         """
         jobconfig_dict = JobID.blobname_to_jobid_dict(blobname)
         if jobconfig_dict:
@@ -281,7 +277,7 @@ class JobID:
     @staticmethod
     def get_full_data_name(dataset_name: Union[list, str], subdataset_name=None):
         """
-            convert a dataset name and sub dataset name to a full dataset name
+            Convert a dataset name and sub dataset name to a full dataset name
         """
         if isinstance(dataset_name, list):
             full_dataset_name = JobID.dataset_list_to_str(dataset_name)
@@ -293,7 +289,7 @@ class JobID:
 
     def get_jobid_full_data_name(self):
         """
-            get the full dataset name of the current JobID object
+            Get the full dataset name of the current JobID object
         """
         return JobID.get_full_data_name(JobID.dataset_list_to_str(self.dat),
                                         self.subdat)
@@ -573,7 +569,7 @@ class AzureUtils:
                                    predictions=None,
                                    duration=None):
         """
-            write the key info from a job and upload to azure blob storage
+            Write the key info from a job and upload to azure blob storage
         """
         local_file_path = self.generate_local_json_path()
         output_json = {}
@@ -590,7 +586,7 @@ class AzureUtils:
 
     def generate_local_json_path(self):
         """
-            return a path string for storing the json file locally
+            Return a path string for storing the json file locally
         """
         full_dataset_name = self.jobid.get_jobid_full_data_name()
         jobid_str = self.jobid.to_jobid_string()
@@ -608,7 +604,7 @@ class AzureUtils:
                                local_json_file,
                                predictions):
         """
-            store predictions (a .zip file) locally and upload
+            Store predictions (a .zip file) locally and upload
         """
         azure_save_file_name = local_json_file.split("/")[-1][:-5]
         if self.data_root_dir is None:
@@ -637,7 +633,7 @@ class AzureUtils:
                                partial_jobid,
                                earliest_time: Tuple[int, int, int] = None):
         """
-            get all blobs whose jobid configs match the partial_jobid
+            Get all blobs whose jobid configs match the partial_jobid
         """
         blob_list = []
         container_client = self._init_azure_clients()
diff --git a/flaml/onlineml/autovw.py b/flaml/onlineml/autovw.py
index f1e4a1624..8ccddd55e 100644
--- a/flaml/onlineml/autovw.py
+++ b/flaml/onlineml/autovw.py
@@ -11,12 +11,8 @@ logger = logging.getLogger(__name__)
 
 class AutoVW:
     """The AutoML class
-
-    Methods:
-        predict(data_sample)
-        learn(data_sample)
-    AUTO
     """
+
     WARMSTART_NUM = 100
     AUTOMATIC = '_auto'
     VW_INTERACTION_ARG_NAME = 'interactions'
@@ -134,7 +130,7 @@ class AutoVW:
         self._y_predict = self._best_trial.predict(data_sample)
         # code for debugging purpose
         if self._prediction_trial_id is None or \
-            self._prediction_trial_id != self._best_trial.trial_id:
+                self._prediction_trial_id != self._best_trial.trial_id:
             self._prediction_trial_id = self._best_trial.trial_id
             logger.info('prediction trial id changed to %s at iter %s, resource used: %s',
                         self._prediction_trial_id, self._iter,
@@ -160,7 +156,7 @@ class AutoVW:
                     or trial.result.resource_used >= self.WARMSTART_NUM):
                 score = trial.result.get_score(self._model_select_policy)
                 if ('min' == self._model_selection_mode and score < best_score) or \
-                    ('max' == self._model_selection_mode and score > best_score):
+                        ('max' == self._model_selection_mode and score > best_score):
                     best_score = score
                     new_best_trial = trial
         if new_best_trial is not None:
diff --git a/flaml/searcher/blendsearch.py b/flaml/searcher/blendsearch.py
index 1fba8f04b..69f8edfaa 100644
--- a/flaml/searcher/blendsearch.py
+++ b/flaml/searcher/blendsearch.py
@@ -189,11 +189,15 @@ class BlendSearch(Searcher):
             self._metric_constraint_penalty = None
 
     def save(self, checkpoint_path: str):
+        ''' save states to a checkpoint path
+        '''
         save_object = self
         with open(checkpoint_path, "wb") as outputFile:
             pickle.dump(save_object, outputFile)
 
     def restore(self, checkpoint_path: str):
+        ''' restore states from checkpoint
+        '''
         with open(checkpoint_path, "rb") as inputFile:
             state = pickle.load(inputFile)
         self._metric_target = state._metric_target
@@ -220,9 +224,6 @@ class BlendSearch(Searcher):
     def metric_target(self):
         return self._metric_target
 
-    def restore_from_dir(self, checkpoint_dir: str):
-        super.restore_from_dir(checkpoint_dir)
-
     def on_trial_complete(self, trial_id: str, result: Optional[Dict] = None,
                           error: bool = False):
         ''' search thread updater and cleaner
@@ -353,6 +354,8 @@ class BlendSearch(Searcher):
         return False
 
     def on_trial_result(self, trial_id: str, result: Dict):
+        ''' receive intermediate result
+        '''
         if trial_id not in self._trial_proposed_by:
             return
         thread_id = self._trial_proposed_by[trial_id]
diff --git a/flaml/tune/tune.py b/flaml/tune/tune.py
index 4a3b02cf8..46f5453f1 100644
--- a/flaml/tune/tune.py
+++ b/flaml/tune/tune.py
@@ -180,10 +180,10 @@ def run(training_function,
         prune_attr: A string of the attribute used for pruning.
             Not necessarily in space.
             When prune_attr is in space, it is a hyperparameter, e.g.,
-                'n_iters', and the best value is unknown.
+            'n_iters', and the best value is unknown.
             When prune_attr is not in space, it is a resource dimension,
-                e.g., 'sample_size', and the peak performance is assumed
-                to be at the max_resource.
+            e.g., 'sample_size', and the peak performance is assumed
+            to be at the max_resource.
         min_resource: A float of the minimal resource to use for the prune_attr;
             only valid if prune_attr is not in space.
         max_resource: A float of the maximal resource to use for the