* fixing apidoc errors

Co-authored-by: Chi Wang (MSR) <wang.chi@microsoft.com>
Co-authored-by: liususan091219 <Xqq630517>
Xueqing Liu
2021-06-19 22:09:49 -04:00
committed by GitHub
parent 6133db84e8
commit d40993d920
6 changed files with 90 additions and 101 deletions

View File

@@ -2,21 +2,18 @@ import json
import os
import numpy as np
import time
import logging
try:
import ray
import transformers
from transformers import TrainingArguments
import datasets
import torch
from .dataset.task_auto import get_default_task
from .result_analysis.azure_utils import JobID
from .huggingface.trainer import TrainerForAutoTransformers
except ImportError:
print("To use the nlp component in flaml, run pip install flaml[nlp]")
from .dataset.task_auto import get_default_task
from .result_analysis.azure_utils import JobID
from .huggingface.trainer import TrainerForAutoTransformers
task_list = [
"seq-classification",
"regression",
@@ -116,20 +113,18 @@ class AutoTransformers:
fold_name=None,
resplit_portion=None,
**custom_data_args):
'''Prepare data
"""Prepare data
An example:
Example:
preparedata_setting = {
"server_name": "tmdev",
"data_root_path": "data/",
"max_seq_length": 128,
"jobid_config": jobid_config,
"wandb_utils": wandb_utils,
"resplit_portion": {"source": ["train", "validation"],
"train": [0, 0.8], "validation": [0.8, 0.9], "test": [0.9, 1.0]}
}
autohf.prepare_data(**preparedata_setting)
.. code-block:: python
preparedata_setting = {"server_name": "tmdev", "data_root_path": "data/", "max_seq_length": 128,
"jobid_config": jobid_config, "wandb_utils": wandb_utils,
"resplit_portion": {"source": ["train", "validation"],
"train": [0, 0.8], "validation": [0.8, 0.9], "test": [0.9, 1.0]}}
autohf.prepare_data(**preparedata_setting)
Args:
server_name:
@@ -148,7 +143,7 @@ class AutoTransformers:
If args.resplit_mode = "rspt", resplit_portion is required
is_wandb_on:
A boolean variable indicating whether wandb is used
'''
"""
from .dataset.dataprocess_auto import AutoEncodeText
from transformers import AutoTokenizer
from datasets import load_dataset
@@ -682,16 +677,20 @@ class AutoTransformers:
resources_per_trial=None,
ray_local_mode=False,
**custom_hpo_args):
'''Fine-tuning the huggingface model using the hpo setting
"""Fine-tuning the huggingface model using the hpo setting
An example:
autohf_settings = {"resources_per_trial": {"cpu": 1},
"num_samples": 1,
"time_budget": 100000,
"ckpt_per_epoch": 1,
"fp16": False,
}
validation_metric, analysis = autohf.fit(**autohf_settings)
Example:
.. code-block:: python
autohf_settings = {"resources_per_trial": {"cpu": 1},
"num_samples": 1,
"time_budget": 100000,
"ckpt_per_epoch": 1,
"fp16": False,
}
validation_metric, analysis = autohf.fit(**autohf_settings)
Args:
resources_per_trial:
@@ -710,28 +709,25 @@ class AutoTransformers:
ckpt_per_epoch:
An integer value of number of checkpoints per epoch, default = 1
ray_verbose:
int, default=1 | verbosity of ray,
An integer, default=1 | verbosity of ray,
transformers_verbose:
int, default=transformers.logging.INFO | verbosity of transformers, must be chosen from one of
An integer, default=transformers.logging.INFO | verbosity of transformers, must be chosen from one of
transformers.logging.ERROR, transformers.logging.INFO, transformers.logging.WARNING,
or transformers.logging.DEBUG
fp16:
boolean, default = True | whether to use fp16
A boolean, default = True | whether to use fp16
ray_local_mode:
boolean, default = False | whether to use the local mode (debugging mode) for ray tune.run
A boolean, default = False | whether to use the local mode (debugging mode) for ray tune.run
custom_hpo_args:
The additional keyword arguments, e.g.,
custom_hpo_args = {"points_to_evaluate": [{
"num_train_epochs": 1,
"per_device_train_batch_size": 128, }]}
The additional keyword arguments, e.g., custom_hpo_args = {"points_to_evaluate": [{
"num_train_epochs": 1, "per_device_train_batch_size": 128, }]}
Returns:
validation_metric:
a dict storing the validation score
analysis:
a ray.tune.analysis.Analysis object storing the analysis results from tune.run
'''
validation_metric: A dict storing the validation score
analysis: A ray.tune.analysis.Analysis object storing the analysis results from tune.run
"""
from .hpo.scheduler_auto import AutoScheduler
self._transformers_verbose = transformers_verbose
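As a reading aid for the new docstring, a hedged sketch of how the documented settings and the custom_hpo_args pass-through might be combined in one fit call; the autohf object and the concrete values are taken from the docstring examples above, not from a verified run.
# Hedged sketch: assumes `autohf` is an AutoTransformers instance whose data
# has already been prepared via prepare_data(), as in the earlier example.
autohf_settings = {"resources_per_trial": {"cpu": 1},
                   "num_samples": 1,
                   "time_budget": 100000,
                   "ckpt_per_epoch": 1,
                   "fp16": False,
                   # forwarded through **custom_hpo_args per the docstring
                   "points_to_evaluate": [{"num_train_epochs": 1,
                                           "per_device_train_batch_size": 128}]}
validation_metric, analysis = autohf.fit(**autohf_settings)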
@@ -854,14 +850,14 @@ class AutoTransformers:
Args:
predictions:
a list of predictions, which is the output of AutoTransformers.predict()
A list of predictions, which is the output of AutoTransformers.predict()
output_prediction_path:
output path for the prediction
Output path for the prediction
output_zip_file_name:
a string, which is the name of the output zip file
A string, which is the name of the output zip file
Returns:
the path of the output .zip file
The path of the output .zip file
"""
from .dataset.submission_auto import auto_output_prediction
return auto_output_prediction(self.jobid_config.dat,
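A hedged usage sketch based only on the argument names documented above; the enclosing method name is assumed to be output_prediction (it delegates to auto_output_prediction), and the path and file name are placeholders.
# Hypothetical call; `predictions` is assumed to come from autohf.predict(...).
zip_path = autohf.output_prediction(
    predictions=predictions,
    output_prediction_path="data/output/",        # placeholder output directory
    output_zip_file_name="cola_submission")       # placeholder zip file name
print(zip_path)  # per the docstring, the path of the generated .zip file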

View File

@@ -1,14 +1,12 @@
import os
import transformers
try:
from transformers import Trainer as TFTrainer
except ImportError:
TFTrainer = object
class TrainerForAutoTransformers(transformers.Trainer):
"""
Overriding transformers.Trainer.
Args:
huggingface (:class:`~transformers.PreTrainedModel` or :obj:`torch.nn.Module`, `optional`):
"""
class TrainerForAutoTransformers(TFTrainer):
def evaluate(self,
eval_dataset=None):

View File

@@ -132,28 +132,23 @@ class JobID:
self.sdhf = 42
def is_match(self, partial_jobid):
"""
return a boolean indicating whether the current object matches the partial jobid defined
in partial_jobid. For example,
self = JobID(dat = ['glue'],
subdat = 'cola',
mod = 'bestnn',
spa = 'buni',
arg = 'cus',
alg = 'bs',
pru = 'None',
pre = 'funnel',
presz = 'xlarge',
spt = 'rspt',
rep = 0,
sddt = 43,
sdhf = 42)
partial_jobid1 = JobID(dat = ['glue'],
subdat = 'cola',
mod = 'hpo')
partial_jobid2 = JobID(dat = ['glue'],
subdat = 'cola',
mod = 'bestnn')
"""Return a boolean variable whether the current object matches the partial jobid defined in partial_jobid.
Example:
.. code-block:: python
self = JobID(dat = ['glue'], subdat = 'cola', mod = 'bestnn', spa = 'buni', arg = 'cus', alg = 'bs',
pru = 'None', pre = 'funnel', presz = 'xlarge', spt = 'rspt', rep = 0, sddt = 43, sdhf = 42)
partial_jobid1 = JobID(dat = ['glue'],
subdat = 'cola',
mod = 'hpo')
partial_jobid2 = JobID(dat = ['glue'],
subdat = 'cola',
mod = 'bestnn')
return False for partial_jobid1 and True for partial_jobid2
"""
is_not_match = False
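To make the docstring example concrete, a short hedged illustration of the expected results; it restates the docstring's claim rather than an executed test.
# Using the objects defined in the docstring example above:
# partial_jobid1 sets mod='hpo', which conflicts with mod='bestnn' in `self`;
# every field set in partial_jobid2 agrees with `self`.
self.is_match(partial_jobid1)   # -> False
self.is_match(partial_jobid2)   # -> True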
@@ -166,7 +161,7 @@ class JobID:
def to_wandb_string(self):
"""
preparing the job ID for wandb
Preparing the job ID for wandb
"""
field_dict = self.__dict__
keytoval_str = "_".join([JobID.dataset_list_to_str(field_dict[key])
@@ -177,7 +172,7 @@ class JobID:
def to_jobid_string(self):
"""
convert the current JobID into a blob name string which contains all the fields
Convert the current JobID into a blob name string which contains all the fields
"""
list_keys = list(JobID.__dataclass_fields__.keys())
field_dict = self.__dict__
@@ -189,7 +184,7 @@ class JobID:
def to_partial_jobid_string(self):
"""
convert the current JobID into a blob name string which only contains the fields whose values are not "None"
Convert the current JobID into a blob name string which only contains the fields whose values are not "None"
"""
list_keys = list(JobID.__dataclass_fields__.keys())
field_dict = self.__dict__ # field_dict contains fields whose values are not None
@@ -202,9 +197,10 @@ class JobID:
@staticmethod
def blobname_to_jobid_dict(keytoval_str):
"""
converting an azure blobname to a JobID config,
Converting an azure blobname to a JobID config,
e.g., blobname = "dat=glue_subdat=cola_mod=bestnn_spa=buni_arg=cus_
alg=bs_pru=None_pre=funnel_presz=xlarge_spt=rspt_rep=0.json"
alg=bs_pru=None_pre=funnel_presz=xlarge_spt=rspt_rep=0.json"
the converted jobid dict = {dat = ['glue'], subdat = 'cola', mod = 'bestnn',
spa = 'buni', arg = 'cus', alg = 'bs', pru = 'None',
pre = 'funnel', presz = 'xlarge', spt = 'rspt',
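To illustrate the blobname format described above, a small standalone sketch of how such a name could be split into a key/value dict; parse_blobname is a hypothetical helper for illustration, not the library's blobname_to_jobid_dict implementation (which also converts field types such as the dat list and the rep integer).
# Hypothetical parser for the "key=value_key=value_....json" naming scheme.
def parse_blobname(blobname: str) -> dict:
    stem = blobname[:-len(".json")] if blobname.endswith(".json") else blobname
    fields = {}
    for chunk in stem.split("_"):
        key, _, value = chunk.partition("=")
        fields[key] = value
    return fields

parse_blobname("dat=glue_subdat=cola_mod=bestnn_spa=buni_arg=cus_"
               "alg=bs_pru=None_pre=funnel_presz=xlarge_spt=rspt_rep=0.json")
# -> {'dat': 'glue', 'subdat': 'cola', 'mod': 'bestnn', ..., 'rep': '0'}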
@@ -257,7 +253,7 @@ class JobID:
**jobid_list
):
"""
set the jobid from a dict object
Set the jobid from a dict object
"""
for key in jobid_list.keys():
assert key in JobID.__dataclass_fields__.keys()
@@ -268,7 +264,7 @@ class JobID:
@staticmethod
def convert_blobname_to_jobid(blobname):
"""
converting a blobname string to a JobID object
Converting a blobname string to a JobID object
"""
jobconfig_dict = JobID.blobname_to_jobid_dict(blobname)
if jobconfig_dict:
@@ -281,7 +277,7 @@ class JobID:
@staticmethod
def get_full_data_name(dataset_name: Union[list, str], subdataset_name=None):
"""
convert a dataset name and sub dataset name to a full dataset name
Convert a dataset name and sub dataset name to a full dataset name
"""
if isinstance(dataset_name, list):
full_dataset_name = JobID.dataset_list_to_str(dataset_name)
@@ -293,7 +289,7 @@ class JobID:
def get_jobid_full_data_name(self):
"""
get the full dataset name of the current JobID object
Get the full dataset name of the current JobID object
"""
return JobID.get_full_data_name(JobID.dataset_list_to_str(self.dat), self.subdat)
@@ -573,7 +569,7 @@ class AzureUtils:
predictions=None,
duration=None):
"""
write the key info from a job and upload to azure blob storage
Write the key info from a job and upload to azure blob storage
"""
local_file_path = self.generate_local_json_path()
output_json = {}
@@ -590,7 +586,7 @@ class AzureUtils:
def generate_local_json_path(self):
"""
return a path string for storing the json file locally
Return a path string for storing the json file locally
"""
full_dataset_name = self.jobid.get_jobid_full_data_name()
jobid_str = self.jobid.to_jobid_string()
@@ -608,7 +604,7 @@ class AzureUtils:
local_json_file,
predictions):
"""
store predictions (a .zip file) locally and upload
Store predictions (a .zip file) locally and upload
"""
azure_save_file_name = local_json_file.split("/")[-1][:-5]
if self.data_root_dir is None:
@@ -637,7 +633,7 @@ class AzureUtils:
partial_jobid,
earliest_time: Tuple[int, int, int] = None):
"""
get all blobs whose jobid configs match the partial_jobid
Get all blobs whose jobid configs match the partial_jobid
"""
blob_list = []
container_client = self._init_azure_clients()

View File

@@ -11,12 +11,8 @@ logger = logging.getLogger(__name__)
class AutoVW:
"""The AutoML class
Methods:
predict(data_sample)
learn(data_sample)
"""
WARMSTART_NUM = 100
AUTOMATIC = '_auto'
VW_INTERACTION_ARG_NAME = 'interactions'
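A hedged sketch of the online loop implied by the removed "Methods" note (predict, then learn, on each sample); the construction of autovw and the shape of data_sample are assumptions, not part of this diff.
# Assumes `autovw` is an already constructed AutoVW instance and `stream`
# yields VW-format data samples; both are hypothetical placeholders.
for data_sample in stream:
    y_pred = autovw.predict(data_sample)   # predict with the current best trial
    autovw.learn(data_sample)              # then update the live models online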
@@ -134,7 +130,7 @@ class AutoVW:
self._y_predict = self._best_trial.predict(data_sample)
# code for debugging purpose
if self._prediction_trial_id is None or \
self._prediction_trial_id != self._best_trial.trial_id:
self._prediction_trial_id != self._best_trial.trial_id:
self._prediction_trial_id = self._best_trial.trial_id
logger.info('prediction trial id changed to %s at iter %s, resource used: %s',
self._prediction_trial_id, self._iter,
@@ -160,7 +156,7 @@ class AutoVW:
or trial.result.resource_used >= self.WARMSTART_NUM):
score = trial.result.get_score(self._model_select_policy)
if ('min' == self._model_selection_mode and score < best_score) or \
('max' == self._model_selection_mode and score > best_score):
('max' == self._model_selection_mode and score > best_score):
best_score = score
new_best_trial = trial
if new_best_trial is not None:

View File

@@ -189,11 +189,15 @@ class BlendSearch(Searcher):
self._metric_constraint_penalty = None
def save(self, checkpoint_path: str):
''' save states to a checkpoint path
'''
save_object = self
with open(checkpoint_path, "wb") as outputFile:
pickle.dump(save_object, outputFile)
def restore(self, checkpoint_path: str):
''' restore states from checkpoint
'''
with open(checkpoint_path, "rb") as inputFile:
state = pickle.load(inputFile)
self._metric_target = state._metric_target
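A brief hedged usage sketch for the save/restore pair shown above; the searcher construction and checkpoint file name are placeholders.
# Hypothetical usage: `searcher` is a BlendSearch instance created elsewhere.
searcher.save("blendsearch_checkpoint.pkl")     # pickle the searcher's state
# ... later, possibly in a fresh process with an equivalent searcher ...
searcher.restore("blendsearch_checkpoint.pkl")  # reload state from the pickle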
@@ -220,9 +224,6 @@ class BlendSearch(Searcher):
def metric_target(self):
return self._metric_target
def restore_from_dir(self, checkpoint_dir: str):
super.restore_from_dir(checkpoint_dir)
def on_trial_complete(self, trial_id: str, result: Optional[Dict] = None,
error: bool = False):
''' search thread updater and cleaner
@@ -353,6 +354,8 @@ class BlendSearch(Searcher):
return False
def on_trial_result(self, trial_id: str, result: Dict):
''' receive intermediate result
'''
if trial_id not in self._trial_proposed_by:
return
thread_id = self._trial_proposed_by[trial_id]

View File

@@ -180,10 +180,10 @@ def run(training_function,
prune_attr: A string of the attribute used for pruning.
Not necessarily in space.
When prune_attr is in space, it is a hyperparameter, e.g.,
'n_iters', and the best value is unknown.
'n_iters', and the best value is unknown.
When prune_attr is not in space, it is a resource dimension,
e.g., 'sample_size', and the peak performance is assumed
to be at the max_resource.
e.g., 'sample_size', and the peak performance is assumed
to be at the max_resource.
min_resource: A float of the minimal resource to use for the
prune_attr; only valid if prune_attr is not in space.
max_resource: A float of the maximal resource to use for the