diff --git a/flaml/nlp/autotransformers.py b/flaml/nlp/autotransformers.py
index 29f979edc..803290e40 100644
--- a/flaml/nlp/autotransformers.py
+++ b/flaml/nlp/autotransformers.py
@@ -2,21 +2,18 @@ import json
 import os
 import numpy as np
 import time
-import logging
 
 try:
     import ray
     import transformers
     from transformers import TrainingArguments
     import datasets
-    import torch
+    from .dataset.task_auto import get_default_task
+    from .result_analysis.azure_utils import JobID
+    from .huggingface.trainer import TrainerForAutoTransformers
 except ImportError:
     print("To use the nlp component in flaml, run pip install flaml[nlp]")
 
-from .dataset.task_auto import get_default_task
-from .result_analysis.azure_utils import JobID
-from .huggingface.trainer import TrainerForAutoTransformers
-
 task_list = [
     "seq-classification",
     "regression",
@@ -116,20 +113,18 @@ class AutoTransformers:
                      fold_name=None,
                      resplit_portion=None,
                      **custom_data_args):
-        '''Prepare data
+        """Prepare data
 
-        An example:
+        Example:
 
-            preparedata_setting = {
-                "server_name": "tmdev",
-                "data_root_path": "data/",
-                "max_seq_length": 128,
-                "jobid_config": jobid_config,
-                "wandb_utils": wandb_utils,
-                "resplit_portion": {"source": ["train", "validation"],
-                "train": [0, 0.8], "validation": [0.8, 0.9], "test": [0.9, 1.0]}
-            }
-            autohf.prepare_data(**preparedata_setting)
+            .. code-block:: python
+
+                preparedata_setting = {"server_name": "tmdev", "data_root_path": "data/", "max_seq_length": 128,
+                    "jobid_config": jobid_config, "wandb_utils": wandb_utils,
+                    "resplit_portion": {"source": ["train", "validation"],
+                    "train": [0, 0.8], "validation": [0.8, 0.9], "test": [0.9, 1.0]}}
+
+                autohf.prepare_data(**preparedata_setting)
 
         Args:
             server_name:
@@ -148,7 +143,7 @@ class AutoTransformers:
                 If args.resplit_mode = "rspt", resplit_portion is required
             is_wandb_on:
                 A boolean variable indicating whether wandb is used
-        '''
+        """
         from .dataset.dataprocess_auto import AutoEncodeText
         from transformers import AutoTokenizer
         from datasets import load_dataset
@@ -682,16 +677,20 @@ class AutoTransformers:
             resources_per_trial=None,
             ray_local_mode=False,
             **custom_hpo_args):
-        '''Fine tuning the huggingface using the hpo setting
+        """Fine tuning the huggingface model using the hpo setting
 
-        An example:
-            autohf_settings = {"resources_per_trial": {"cpu": 1},
-                               "num_samples": 1,
-                               "time_budget": 100000,
-                               "ckpt_per_epoch": 1,
-                               "fp16": False,
-                              }
-            validation_metric, analysis = autohf.fit(**autohf_settings)
+        Example:
+
+            .. code-block:: python
+
+                autohf_settings = {"resources_per_trial": {"cpu": 1},
+                                   "num_samples": 1,
+                                   "time_budget": 100000,
+                                   "ckpt_per_epoch": 1,
+                                   "fp16": False,
+                                   }
+
+                validation_metric, analysis = autohf.fit(**autohf_settings)
 
         Args:
             resources_per_trial:
@@ -710,28 +709,25 @@ class AutoTransformers:
             ckpt_per_epoch:
                 An integer value of number of checkpoints per epoch, default = 1
             ray_verbose:
-                int, default=1 | verbosit of ray,
+                An integer, default=1 | verbosity of ray,
             transformers_verbose:
-                int, default=transformers.logging.INFO | verbosity of transformers, must be chosen from one of
+                An integer, default=transformers.logging.INFO | verbosity of transformers, must be chosen from one of
                 transformers.logging.ERROR, transformers.logging.INFO, transformers.logging.WARNING,
                 or transformers.logging.DEBUG
             fp16:
-                boolean, default = True | whether to use fp16
+                A boolean, default = True | whether to use fp16
             ray_local_mode:
-                boolean, default = False | whether to use the local mode (debugging mode) for ray tune.run
+                A boolean, default = False | whether to use the local mode (debugging mode) for ray tune.run
             custom_hpo_args:
-                The additional keyword arguments, e.g.,
-                custom_hpo_args = {"points_to_evaluate": [{
-                "num_train_epochs": 1,
-                "per_device_train_batch_size": 128, }]}
+                The additional keyword arguments, e.g., custom_hpo_args = {"points_to_evaluate": [{
+                "num_train_epochs": 1, "per_device_train_batch_size": 128, }]}
 
         Returns:
-            validation_metric:
-                a dict storing the validation score
-            analysis:
-                a ray.tune.analysis.Analysis object storing the analysis results from tune.run
-        '''
+            validation_metric: A dict storing the validation score
+
+            analysis: A ray.tune.analysis.Analysis object storing the analysis results from tune.run
+        """
         from .hpo.scheduler_auto import AutoScheduler
 
         self._transformers_verbose = transformers_verbose
@@ -854,14 +850,14 @@ class AutoTransformers:
 
         Args:
             predictions:
-                a list of predictions, which is the output of AutoTransformers.predict()
+                A list of predictions, which is the output of AutoTransformers.predict()
             output_prediction_path:
-                output path for the prediction
+                Output path for the prediction
             output_zip_file_name:
-                an string, which is the name of the output zip file
+                A string, which is the name of the output zip file
 
         Returns:
-            the path of the output .zip file
+            The path of the output .zip file
         """
         from .dataset.submission_auto import auto_output_prediction
         return auto_output_prediction(self.jobid_config.dat,
diff --git a/flaml/nlp/huggingface/trainer.py b/flaml/nlp/huggingface/trainer.py
index ce2c2c438..53ac6a323 100644
--- a/flaml/nlp/huggingface/trainer.py
+++ b/flaml/nlp/huggingface/trainer.py
@@ -1,14 +1,12 @@
 import os
-import transformers
+
+try:
+    from transformers import Trainer as TFTrainer
+except ImportError:
+    TFTrainer = object
 
 
-class TrainerForAutoTransformers(transformers.Trainer):
-    """
-    Overriding transformers.Trainer.
-
-    Args:
-        huggingface (:class:`~transformers.PreTrainedModel` or :obj:`torch.nn.Module`, `optional`):
-    """
+class TrainerForAutoTransformers(TFTrainer):
 
     def evaluate(self, eval_dataset=None):
diff --git a/flaml/nlp/result_analysis/azure_utils.py b/flaml/nlp/result_analysis/azure_utils.py
index b87de3d7d..850cf3cfd 100644
--- a/flaml/nlp/result_analysis/azure_utils.py
+++ b/flaml/nlp/result_analysis/azure_utils.py
@@ -132,28 +132,23 @@ class JobID:
             self.sdhf = 42
 
     def is_match(self, partial_jobid):
-        """
-            return a boolean variable whether the current object matches the partial jobid defined
-            in partial_jobid. For example,
-                self = JobID(dat = ['glue'],
-                             subdat = 'cola',
-                             mod = 'bestnn',
-                             spa = 'buni',
-                             arg = 'cus',
-                             alg = 'bs',
-                             pru = 'None',
-                             pre = 'funnel',
-                             presz = 'xlarge',
-                             spt = 'rspt',
-                             rep = 0,
-                             sddt = 43,
-                             sdhf = 42)
-                partial_jobid1 = JobID(dat = ['glue'],
-                                       subdat = 'cola',
-                                       mod = 'hpo')
-                partial_jobid2 = JobID(dat = ['glue'],
-                                       subdat = 'cola',
-                                       mod = 'bestnn')
+        """Return a boolean variable whether the current object matches the partial jobid defined in partial_jobid.
+
+        Example:
+
+            .. code-block:: python
+
+                self = JobID(dat = ['glue'], subdat = 'cola', mod = 'bestnn', spa = 'buni', arg = 'cus', alg = 'bs',
+                    pru = 'None', pre = 'funnel', presz = 'xlarge', spt = 'rspt', rep = 0, sddt = 43, sdhf = 42)
+
+                partial_jobid1 = JobID(dat = ['glue'],
+                                       subdat = 'cola',
+                                       mod = 'hpo')
+
+                partial_jobid2 = JobID(dat = ['glue'],
+                                       subdat = 'cola',
+                                       mod = 'bestnn')
+
         return False for partial_jobid1 and True for partial_jobid2
         """
         is_not_match = False
@@ -166,7 +161,7 @@ class JobID:
 
     def to_wandb_string(self):
         """
-            preparing for the job ID for wandb
+            Preparing the job ID for wandb
         """
         field_dict = self.__dict__
         keytoval_str = "_".join([JobID.dataset_list_to_str(field_dict[key])
@@ -177,7 +172,7 @@ class JobID:
 
     def to_jobid_string(self):
         """
-            convert the current JobID into a blob name string which contains all the fields
+            Convert the current JobID into a blob name string which contains all the fields
         """
         list_keys = list(JobID.__dataclass_fields__.keys())
         field_dict = self.__dict__
@@ -189,7 +184,7 @@ class JobID:
 
     def to_partial_jobid_string(self):
         """
-            convert the current JobID into a blob name string which only contains the fields whose values are not "None"
+            Convert the current JobID into a blob name string which only contains the fields whose values are not "None"
         """
         list_keys = list(JobID.__dataclass_fields__.keys())
         field_dict = self.__dict__  # field_dict contains fields whose values are not None
@@ -202,9 +197,10 @@ class JobID:
     @staticmethod
     def blobname_to_jobid_dict(keytoval_str):
         """
-            converting an azure blobname to a JobID config,
+            Converting an azure blobname to a JobID config,
             e.g., blobname = "dat=glue_subdat=cola_mod=bestnn_spa=buni_arg=cus_
-            alg=bs_pru=None_pre=funnel_presz=xlarge_spt=rspt_rep=0.json"
+            alg=bs_pru=None_pre=funnel_presz=xlarge_spt=rspt_rep=0.json"
+
             the converted jobid dict = {dat = ['glue'], subdat = 'cola', mod = 'bestnn',
             spa = 'buni', arg = 'cus', alg = 'bs', pru = 'None', pre = 'funnel',
             presz = 'xlarge', spt = 'rspt',
@@ -257,7 +253,7 @@ class JobID:
                               **jobid_list
                               ):
         """
-            set the jobid from a dict object
+            Set the jobid from a dict object
         """
         for key in jobid_list.keys():
             assert key in JobID.__dataclass_fields__.keys()
@@ -268,7 +264,7 @@ class JobID:
     @staticmethod
     def convert_blobname_to_jobid(blobname):
         """
-            converting a blobname string to a JobID object
+            Converting a blobname string to a JobID object
         """
         jobconfig_dict = JobID.blobname_to_jobid_dict(blobname)
         if jobconfig_dict:
@@ -281,7 +277,7 @@ class JobID:
     @staticmethod
     def get_full_data_name(dataset_name: Union[list, str], subdataset_name=None):
         """
-            convert a dataset name and sub dataset name to a full dataset name
+            Convert a dataset name and sub dataset name to a full dataset name
         """
         if isinstance(dataset_name, list):
             full_dataset_name = JobID.dataset_list_to_str(dataset_name)
@@ -293,7 +289,7 @@ class JobID:
 
     def get_jobid_full_data_name(self):
         """
-            get the full dataset name of the current JobID object
+            Get the full dataset name of the current JobID object
         """
         return JobID.get_full_data_name(JobID.dataset_list_to_str(self.dat),
                                         self.subdat)
@@ -573,7 +569,7 @@ class AzureUtils:
                                    predictions=None,
                                    duration=None):
         """
-            write the key info from a job and upload to azure blob storage
+            Write the key info from a job and upload to azure blob storage
         """
         local_file_path = self.generate_local_json_path()
         output_json = {}
@@ -590,7 +586,7 @@ class AzureUtils:
 
     def generate_local_json_path(self):
         """
-            return a path string for storing the json file locally
+            Return a path string for storing the json file locally
         """
         full_dataset_name = self.jobid.get_jobid_full_data_name()
         jobid_str = self.jobid.to_jobid_string()
@@ -608,7 +604,7 @@ class AzureUtils:
                                local_json_file,
                                predictions):
         """
-            store predictions (a .zip file) locally and upload
+            Store predictions (a .zip file) locally and upload
         """
         azure_save_file_name = local_json_file.split("/")[-1][:-5]
         if self.data_root_dir is None:
@@ -637,7 +633,7 @@ class AzureUtils:
                                partial_jobid,
                                earliest_time: Tuple[int, int, int] = None):
         """
-            get all blobs whose jobid configs match the partial_jobid
+            Get all blobs whose jobid configs match the partial_jobid
         """
         blob_list = []
         container_client = self._init_azure_clients()
diff --git a/flaml/onlineml/autovw.py b/flaml/onlineml/autovw.py
index f1e4a1624..8ccddd55e 100644
--- a/flaml/onlineml/autovw.py
+++ b/flaml/onlineml/autovw.py
@@ -11,12 +11,8 @@ logger = logging.getLogger(__name__)
 
 class AutoVW:
     """The AutoML class
-
-    Methods:
-        predict(data_sample)
-        learn(data_sample)
-    AUTO
     """
+
     WARMSTART_NUM = 100
     AUTOMATIC = '_auto'
     VW_INTERACTION_ARG_NAME = 'interactions'
@@ -134,7 +130,7 @@ class AutoVW:
         self._y_predict = self._best_trial.predict(data_sample)
         # code for debugging purpose
         if self._prediction_trial_id is None or \
-            self._prediction_trial_id != self._best_trial.trial_id:
+                self._prediction_trial_id != self._best_trial.trial_id:
             self._prediction_trial_id = self._best_trial.trial_id
             logger.info('prediction trial id changed to %s at iter %s, resource used: %s',
                         self._prediction_trial_id, self._iter,
@@ -160,7 +156,7 @@ class AutoVW:
                     or trial.result.resource_used >= self.WARMSTART_NUM):
                 score = trial.result.get_score(self._model_select_policy)
                 if ('min' == self._model_selection_mode and score < best_score) or \
-                    ('max' == self._model_selection_mode and score > best_score):
+                        ('max' == self._model_selection_mode and score > best_score):
                     best_score = score
                     new_best_trial = trial
         if new_best_trial is not None:
diff --git a/flaml/searcher/blendsearch.py b/flaml/searcher/blendsearch.py
index 1fba8f04b..69f8edfaa 100644
--- a/flaml/searcher/blendsearch.py
+++ b/flaml/searcher/blendsearch.py
@@ -189,11 +189,15 @@ class BlendSearch(Searcher):
             self._metric_constraint_penalty = None
 
     def save(self, checkpoint_path: str):
+        ''' save states to a checkpoint path
+        '''
         save_object = self
         with open(checkpoint_path, "wb") as outputFile:
             pickle.dump(save_object, outputFile)
 
     def restore(self, checkpoint_path: str):
+        ''' restore states from checkpoint
+        '''
         with open(checkpoint_path, "rb") as inputFile:
             state = pickle.load(inputFile)
         self._metric_target = state._metric_target
@@ -220,9 +224,6 @@ class BlendSearch(Searcher):
     def metric_target(self):
         return self._metric_target
 
-    def restore_from_dir(self, checkpoint_dir: str):
-        super.restore_from_dir(checkpoint_dir)
-
     def on_trial_complete(self, trial_id: str, result: Optional[Dict] = None,
                           error: bool = False):
         ''' search thread updater and cleaner
@@ -353,6 +354,8 @@ class BlendSearch(Searcher):
         return False
 
     def on_trial_result(self, trial_id: str, result: Dict):
+        ''' receive intermediate result
+        '''
         if trial_id not in self._trial_proposed_by:
             return
         thread_id = self._trial_proposed_by[trial_id]
diff --git a/flaml/tune/tune.py b/flaml/tune/tune.py
index 4a3b02cf8..46f5453f1 100644
--- a/flaml/tune/tune.py
+++ b/flaml/tune/tune.py
@@ -180,10 +180,10 @@ def run(training_function,
         prune_attr: A string of the attribute used for pruning.
             Not necessarily in space.
             When prune_attr is in space, it is a hyperparameter, e.g.,
-                'n_iters', and the best value is unknown.
+            'n_iters', and the best value is unknown.
             When prune_attr is not in space, it is a resource dimension,
-                e.g., 'sample_size', and the peak performance is assumed
-                to be at the max_resource.
+            e.g., 'sample_size', and the peak performance is assumed
+            to be at the max_resource.
         min_resource: A float of the minimal resource to use for the prune_attr;
             only valid if prune_attr is not in space.
         max_resource: A float of the maximal resource to use for the