mirror of https://github.com/microsoft/FLAML.git
synced 2026-02-17 14:12:26 +08:00

Compare commits (15 commits):

- 6ff0ed434b
- 2d3bd84038
- 79a851e408
- a1b0b303ed
- 3328157f31
- da88aa77e3
- bd16eeee69
- d18d292081
- 80d3b14097
- f757a55097
- 20ce01b33d
- 9d661759b4
- 6393cc81e9
- 38775b16c0
- d659079a5d
.github/workflows/python-package.yml (2 changed lines, vendored)

@@ -1,7 +1,7 @@
 # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
 # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
 
-name: Python package
+name: Build
 
 on:
   push:
.gitignore (4 changed lines, vendored)

@@ -146,6 +146,8 @@ dmypy.json
 
 # Cython debug symbols
 cython_debug/
-/catboost_info
+catboost_info
+notebook/*.pkl
+notebook/.azureml
 mlruns
README.md (13 changed lines)

@@ -1,3 +1,8 @@
+[](https://badge.fury.io/py/FLAML)
+[](https://github.com/microsoft/FLAML/actions/workflows/python-package.yml)
+
+[](https://pepy.tech/project/flaml)
+
 # FLAML - Fast and Lightweight AutoML
 
 <p align="center">
@@ -5,8 +10,8 @@
 <br>
 </p>
 
-FLAML is a Python library designed to automatically produce accurate machine
-learning models with low computational cost. It frees users from selecting
+FLAML is a lightweight Python library that finds accurate machine
+learning models automatically, efficiently and economically. It frees users from selecting
 learners and hyperparameters for each learner. It is fast and cheap.
 The simple and lightweight design makes it easy to extend, such as
 adding customized learners or metrics. FLAML is powered by a new, [cost-effective
@@ -115,7 +120,7 @@ For more technical details, please check our papers.
 * [FLAML: A Fast and Lightweight AutoML Library](https://arxiv.org/abs/1911.04706). Chi Wang, Qingyun Wu, Markus Weimer, Erkang Zhu. To appear in MLSys, 2021.
 ```
 @inproceedings{wang2021flaml,
-    title={Frugal Optimization for Cost-related Hyperparameters},
+    title={FLAML: A Fast and Lightweight AutoML Library},
     author={Chi Wang and Qingyun Wu and Markus Weimer and Erkang Zhu},
     year={2021},
     booktitle={MLSys},
@@ -143,7 +148,7 @@ contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additio
 * Chi Wang
 * Qingyun Wu
 
-Contributors (alphabetical order): Alex Deng, Silu Huang, John Langford, Amin Saied, Markus Weimer, Haozhe Zhang, Erkang Zhu.
+Contributors (alphabetical order): Sebastien Bubeck, Surajit Chaudhuri, Nadiia Chepurko, Ofer Dekel, Alex Deng, Anshuman Dutt, Nicolo Fusi, Jianfeng Gao, Johannes Gehrke, Silu Huang, Dongwoo Kim, Christian Konig, John Langford, Amin Saied, Neil Tenenholtz, Markus Weimer, Haozhe Zhang, Erkang Zhu.
 
 ## License
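For orientation, a minimal quickstart matching the README's description (a sketch; it assumes scikit-learn-style arrays and uses FLAML's documented `AutoML` API):

```python
from flaml import AutoML
from sklearn.datasets import load_iris

# a small sketch of the basic AutoML workflow described above
X, y = load_iris(return_X_y=True)
automl = AutoML()
automl.fit(X_train=X, y_train=y, task='classification', time_budget=60)
print(automl.best_estimator)   # name of the winning learner
print(automl.predict(X)[:5])   # predictions from the trained model
```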
flaml/__init__.py:

@@ -1,5 +1,5 @@
 from flaml.searcher import CFO, BlendSearch, FLOW2
-from flaml.automl import AutoML
+from flaml.automl import AutoML, logger_formatter
 from flaml.version import __version__
 import logging
 
@@ -7,10 +7,3 @@ import logging
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 
-# Add the console handler.
-_ch = logging.StreamHandler()
-logger_formatter = logging.Formatter(
-    '[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s',
-    '%m-%d %H:%M:%S')
-_ch.setFormatter(logger_formatter)
-logger.addHandler(_ch)
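With this change the package no longer installs a console handler at import time; `logger_formatter` is re-exported so applications can attach their own handler instead. A minimal sketch (the file-handler choice here is illustrative):

```python
import logging
from flaml import logger_formatter

# route FLAML's logs to a file using the formatter the library exports;
# per the diff below, AutoML adds its own console handler later only
# when no handler is present and no mlflow run is active
handler = logging.FileHandler('flaml.log')
handler.setFormatter(logger_formatter)
logging.getLogger('flaml').addHandler(handler)
```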
flaml/automl.py:

@@ -25,6 +25,10 @@ from .training_log import training_log_reader, training_log_writer
 
 import logging
 logger = logging.getLogger(__name__)
+logger_formatter = logging.Formatter(
+    '[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s',
+    '%m-%d %H:%M:%S')
+
 try:
     import mlflow
 except:
@@ -326,6 +330,10 @@ class AutoML:
             A numpy array of shape n * 1 - - each element is a predicted class
             label for an instance.
         '''
+        if self._trained_estimator is None:
+            warnings.warn(
+                "No estimator is trained. Please run fit with enough budget.")
+            return None
         X_test = self._preprocess(X_test)
         y_pred = self._trained_estimator.predict(X_test)
         if y_pred.ndim > 1: y_pred = y_pred.flatten()
@@ -402,7 +410,7 @@ class AutoML:
         self._X_train_all, self._y_train_all = \
             self._transformer.fit_transform(X, y, self._state.task)
         self._label_transformer = self._transformer.label_transformer
-
+        self._sample_weight_full = self._state.fit_kwargs.get('sample_weight')
         if X_val is not None and y_val is not None:
             if not (isinstance(X_val, np.ndarray) or
                     issparse(X_val) or
@@ -446,7 +454,8 @@ class AutoML:
             self._X_train_all, self._y_train_all
         if issparse(X_train_all):
             X_train_all = X_train_all.tocsr()
-        if self._state.task != 'regression':
+        if self._state.task != 'regression' and self._state.fit_kwargs.get(
+                'sample_weight') is None:
             # logger.info(f"label {pd.unique(y_train_all)}")
             label_set, counts = np.unique(y_train_all, return_counts=True)
             # augment rare classes
@@ -836,6 +845,11 @@ class AutoML:
         if eval_method == 'auto' or self._state.X_val is not None:
             eval_method = self._decide_eval_method(time_budget)
         self._state.eval_method = eval_method
+        if not mlflow or not mlflow.active_run() and not logger.handler:
+            # Add the console handler.
+            _ch = logging.StreamHandler()
+            _ch.setFormatter(logger_formatter)
+            logger.addHandler(_ch)
         logger.info("Evaluation method: {}".format(eval_method))
 
         self._retrain_full = retrain_full and (eval_method == 'holdout' and
@@ -1093,8 +1107,9 @@ class AutoML:
                     self._state.best_loss))
             else:
                 logger.info(f"no enough budget for learner {estimator}")
-                self.estimator_list.remove(estimator)
-                self._estimator_index -= 1
+                if self._estimator_index is not None:
+                    self.estimator_list.remove(estimator)
+                    self._estimator_index -= 1
         if self._retrain_full and best_config_sig and not better and (
             self._search_states[self._best_estimator].sample_size ==
                 self._state.data_size) and (est_retrain_time <=
@@ -1151,7 +1166,11 @@ class AutoML:
                 stacker = Stacker(estimators, best_m,
                                   n_jobs=self._state.n_jobs,
                                   passthrough=True)
-                stacker.fit(self._X_train_all, self._y_train_all)
+                if self._sample_weight_full is not None:
+                    self._state.fit_kwargs[
+                        'sample_weight'] = self._sample_weight_full
+                stacker.fit(self._X_train_all, self._y_train_all,
+                            **self._state.fit_kwargs)
                 logger.info(f'ensemble: {stacker}')
                 self._trained_estimator = stacker
                 self._trained_estimator.model = stacker
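Taken together, these changes make the final stacking ensemble honor sample weights passed to `fit`, and make `predict` fail soft when no model was trained within the budget. A usage sketch (the uniform weights are illustrative; assumes the `ensemble` flag of `AutoML.fit`):

```python
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from flaml import AutoML

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

automl = AutoML()
weights = np.ones(len(y_train))          # illustrative per-sample weights
automl.fit(X_train=X_train, y_train=y_train, task='classification',
           time_budget=60, ensemble=True, sample_weight=weights)

pred = automl.predict(X_test)            # None if nothing was trained in budget
if pred is None:
    print("no estimator trained; increase time_budget")
```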
flaml/model.py:

@@ -506,7 +506,6 @@ class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
 
     def get_params(self, deep=False):
         params = super().get_params()
-        params["criterion"] = 1 if params["criterion"]=='gini' else 2
         return params
 
 
flaml/searcher/blendsearch.py:

@@ -113,8 +113,9 @@ class BlendSearch(Searcher):
             self._deadline = config.get('time_budget_s') + time.time()
             if 'metric_target' in config:
                 self._metric_target = config.get('metric_target')
         else:
-            self._metric, self._mode = metric, mode
+            if metric: self._metric = metric
+            if mode: self._mode = mode
         self._ls.set_search_properties(metric, mode, config)
         if self._gs is not None:
             self._gs.set_search_properties(metric, mode, config)
@@ -300,11 +301,9 @@ class BlendSearch(Searcher):
         else: # use init config
             init_config = self._points_to_evaluate.pop(
                 0) if self._points_to_evaluate else self._ls.init_config
-            if init_config==self._ls.init_config:
-                config = self._ls.complete_config(init_config,
+            config = self._ls.complete_config(init_config,
                 self._admissible_min, self._admissible_max)
             # logger.info(f"reset config to {config}")
-            else: config = init_config
             config_signature = self._ls.config_signature(config)
             result = self._result.get(config_signature)
             if result: # tried before
@@ -314,7 +313,6 @@ class BlendSearch(Searcher):
                     self._result[config_signature] = {}
                 else: return None # running but no result yet
             self._init_used = True
             self._trial_proposed_by[trial_id] = 0
-            # logger.info(f"config={config}")
             return config
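The first hunk lets `metric` and `mode` be supplied after construction without clobbering values that were already set. A minimal sketch of that calling pattern (the `time_budget_s` entry follows the deadline handling visible in the diff; the search-space details are omitted):

```python
from flaml import BlendSearch

# metric/mode may arrive late via set_search_properties, as tune
# frameworks do, rather than only through the constructor
searcher = BlendSearch(points_to_evaluate=[{"num_train_epochs": 1}])
searcher.set_search_properties(
    metric="matthews_correlation", mode="max",
    config={"time_budget_s": 600},   # sets the searcher's deadline
)
```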
flaml/searcher/flow2.py:

@@ -121,8 +121,8 @@ class FLOW2(Searcher):
         self._unordered_cat_hp = {}
         self._cat_hp_cost = {}
         for key, domain in self.space.items():
-            assert not isinstance(domain, dict), \
-                key+"'s domain is grid search which is not supported in FLOW2."
+            assert not (isinstance(domain, dict) and 'grid_search' in domain
+                        ), key+"'s domain is grid search which is not supported in FLOW2."
             if callable(getattr(domain, 'get_sampler', None)):
                 self._tunable_keys.append(key)
                 sampler = domain.get_sampler()
@@ -190,6 +190,8 @@ class FLOW2(Searcher):
         self._K = 0
         self._iter_best_config = self.trial_count = 1
         self._reset_times = 0
+        # record intermediate trial cost
+        self._trial_cost = {}
 
     @property
     def step_lower_bound(self) -> float:
@@ -237,7 +239,8 @@ class FLOW2(Searcher):
         ''' generate a complete config from the partial config input
            add minimal resource to config if available
        '''
-        if self._reset_times: # not the first time, use random gaussian
+        if self._reset_times and partial_config==self.init_config:
+            # not the first time to complete init_config, use random gaussian
             normalized = self.normalize(partial_config)
             for key in normalized:
                 # don't change unordered cat choice
@@ -258,21 +261,22 @@ class FLOW2(Searcher):
                     normalized[key] = max(l, min(u, normalized[key] + delta))
             # use best config for unordered cat choice
             config = self.denormalize(normalized)
+            self._reset_times += 1
         else:
+            # first time init_config, or other configs, take as is
             config = partial_config.copy()
 
         for key, value in self.space.items():
             if key not in config:
                 config[key] = value
-        logger.debug(f'before random {config}')
+        # logger.debug(f'before random {config}')
         for _, generated in generate_variants({'config': config}):
             config = generated['config']
             break
-        logger.debug(f'after random {config}')
+        # logger.debug(f'after random {config}')
 
         if self._resource:
             config[self.prune_attr] = self.min_resource
-        self._reset_times += 1
         return config
 
     def create(self, init_config: Dict, obj: float, cost: float) -> Searcher:
@@ -442,7 +446,8 @@ class FLOW2(Searcher):
         if proposed_by == self.incumbent:
             # proposed by current incumbent and no better
             self._num_complete4incumbent += 1
-            cost = result.get(self.cost_attr)
+            cost = result.get(
+                self.cost_attr) if result else self._trial_cost.get(trial_id)
             if cost: self._cost_complete4incumbent += cost
             if self._num_complete4incumbent >= 2*self.dim and \
                     self._num_allowed4incumbent == 0:
@@ -483,6 +488,9 @@ class FLOW2(Searcher):
             self._num_allowed4incumbent = 2 * self.dim
             self._proposed_by.clear()
             self._iter_best_config = self.trial_count
+        cost = result.get(self.cost_attr)
+        # record the cost in case it is pruned and cost info is lost
+        self._trial_cost[trial_id] = cost
 
     def rand_vector_unit_sphere(self, dim) -> np.ndarray:
         vec = self._random.normal(0, 1, dim)
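The `complete_config` change above restarts repeated completions of `init_config` from a gaussian perturbation in the normalized space, clipped to the admissible box. A simplified, standalone sketch of just that step (the 0.1 scale and the dict-based bounds are assumptions for illustration):

```python
import numpy as np

rng = np.random.RandomState(0)

def perturb(normalized: dict, lower: dict, upper: dict) -> dict:
    '''Perturb a config whose values live in a normalized space,
    clipping each coordinate to its admissible [lower, upper] interval.'''
    out = {}
    for key, value in normalized.items():
        delta = rng.normal(0, 0.1)   # assumed perturbation scale
        out[key] = max(lower[key], min(upper[key], value + delta))
    return out

print(perturb({'x': 0.5}, {'x': 0.0}, {'x': 1.0}))
```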
flaml/tune/README.md:

@@ -6,6 +6,7 @@ The API is compatible with ray tune.
 Example:
 
 ```python
+# require: pip install flaml[blendsearch]
 from flaml import tune
 import time
 
@@ -42,15 +43,16 @@ print(analysis.best_config) # the best config
 
 Or, using ray tune's API:
 ```python
+# require: pip install flaml[blendsearch] ray[tune]
 from ray import tune as raytune
 from flaml import CFO, BlendSearch
 import time
 
 def evaluate_config(config):
     '''evaluate a hyperparameter configuration'''
-    # we uss a toy example with 2 hyperparameters
+    # we use a toy example with 2 hyperparameters
     metric = (round(config['x'])-85000)**2 - config['x']/config['y']
-    # usually the evaluation takes an non-neglible cost
+    # usually the evaluation takes a non-neglible cost
     # and the cost could be related to certain hyperparameters
     # in this example, we assume it's proportional to x
     time.sleep(config['x']/100000)
@@ -146,6 +148,7 @@ based on optimism in face of uncertainty.
 Example:
 
 ```python
+# require: pip install flaml[blendsearch]
 from flaml import BlendSearch
 tune.run(...
     search_alg = BlendSearch(points_to_evaluate=[init_config]),
@@ -178,4 +181,4 @@ For more technical details, please check our papers.
     year={2021},
     booktitle={ICLR'21},
 }
-```
+```
flaml/version.py:

@@ -1 +1 @@
-__version__ = "0.2.2"
+__version__ = "0.2.5"
notebook/flaml_automl.ipynb (969 lines, new file)

File diff suppressed because one or more lines are too long
notebook/flaml_finetune_transformer.ipynb (799 lines, new file)

@@ -0,0 +1,799 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This notebook uses flaml to finetune a transformer model from Huggingface transformers library.\n",
    "\n",
    "**Requirements.** This notebook has additional requirements:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install torch transformers datasets ipywidgets"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Tokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "MODEL_CHECKPOINT = \"distilbert-base-uncased\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, use_fast=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'input_ids': [101, 2023, 2003, 1037, 3231, 102], 'attention_mask': [1, 1, 1, 1, 1, 1]}"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer(\"this is a test\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "TASK = \"cola\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Reusing dataset glue (/home/amin/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n",
      "/home/amin/miniconda/lib/python3.7/site-packages/torch/cuda/__init__.py:52: UserWarning: CUDA initialization: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx (Triggered internally at /pytorch/c10/cuda/CUDAFunctions.cpp:100.)\n",
      "  return torch._C._cuda_getDeviceCount() > 0\n"
     ]
    }
   ],
   "source": [
    "raw_dataset = datasets.load_dataset(\"glue\", TASK)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# define tokenization function used to process data\n",
    "COLUMN_NAME = \"sentence\"\n",
    "def tokenize(examples):\n",
    "    return tokenizer(examples[COLUMN_NAME], truncation=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5bd7b23a478043eaaf6e14e119143fcd",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d7b648c2dbdc4fb9907e43da7db8af9a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "36a9d6e62dbe462d94b1769f36fbd0f3",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "encoded_dataset = raw_dataset.map(tokenize, batched=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n",
       " 'idx': 0,\n",
       " 'input_ids': [101,\n",
       "  2256,\n",
       "  2814,\n",
       "  2180,\n",
       "  1005,\n",
       "  1056,\n",
       "  4965,\n",
       "  2023,\n",
       "  4106,\n",
       "  1010,\n",
       "  2292,\n",
       "  2894,\n",
       "  1996,\n",
       "  2279,\n",
       "  2028,\n",
       "  2057,\n",
       "  16599,\n",
       "  1012,\n",
       "  102],\n",
       " 'label': 1,\n",
       " 'sentence': \"Our friends won't buy this analysis, let alone the next one we propose.\"}"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "encoded_dataset[\"train\"][0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoModelForSequenceClassification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "35b76e51b5c8406fae416fcdc3dd885e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
      "- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).\n",
      "- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    }
   ],
   "source": [
    "NUM_LABELS = 2\n",
    "model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=NUM_LABELS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DistilBertForSequenceClassification(\n",
       "  (distilbert): DistilBertModel(\n",
       "    (embeddings): Embeddings(\n",
       "      (word_embeddings): Embedding(30522, 768, padding_idx=0)\n",
       "      (position_embeddings): Embedding(512, 768)\n",
       "      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "      (dropout): Dropout(p=0.1, inplace=False)\n",
       "    )\n",
       "    (transformer): Transformer(\n",
       "      (layer): ModuleList(\n",
       "        (0): TransformerBlock(\n",
       "          (attention): MultiHeadSelfAttention(\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "            (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "            (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "            (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "            (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "          )\n",
       "          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "          (ffn): FFN(\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "            (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
       "            (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
       "          )\n",
       "          (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "        )\n",
       "        (1): TransformerBlock(\n",
       "          (attention): MultiHeadSelfAttention(\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "            (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "            (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "            (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "            (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "          )\n",
       "          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "          (ffn): FFN(\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "            (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
       "            (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
       "          )\n",
       "          (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "        )\n",
       "        (2): TransformerBlock(\n",
       "          (attention): MultiHeadSelfAttention(\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "            (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "            (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "            (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "            (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "          )\n",
       "          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "          (ffn): FFN(\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "            (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
       "            (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
       "          )\n",
       "          (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "        )\n",
       "        (3): TransformerBlock(\n",
       "          (attention): MultiHeadSelfAttention(\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "            (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "            (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "            (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "            (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "          )\n",
       "          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "          (ffn): FFN(\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "            (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
       "            (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
       "          )\n",
       "          (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "        )\n",
       "        (4): TransformerBlock(\n",
       "          (attention): MultiHeadSelfAttention(\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "            (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "            (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "            (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "            (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "          )\n",
       "          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "          (ffn): FFN(\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "            (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
       "            (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
       "          )\n",
       "          (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "        )\n",
       "        (5): TransformerBlock(\n",
       "          (attention): MultiHeadSelfAttention(\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "            (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "            (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "            (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "            (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "          )\n",
       "          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "          (ffn): FFN(\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "            (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
       "            (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
       "          )\n",
       "          (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "        )\n",
       "      )\n",
       "    )\n",
       "  )\n",
       "  (pre_classifier): Linear(in_features=768, out_features=768, bias=True)\n",
       "  (classifier): Linear(in_features=768, out_features=2, bias=True)\n",
       "  (dropout): Dropout(p=0.2, inplace=False)\n",
       ")"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Metric"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "metric = datasets.load_metric(\"glue\", TASK)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Metric(name: \"glue\", features: {'predictions': Value(dtype='int64', id=None), 'references': Value(dtype='int64', id=None)}, usage: \"\"\"\n",
       "Compute GLUE evaluation metric associated to each GLUE dataset.\n",
       "Args:\n",
       "    predictions: list of translations to score.\n",
       "        Each translation should be tokenized into a list of tokens.\n",
       "    references: list of lists of references for each translation.\n",
       "        Each reference should be tokenized into a list of tokens.\n",
       "Returns: depending on the GLUE subset, one or several of:\n",
       "    \"accuracy\": Accuracy\n",
       "    \"f1\": F1\n",
       "    \"pearson\": Pearson Correlation\n",
       "    \"spearmanr\": Spearman Correlation\n",
       "    \"matthews_correlation\": Matthew Correlation\n",
       "\"\"\", stored examples: 0)"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "metric"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "def compute_metrics(eval_pred):\n",
    "    predictions, labels = eval_pred\n",
    "    predictions = np.argmax(predictions, axis=1)\n",
    "    return metric.compute(predictions=predictions, references=labels)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Training (aka Finetuning)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import Trainer\n",
    "from transformers import TrainingArguments"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "args = TrainingArguments(\n",
    "    output_dir='output',\n",
    "    do_eval=True,\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=args,\n",
    "    train_dataset=encoded_dataset[\"train\"],\n",
    "    eval_dataset=encoded_dataset[\"validation\"],\n",
    "    tokenizer=tokenizer,\n",
    "    compute_metrics=compute_metrics,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "        <style>\n",
       "            /* Turns off some styling */\n",
       "            progress {\n",
       "                /* gets rid of default border in Firefox and Opera. */\n",
       "                border: none;\n",
       "                /* Needs to be in here for Safari polyfill so background images work as expected. */\n",
       "                background-size: auto;\n",
       "            }\n",
       "        </style>\n",
       "      \n",
       "      <progress value='322' max='3207' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [ 322/3207 02:51 < 25:41, 1.87 it/s, Epoch 0.30/3]\n",
       "    </div>\n",
       "    <table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: left;\">\n",
       "      <th>Step</th>\n",
       "      <th>Training Loss</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table><p>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "trainer.train()"
   ]
  },
  {
   "source": [
    "## Hyperparameter Optimization\n",
    "\n",
    "`flaml.tune` is a module for economical hyperparameter tuning. It frees users from manually tuning many hyperparameters for a software, such as machine learning training procedures. \n",
    "The API is compatible with ray tune.\n",
    "\n",
    "### Step 1. Define training method\n",
    "\n",
    "We define a function `train_distilbert(config: dict)` that accepts a hyperparameter configuration dict `config`. The specific configs will be generated by flaml's search algorithm in a given search space.\n"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import flaml\n",
    "\n",
    "def train_distilbert(config: dict):\n",
    "\n",
    "    # Define tokenize method\n",
    "    tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, use_fast=True)\n",
    "    def tokenize(examples):\n",
    "        return tokenizer(examples[COLUMN_NAME], truncation=True)\n",
    "    # Load CoLA dataset and apply tokenizer\n",
    "    cola_raw = load_dataset(\"glue\", TASK)\n",
    "    cola_encoded = cola_raw.map(tokenize, batched=True)\n",
    "    # QUESTION: Write processed data to disk?\n",
    "    train_dataset, eval_dataset = cola_encoded[\"train\"], cola_encoded[\"validation\"]\n",
    "\n",
    "    model = AutoModelForSequenceClassification.from_pretrained(\n",
    "        MODEL_CHECKPOINT, num_labels=NUM_LABELS\n",
    "    )\n",
    "\n",
    "    metric = load_metric(\"glue\", TASK)\n",
    "\n",
    "    training_args = TrainingArguments(\n",
    "        output_dir='.',\n",
    "        do_eval=False,\n",
    "        **config,\n",
    "    )\n",
    "\n",
    "    trainer = Trainer(\n",
    "        model,\n",
    "        training_args,\n",
    "        train_dataset=train_dataset,\n",
    "        eval_dataset=eval_dataset,\n",
    "        tokenizer=tokenizer,\n",
    "        compute_metrics=compute_metrics,\n",
    "    )\n",
    "\n",
    "    # train model\n",
    "    trainer.train()\n",
    "\n",
    "    # evaluate model\n",
    "    eval_output = trainer.evaluate()\n",
    "\n",
    "    # report the metric to optimize\n",
    "    flaml.tune.report(\n",
    "        loss=eval_output[\"eval_loss\"],\n",
    "        matthews_correlation=eval_output[\"eval_matthews_correlation\"],\n",
    "    )"
   ]
  },
  {
   "source": [
    "### Step 2. Define the search\n",
    "\n",
    "We are now ready to define our search. This includes:\n",
    "\n",
    "- The `search_space` for our hyperparameters\n",
    "- The metric and the mode ('max' or 'min') for optimization\n",
    "- The constraints (`n_cpus`, `n_gpus`, `num_samples`, and `time_budget_s`)"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "max_num_epoch = 64\n",
    "search_space = {\n",
    "    # You can mix constants with search space objects.\n",
    "    \"num_train_epochs\": flaml.tune.loguniform(1, max_num_epoch),\n",
    "    \"learning_rate\": flaml.tune.loguniform(1e-6, 1e-4),\n",
    "    \"adam_epsilon\": flaml.tune.loguniform(1e-9, 1e-7),\n",
    "    \"adam_beta1\": flaml.tune.uniform(0.8, 0.99),\n",
    "    \"adam_beta2\": flaml.tune.loguniform(98e-2, 9999e-4),\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# optimization objective\n",
    "HP_METRIC, MODE = \"matthews_correlation\", \"max\"\n",
    "\n",
    "# resources\n",
    "num_cpus = 4\n",
    "num_gpus = 4\n",
    "\n",
    "# constraints\n",
    "num_samples = -1    # number of trials, -1 means unlimited\n",
    "time_budget_s = 10800    # time budget in seconds"
   ]
  },
  {
   "source": [
    "### Step 3. Launch with `flaml.tune.run`\n",
    "\n",
    "We are now ready to launch the tuning using `flaml.tune.run`:"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "import ray\n",
    "start_time = time.time()\n",
    "ray.init(num_cpus=num_cpus, num_gpus=num_gpus)\n",
    "\n",
    "print(\"Tuning started...\")\n",
    "analysis = flaml.tune.run(\n",
    "    train_distilbert,\n",
    "    config=search_space,\n",
    "    init_config={\n",
    "        \"num_train_epochs\": 1,\n",
    "    },\n",
    "    metric=HP_METRIC,\n",
    "    mode=MODE,\n",
    "    report_intermediate_result=False,\n",
    "    # uncomment the following if report_intermediate_result = True\n",
    "    # max_resource=max_num_epoch, min_resource=1,\n",
    "    resources_per_trial={\"gpu\": 1},\n",
    "    local_dir='logs/',\n",
    "    num_samples=num_samples,\n",
    "    time_budget_s=time_budget_s,\n",
    "    use_ray=True,\n",
    ")\n",
    "\n",
    "ray.shutdown()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "best_trial = analysis.get_best_trial(HP_METRIC, MODE, \"all\")\n",
    "metric = best_trial.metric_analysis[HP_METRIC][MODE]\n",
    "print(f\"n_trials={len(analysis.trials)}\")\n",
    "print(f\"time={time.time()-start_time}\")\n",
    "print(f\"Best model eval {HP_METRIC}: {metric:.4f}\")\n",
    "print(f\"Best model parameters: {best_trial.config}\")\n"
   ]
  },
  {
   "source": [
    "## Next Steps\n",
    "\n",
    "Notice that we only reported the metric with `flaml.tune.report` at the end of full training loop. It is possible to enable reporting of intermediate performance - allowing early stopping - as follows:\n",
    "\n",
    "- Huggingface provides _Callbacks_ which can be used to insert the `flaml.tune.report` call inside the training loop\n",
    "- Make sure to set `do_eval=True` in the `TrainingArguments` provided to `Trainer` and adjust the evaluation frequency accordingly"
   ],
   "cell_type": "markdown",
   "metadata": {}
  }
 ],
 "metadata": {
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3.7.7 64-bit ('flaml': conda)",
   "metadata": {
    "interpreter": {
     "hash": "bfcd9a6a9254a5e160761a1fd7a9e444f011592c6770d9f4180dde058a9df5dd"
    }
   }
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7-final"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
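The notebook's Next Steps cell sketches intermediate reporting via Huggingface callbacks. A minimal illustration of that idea (the class name is ours, and it assumes an evaluation strategy that triggers `on_evaluate` periodically):

```python
from transformers import TrainerCallback
import flaml

class TuneReportCallback(TrainerCallback):
    '''Hypothetical sketch: forward each evaluation's metrics to
    flaml.tune so unpromising trials can be stopped early.'''
    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        if metrics:
            flaml.tune.report(
                loss=metrics.get("eval_loss"),
                matthews_correlation=metrics.get("eval_matthews_correlation"),
            )

# usage sketch: trainer = Trainer(..., callbacks=[TuneReportCallback()])
```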
notebook/flaml_lightgbm.ipynb (649 lines, new file)

File diff suppressed because one or more lines are too long
setup.py (5 changed lines)

@@ -20,7 +20,6 @@ install_requires = [
     "scipy>=1.4.1",
     "catboost>=0.23",
     "scikit-learn>=0.23.2",
-    "optuna==2.3.0"
 ],
 
@@ -48,6 +47,10 @@ setuptools.setup(
         "coverage>=5.3",
         "xgboost<1.3",
         "rgf-python",
+        "optuna==2.3.0",
     ],
+    "blendsearch": [
+        "optuna==2.3.0"
+    ],
     "ray": [
         "ray[tune]==1.1.0",
test/test_automl.py:

@@ -274,7 +274,7 @@ class TestAutoML(unittest.TestCase):
 
         automl_experiment = AutoML()
         automl_settings = {
-            "time_budget": 2,
+            "time_budget": 3,
             "metric": 'ap',
             "task": 'classification',
             "log_file_name": "test/sparse_classification.log",
test/test_distillbert.py (217 lines, new file)

@@ -0,0 +1,217 @@
'''Require: pip install torch transformers datasets flaml[blendsearch,ray]
'''
import time
import numpy as np

try:
    import ray
    from datasets import (
        load_dataset,
        load_metric,
    )
    from transformers import (
        AutoModelForSequenceClassification,
        AutoTokenizer,
        Trainer,
        TrainingArguments,
    )
except:
    print("pip install torch transformers datasets flaml[blendsearch,ray]")

import logging
logger = logging.getLogger(__name__)
logger.addHandler(logging.FileHandler('test/tune_distilbert.log'))
logger.setLevel(logging.INFO)

import flaml


MODEL_CHECKPOINT = "distilbert-base-uncased"
TASK = "cola"
NUM_LABELS = 2
COLUMN_NAME = "sentence"
METRIC_NAME = "matthews_correlation"

# HP_METRIC, MODE = "loss", "min"
HP_METRIC, MODE = "matthews_correlation", "max"


def train_distilbert(config: dict):

    # Define tokenize method
    tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, use_fast=True)

    def tokenize(examples):
        return tokenizer(examples[COLUMN_NAME], truncation=True)

    # Load CoLA dataset and apply tokenizer
    cola_raw = load_dataset("glue", TASK)
    cola_encoded = cola_raw.map(tokenize, batched=True)
    train_dataset, eval_dataset = cola_encoded["train"], cola_encoded["validation"]

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_CHECKPOINT, num_labels=NUM_LABELS
    )

    metric = load_metric("glue", TASK)

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        return metric.compute(predictions=predictions, references=labels)

    training_args = TrainingArguments(
        output_dir='.',
        do_eval=False,
        disable_tqdm=True,
        logging_steps=20000,
        save_total_limit=0,
        **config,
    )

    trainer = Trainer(
        model,
        training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # train model
    trainer.train()

    # evaluate model
    eval_output = trainer.evaluate()

    flaml.tune.report(
        loss=eval_output["eval_loss"],
        matthews_correlation=eval_output["eval_matthews_correlation"],
    )


def _test_distillbert(method='BlendSearch'):

    max_num_epoch = 64
    num_samples = -1
    time_budget_s = 10800

    search_space = {
        # You can mix constants with search space objects.
        "num_train_epochs": flaml.tune.loguniform(1, max_num_epoch),
        "learning_rate": flaml.tune.loguniform(1e-6, 1e-4),
        "adam_beta1": flaml.tune.uniform(0.8, 0.99),
        "adam_beta2": flaml.tune.loguniform(98e-2, 9999e-4),
        "adam_epsilon": flaml.tune.loguniform(1e-9, 1e-7),
    }

    start_time = time.time()
    ray.init(num_cpus=4, num_gpus=4)
    if 'ASHA' == method:
        algo = None
    elif 'BOHB' == method:
        from ray.tune.schedulers import HyperBandForBOHB
        from ray.tune.suggest.bohb import tuneBOHB
        algo = tuneBOHB(max_concurrent=4)
        scheduler = HyperBandForBOHB(max_t=max_num_epoch)
    elif 'Optuna' == method:
        from ray.tune.suggest.optuna import OptunaSearch
        algo = OptunaSearch()
    elif 'CFO' == method:
        from flaml import CFO
        algo = CFO(points_to_evaluate=[{
            "num_train_epochs": 1,
        }])
    elif 'BlendSearch' == method:
        from flaml import BlendSearch
        algo = BlendSearch(points_to_evaluate=[{
            "num_train_epochs": 1,
        }])
    elif 'Dragonfly' == method:
        from ray.tune.suggest.dragonfly import DragonflySearch
        algo = DragonflySearch()
    elif 'SkOpt' == method:
        from ray.tune.suggest.skopt import SkOptSearch
        algo = SkOptSearch()
    elif 'Nevergrad' == method:
        from ray.tune.suggest.nevergrad import NevergradSearch
        import nevergrad as ng
        algo = NevergradSearch(optimizer=ng.optimizers.OnePlusOne)
    elif 'ZOOpt' == method:
        from ray.tune.suggest.zoopt import ZOOptSearch
        algo = ZOOptSearch(budget=num_samples)
    elif 'Ax' == method:
        from ray.tune.suggest.ax import AxSearch
        algo = AxSearch()
    elif 'HyperOpt' == method:
        from ray.tune.suggest.hyperopt import HyperOptSearch
        algo = HyperOptSearch()
    scheduler = None
    if method != 'BOHB':
        from ray.tune.schedulers import ASHAScheduler
        scheduler = ASHAScheduler(
            max_t=max_num_epoch,
            grace_period=1)
        scheduler = None
    analysis = ray.tune.run(
        train_distilbert,
        metric=HP_METRIC,
        mode=MODE,
        # You can add "gpu": 1 to allocate GPUs
        resources_per_trial={"gpu": 1},
        config=search_space, local_dir='test/logs/',
        num_samples=num_samples, time_budget_s=time_budget_s,
        keep_checkpoints_num=1, checkpoint_score_attr=HP_METRIC,
        scheduler=scheduler, search_alg=algo)

    ray.shutdown()

    best_trial = analysis.get_best_trial(HP_METRIC, MODE, "all")
    metric = best_trial.metric_analysis[HP_METRIC][MODE]

    logger.info(f"method={method}")
    logger.info(f"n_trials={len(analysis.trials)}")
    logger.info(f"time={time.time()-start_time}")
    logger.info(f"Best model eval {HP_METRIC}: {metric:.4f}")
    logger.info(f"Best model parameters: {best_trial.config}")


def _test_distillbert_cfo():
    _test_distillbert('CFO')


def _test_distillbert_dragonfly():
    _test_distillbert('Dragonfly')


def _test_distillbert_skopt():
    _test_distillbert('SkOpt')


def _test_distillbert_nevergrad():
    _test_distillbert('Nevergrad')


def _test_distillbert_zoopt():
    _test_distillbert('ZOOpt')


def _test_distillbert_ax():
    _test_distillbert('Ax')


def __test_distillbert_hyperopt():
    _test_distillbert('HyperOpt')


def _test_distillbert_optuna():
    _test_distillbert('Optuna')


def _test_distillbert_asha():
    _test_distillbert('ASHA')


def _test_distillbert_bohb():
    _test_distillbert('BOHB')


if __name__ == "__main__":
    _test_distillbert()
@@ -1,3 +1,5 @@
+'''Require: pip install torchvision ray flaml[blendsearch]
+'''
 import unittest
 import os
 import time
@@ -24,7 +26,6 @@ def load_data(data_dir="./data"):
 # __load_data_end__
 
 
-import numpy as np
 try:
     import torch
     import torch.nn as nn
@@ -1,5 +1,6 @@
+'''Require: pip install flaml[test,ray]
+'''
 import unittest
 import os
 import time
 from sklearn.model_selection import train_test_split
 import sklearn.metrics
@@ -138,6 +139,7 @@ def _test_xgboost(method='BlendSearch'):
         scheduler=scheduler, search_alg=algo)
     ray.shutdown()
+    # # Load the best model checkpoint
     # import os
     # best_bst = xgb.Booster()
     # best_bst.load_model(os.path.join(analysis.best_checkpoint,
     #                                  "model.xgb"))
@@ -8,11 +8,7 @@ from flaml.model import XGBoostSklearnEstimator
 from flaml import tune
 
 
-# dataset = "blood-transfusion-service-center"
-# dataset = "Australian"
 dataset = "credit-g"
-# dataset = "phoneme"
-# dataset = "kc1"
 
 
 class XGBoost2D(XGBoostSklearnEstimator):
@@ -50,8 +46,11 @@ def test_simple(method=None):
         "log_type": "all",
         "time_budget": 3#6000,
     }
-    X, y = fetch_openml(name=dataset, return_X_y=True)
+    try:
+        X, y = fetch_openml(name=dataset, return_X_y=True)
+    except:
+        from sklearn.datasets import load_wine
+        X, y = load_wine(return_X_y=True)
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                         random_state=42)
     automl.fit(X_train=X_train, y_train=y_train, **automl_settings)