From 169012f3e732cee7206c4a844481c811bf37330d Mon Sep 17 00:00:00 2001
From: Chi Wang
Date: Fri, 10 Mar 2023 11:35:36 -0800
Subject: [PATCH] ChatGPT support (#942)

* improve max_valid_n and doc

* Update README.md

Co-authored-by: Li Jiang

* add support for chatgpt

* notebook

* newline at end of file

* chatgpt notebook

* ChatGPT in Azure

* doc

* math

* warning, timeout, log file name

* handle import error

* doc update; default value

* paper

* doc

* docstr

* eval_func

* prompt and messages

* remove confusing words

* notebook name

---------

Co-authored-by: Li Jiang
Co-authored-by: Susan Xueqing Liu
---
 flaml/integrations/oai/__init__.py          |    4 +-
 flaml/integrations/oai/completion.py        |  288 +++-
 flaml/version.py                            |    2 +-
 notebook/integrate_chatgpt_code.ipynb       | 1082 +++++++++++++++
 notebook/integrate_chatgpt_math.ipynb       | 1386 +++++++++++++++++++
 notebook/integrate_openai.ipynb             |    1 +
 notebook/research/acl2021.ipynb             |    1 -
 setup.py                                    |    2 +-
 test/openai/test_completion.py              |   29 +-
 test/openai/test_notebook.py                |   10 +-
 website/docs/Examples/Integrate - OpenAI.md |    6 +-
 website/docs/Research.md                    |   11 +
 12 files changed, 2741 insertions(+), 81 deletions(-)
 create mode 100644 notebook/integrate_chatgpt_code.ipynb
 create mode 100644 notebook/integrate_chatgpt_math.ipynb

diff --git a/flaml/integrations/oai/__init__.py b/flaml/integrations/oai/__init__.py
index f6ee2599b..12320692d 100644
--- a/flaml/integrations/oai/__init__.py
+++ b/flaml/integrations/oai/__init__.py
@@ -1,3 +1,3 @@
-from flaml.integrations.oai.completion import Completion
+from flaml.integrations.oai.completion import Completion, ChatCompletion
 
-__all__ = ["Completion"]
+__all__ = ["Completion", "ChatCompletion"]
diff --git a/flaml/integrations/oai/completion.py b/flaml/integrations/oai/completion.py
index 001311e22..ef8bd8b5b 100644
--- a/flaml/integrations/oai/completion.py
+++ b/flaml/integrations/oai/completion.py
@@ -13,6 +13,7 @@ try:
         APIConnectionError,
     )
     import diskcache
+    from urllib3.exceptions import ReadTimeoutError
 
     ERROR = None
 except ImportError:
@@ -39,8 +40,15 @@ def get_key(config):
 
 
 class Completion:
-    """A class for OpenAI API completion."""
+    """A class for the OpenAI completion API.
+
+    It also supports the ChatCompletion and Azure OpenAI APIs.
+    """
+
+    # set of models that support chat completion
+    chat_models = {"gpt-3.5-turbo"}
 
+    # price per 1k tokens
     price1K = {
         "text-ada-001": 0.0004,
         "text-babbage-001": 0.0005,
@@ -49,6 +57,7 @@ class Completion:
         "code-davinci-002": 0.1,
         "text-davinci-002": 0.02,
         "text-davinci-003": 0.02,
+        "gpt-3.5-turbo": 0.002,
     }
 
     default_search_space = {
@@ -70,6 +79,10 @@ class Completion:
     # fail a request after hitting RateLimitError for this many seconds
     retry_timeout = 60
 
+    openai_completion_class = not ERROR and openai.Completion
+    _total_cost = 0
+    optimization_budget = None
+
     @classmethod
     def set_cache(cls, seed=41, cache_path=".cache"):
        """Set cache path.
@@ -95,17 +108,23 @@ class Completion:
                 # print("using cached response")
                 return response
         retry = 0
+        openai_completion = (
+            openai.ChatCompletion
+            if config["model"] in cls.chat_models
+            else openai.Completion
+        )
         while eval_only or retry * cls.retry_time < cls.retry_timeout:
             try:
-                response = openai.Completion.create(**config)
+                response = openai_completion.create(**config)
                 cls._cache.set(key, response)
                 return response
             except (
                 ServiceUnavailableError,
                 APIError,
                 APIConnectionError,
+                ReadTimeoutError,
             ):
-                logger.info(f"retrying in {cls.retry_time} seconds...", exc_info=1)
+                logger.warning(f"retrying in {cls.retry_time} seconds...", exc_info=1)
                 sleep(cls.retry_time)
             except RateLimitError:
                 logger.info(f"retrying in {cls.retry_time} seconds...", exc_info=1)
@@ -152,7 +171,11 @@ class Completion:
     @classmethod
     def _get_region_key(cls, config):
         # get a key for the valid/invalid region corresponding to the given config
-        return (config["model"], config["prompt"], config.get("stop"))
+        return (
+            config["model"],
+            config.get("prompt", config.get("messages")),
+            config.get("stop"),
+        )
 
     @classmethod
     def _update_invalid_n(cls, prune, region_key, max_tokens, num_completions):
@@ -172,25 +195,41 @@
         Args:
             config (dict): Hyperparameter setting for the openai api call.
             prune (bool, optional): Whether to enable pruning. Defaults to True.
-            eval_only (bool, optional): Whether to evaluate only. Defaults to False.
+            eval_only (bool, optional): Whether to evaluate only (ignoring the inference budget, with no timeout).
+                Defaults to False.
 
         Returns:
             dict: Evaluation results.
         """
         cost = 0
         data = cls.data
+        model = config["model"]
         data_length = len(data)
-        target_n_tokens = (
-            1000 * cls.inference_budget / cls.price1K[config["model"]]
-            if cls.inference_budget and cls.price1K.get(config["model"])
+        target_n_tokens = getattr(cls, "inference_budget", None) and (
+            1000 * cls.inference_budget / cls.price1K[model]
+            if cls.inference_budget and cls.price1K.get(model)
             else None
         )
-        prune_hp = cls._prune_hp
+        prune_hp = getattr(cls, "_prune_hp", "n")
         metric = cls._metric
-        config_n = config[prune_hp]
+        config_n = config.get(prune_hp, 1)  # default value in OpenAI is 1
         max_tokens = config.get("max_tokens", 16)  # default value in OpenAI is 16
         region_key = cls._get_region_key(config)
-        prompt = cls._prompts[config["prompt"]]
+        if model in cls.chat_models:
+            # either "prompt" should be in config (for compatibility with non-chat models)
+            # or "messages" should be in config (for tuning chat models only)
+            prompt = config.get("prompt")
+            messages = config.get("messages")
+            # either prompt or messages should be in config, but not both
+            assert (prompt is None) != (
+                messages is None
+            ), "Either prompt or messages should be in config for chat models."
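+            # note: "prompt"/"messages" hold an index into the template lists
+            # registered by tune() (the search space encodes templates as
+            # tune.choice over indices), so look up the concrete template below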
+            if prompt is None:
+                messages = cls._messages[messages]
+            else:
+                prompt = cls._prompts[prompt]
+        else:
+            prompt = cls._prompts[config["prompt"]]
         stop = cls._stops and cls._stops[config["stop"]]
         if prune and target_n_tokens:
             max_valid_n = cls._get_max_valid_n(region_key, max_tokens)
@@ -232,8 +271,37 @@ class Completion:
             while True:  # data_limit <= data_length
                 # limit the number of data points to avoid rate limit
                 for i in range(prev_data_limit, data_limit):
+                    logger.debug(
+                        f"num_completions={num_completions}, data instance={i}"
+                    )
                     data_i = data[i]
-                    params["prompt"] = prompt.format(**data_i)
+                    if prompt is None:
+                        params["messages"] = [
+                            {
+                                "role": m["role"],
+                                "content": m["content"].format(**data_i)
+                                if isinstance(m["content"], str)
+                                else m["content"](data_i),
+                            }
+                            for m in messages
+                        ]
+                    elif model in cls.chat_models:
+                        # convert prompt to messages
+                        params["messages"] = [
+                            {
+                                "role": "user",
+                                "content": prompt.format(**data_i)
+                                if isinstance(prompt, str)
+                                else prompt(data_i),
+                            },
+                        ]
+                        params.pop("prompt", None)
+                    else:
+                        params["prompt"] = (
+                            prompt.format(**data_i)
+                            if isinstance(prompt, str)
+                            else prompt(data_i)
+                        )
                     response = cls._get_response(params, eval_only)
                     if response == -1:  # rate limit error, treat as invalid
                         cls._update_invalid_n(
@@ -243,7 +311,11 @@ class Completion:
                     result["cost"] = cost
                     return result
                 # evaluate the quality of the responses
-                responses = [r["text"].rstrip() for r in response["choices"]]
+                responses = (
+                    [r["message"]["content"].rstrip() for r in response["choices"]]
+                    if model in cls.chat_models
+                    else [r["text"].rstrip() for r in response["choices"]]
+                )
                 n_tokens = (
                     response["usage"]["completion_tokens"]
                     if previous_num_completions
@@ -260,9 +332,7 @@ class Completion:
                     # Under Assumption 1, we should count both the input and output tokens in the first query,
                     # and only count output tokens afterwards
                     query_cost = (
-                        response["usage"]["total_tokens"]
-                        * cls.price1K[config["model"]]
-                        / 1000
+                        response["usage"]["total_tokens"] * cls.price1K[model] / 1000
                     )
                     cls._total_cost += query_cost
                     cost += query_cost
@@ -347,9 +417,7 @@ class Completion:
                 result[key] /= data_limit
             result["total_cost"] = cls._total_cost
             result["cost"] = cost
-            result["inference_cost"] = (
-                avg_n_tokens * cls.price1K[config["model"]] / 1000
-            )
+            result["inference_cost"] = avg_n_tokens * cls.price1K[model] / 1000
             if prune and target_n_tokens and not cls.avg_input_tokens:
                 cls.avg_input_tokens = np.mean(input_tokens)
             break
@@ -374,6 +442,7 @@ class Completion:
         inference_budget=None,
         optimization_budget=None,
         num_samples=1,
+        logging_level=logging.WARNING,
         **config,
     ):
         """Tune the parameters for the OpenAI API call.
@@ -385,13 +454,42 @@
             metric (str): The metric to optimize.
             mode (str): The optimization mode, "min" or "max".
             eval_func (Callable): The evaluation function for responses.
+                The function should take a list of responses and a data point as input,
+                and return a dict of metrics. For example,
+
+    ```python
+    def eval_func(responses, **data):
+        solution = data["solution"]
+        success_list = []
+        n = len(responses)
+        for i in range(n):
+            response = responses[i]
+            succeed = is_equiv_chain_of_thought(response, solution)
+            success_list.append(succeed)
+        return {
+            "expected_success": 1 - pow(1 - sum(success_list) / n, n),
+            "success": any(s for s in success_list),
+        }
+    ```
+
             log_file_name (str, optional): The log file.
             inference_budget (float, optional): The inference budget.
             optimization_budget (float, optional): The optimization budget.
num_samples (int, optional): The number of samples to evaluate. + -1 means no hard restriction in the number of trials + and the actual number is decided by optimization_budget. Defaults to 1. **config (dict): The search space to update over the default search. - For prompt, please provide a string or a list of strings. + For prompt, please provide a string/Callable or a list of strings/Callables. + - If prompt is provided for chat models, it will be converted to messages under role "user". + - Do not provide both prompt and messages for chat models, but provide either of them. + - A string `prompt` template will be used to generate a prompt for each data instance + using `prompt.format(**data)`. + - A callable `prompt` template will be used to generate a prompt for each data instance + using `prompt(data)`. For stop, please provide a string, a list of strings, or a list of lists of strings. + For messages (chat models only), please provide a list of messages (for a single chat prefix) + or a list of lists of messages (for multiple choices of chat prefix to choose from). + Each message should be a dict with keys "role" and "content". Returns: dict: The optimized hyperparameter setting. @@ -399,9 +497,11 @@ class Completion: """ if ERROR: raise ERROR - space = Completion.default_search_space.copy() + space = cls.default_search_space.copy() if config is not None: space.update(config) + if "messages" in space: + space.pop("prompt", None) temperature = space.pop("temperature", None) top_p = space.pop("top_p", None) if temperature is not None and top_p is None: @@ -415,58 +515,69 @@ class Completion: logger.warning( "temperature and top_p are not recommended to vary together." ) - with diskcache.Cache(cls.cache_path) as cls._cache: - cls._max_valid_n_per_max_tokens, cls._min_invalid_n_per_max_tokens = {}, {} - cls.optimization_budget = optimization_budget - cls.inference_budget = inference_budget - cls._prune_hp = "best_of" if space.get("best_of", 1) != 1 else "n" - cls._prompts = space["prompt"] + cls._max_valid_n_per_max_tokens, cls._min_invalid_n_per_max_tokens = {}, {} + cls.optimization_budget = optimization_budget + cls.inference_budget = inference_budget + cls._prune_hp = "best_of" if space.get("best_of", 1) != 1 else "n" + cls._prompts = space.get("prompt") + if cls._prompts is None: + cls._messages = space.get("messages") + assert isinstance(cls._messages, list) and isinstance( + cls._messages[0], (dict, list) + ), "messages must be a list of dicts or a list of lists." + if isinstance(cls._messages[0], dict): + cls._messages = [cls._messages] + space["messages"] = tune.choice(list(range(len(cls._messages)))) + else: + assert ( + space.get("messages") is None + ), "messages and prompt cannot be provided at the same time." assert isinstance( cls._prompts, (str, list) ), "prompt must be a string or a list of strings." if isinstance(cls._prompts, str): cls._prompts = [cls._prompts] space["prompt"] = tune.choice(list(range(len(cls._prompts)))) - cls._stops = space.get("stop") - if cls._stops: - assert isinstance( - cls._stops, (str, list) - ), "stop must be a string, a list of strings, or a list of lists of strings." 
- if not ( - isinstance(cls._stops, list) and isinstance(cls._stops[0], list) - ): - cls._stops = [cls._stops] - space["stop"] = tune.choice(list(range(len(cls._stops)))) - cls._metric, cls._mode = metric, mode - cls._total_cost = 0 # total optimization cost - cls._eval_func = eval_func - cls.data = data - cls.avg_input_tokens = None + cls._stops = space.get("stop") + if cls._stops: + assert isinstance( + cls._stops, (str, list) + ), "stop must be a string, a list of strings, or a list of lists of strings." + if not (isinstance(cls._stops, list) and isinstance(cls._stops[0], list)): + cls._stops = [cls._stops] + space["stop"] = tune.choice(list(range(len(cls._stops)))) + cls._metric, cls._mode = metric, mode + cls._total_cost = 0 # total optimization cost + cls._eval_func = eval_func + cls.data = data + cls.avg_input_tokens = None + search_alg = BlendSearch( + cost_attr="cost", + cost_budget=optimization_budget, + metric=metric, + mode=mode, + space=space, + ) + if len(space["model"]) > 1: + # start all the models with the same hp config + config0 = search_alg.suggest("t0") + points_to_evaluate = [config0] + for model in space["model"]: + if model != config0["model"]: + point = config0.copy() + point["model"] = model + points_to_evaluate.append(point) search_alg = BlendSearch( cost_attr="cost", cost_budget=optimization_budget, metric=metric, mode=mode, space=space, + points_to_evaluate=points_to_evaluate, ) - if len(space["model"]) > 1: - # start all the models with the same hp config - config0 = search_alg.suggest("t0") - points_to_evaluate = [config0] - for model in space["model"]: - if model != config0["model"]: - point = config0.copy() - point["model"] = model - points_to_evaluate.append(point) - search_alg = BlendSearch( - cost_attr="cost", - cost_budget=optimization_budget, - metric=metric, - mode=mode, - space=space, - points_to_evaluate=points_to_evaluate, - ) + logger.setLevel(logging_level) + with diskcache.Cache(cls.cache_path) as cls._cache: analysis = tune.run( cls.eval, search_alg=search_alg, @@ -474,14 +585,17 @@ class Completion: log_file_name=log_file_name, verbose=3, ) - config = analysis.best_config - params = config.copy() + config = analysis.best_config + params = config.copy() + if cls._prompts: params["prompt"] = cls._prompts[config["prompt"]] - stop = cls._stops and cls._stops[config["stop"]] - params["stop"] = stop - temperature_or_top_p = params.pop("temperature_or_top_p", None) - if temperature_or_top_p: - params.update(temperature_or_top_p) + else: + params["messages"] = cls._messages[config["messages"]] + stop = cls._stops and cls._stops[config["stop"]] + params["stop"] = stop + temperature_or_top_p = params.pop("temperature_or_top_p", None) + if temperature_or_top_p: + params.update(temperature_or_top_p) return params, analysis @classmethod @@ -503,7 +617,43 @@ class Completion: if ERROR: raise ERROR params = config.copy() - params["prompt"] = config["prompt"].format(**context) + prompt = config.get("prompt") + if "messages" in config: + params["messages"] = [ + { + k: v.format(**context) if isinstance(v, str) else v(context) + for k, v in message.items() + } + for message in config["messages"] + ] + params.pop("prompt", None) + elif config["model"] in cls.chat_models: + params["messages"] = [ + { + "role": "user", + "content": prompt.format(**context) + if isinstance(prompt, str) + else prompt(context), + } + ] + params.pop("prompt", None) + else: + params["prompt"] = ( + prompt.format(**context) if isinstance(prompt, str) else prompt(context) + ) if 
use_cache:
-            return cls._get_response(params)
-        return openai.Completion.create(**params)
+            with diskcache.Cache(cls.cache_path) as cls._cache:
+                return cls._get_response(params)
+        return cls.openai_completion_class.create(**params)
+
+
+class ChatCompletion(Completion):
+    """A class for the OpenAI ChatCompletion API."""
+
+    price1K = {
+        "gpt-3.5-turbo": 0.002,
+    }
+
+    default_search_space = Completion.default_search_space.copy()
+    default_search_space["model"] = tune.choice(list(price1K.keys()))
+    openai_completion_class = not ERROR and openai.ChatCompletion
diff --git a/flaml/version.py b/flaml/version.py
index 0b2f79dbb..c68196d1c 100644
--- a/flaml/version.py
+++ b/flaml/version.py
@@ -1 +1 @@
-__version__ = "1.1.3"
+__version__ = "1.2.0"
diff --git a/notebook/integrate_chatgpt_code.ipynb b/notebook/integrate_chatgpt_code.ipynb
new file mode 100644
index 000000000..735881d97
--- /dev/null
+++ b/notebook/integrate_chatgpt_code.ipynb
@@ -0,0 +1,1082 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {
+    "slideshow": {
+     "slide_type": "slide"
+    }
+   },
+   "source": [
+    "Copyright (c) Microsoft Corporation. All rights reserved. \n",
+    "\n",
+    "Licensed under the MIT License.\n",
+    "\n",
+    "# Use FLAML to Tune ChatGPT\n",
+    "\n",
+    "In this notebook, we tune the OpenAI ChatGPT model for code generation. We use [the HumanEval benchmark](https://huggingface.co/datasets/openai_humaneval) released by OpenAI for synthesizing programs from docstrings. \n",
+    "\n",
+    "## Requirements\n",
+    "\n",
+    "FLAML requires `Python>=3.7`. To run this notebook example, please install flaml with the [openai] option:\n",
+    "```bash\n",
+    "pip install flaml[openai]==1.2.0\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-02-13T23:40:52.317406Z",
+     "iopub.status.busy": "2023-02-13T23:40:52.316561Z",
+     "iopub.status.idle": "2023-02-13T23:40:52.321193Z",
+     "shell.execute_reply": "2023-02-13T23:40:52.320628Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# %pip install flaml[openai]==1.2.0 datasets"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Set your OpenAI key:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-02-13T23:40:52.324240Z",
+     "iopub.status.busy": "2023-02-13T23:40:52.323783Z",
+     "iopub.status.idle": "2023-02-13T23:40:52.330570Z",
+     "shell.execute_reply": "2023-02-13T23:40:52.329750Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "if \"OPENAI_API_KEY\" not in os.environ:\n",
+    "    os.environ[\"OPENAI_API_KEY\"] = \"\""
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "When ChatGPT is available in Azure OpenAI, uncomment the following to use Azure OpenAI:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-02-13T23:40:52.333547Z",
+     "iopub.status.busy": "2023-02-13T23:40:52.333249Z",
+     "iopub.status.idle": "2023-02-13T23:40:52.336508Z",
+     "shell.execute_reply": "2023-02-13T23:40:52.335858Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# openai.api_type = \"azure\"\n",
+    "# openai.api_base = \"https://.openai.azure.com/\"\n",
+    "# openai.api_version = \"2023-3-01\""
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load dataset\n",
+    "\n",
+    "First, we load the humaneval dataset. 
The dataset contains 164 examples. We use the first 20 for tuning the generation hyperparameters and the remaining for evaluation. In each example, the \"prompt\" is the prompt string for eliciting the code generation, \"test\" is the Python unit test code for the example, and \"entry_point\" is the function name to be tested."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-02-13T23:40:52.339977Z",
+     "iopub.status.busy": "2023-02-13T23:40:52.339556Z",
+     "iopub.status.idle": "2023-02-13T23:40:54.603349Z",
+     "shell.execute_reply": "2023-02-13T23:40:54.602630Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Found cached dataset openai_humaneval (/home/vscode/.cache/huggingface/datasets/openai_humaneval/openai_humaneval/1.0.0/2955cebd73602e828fa8c0a424c594e5fab4ec863b316ca98f3d8fdb6a626e75)\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "aac289608bce4a808e224c0a09e1e8cf",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       " 0%|          | 0/1 [00:00 [0,0,0,0,3,3]\n",
+       "    compare([0,5,0,0,0,4],[4,1,1,0,0,-2]) -> [4,4,1,0,0,6]\n",
+       "    \"\"\"\n",
+       "\n"
+      ]
+     }
+    ],
+    "source": [
+     "print(tune_data[1][\"prompt\"])"
+    ]
+   },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here is one example of the unit test code for verifying the correctness of the generated code:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-02-13T23:40:54.613590Z",
+     "iopub.status.busy": "2023-02-13T23:40:54.613168Z",
+     "iopub.status.idle": "2023-02-13T23:40:54.616873Z",
+     "shell.execute_reply": "2023-02-13T23:40:54.616193Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "def check(candidate):\n",
+      "\n",
+      "    # Check some simple cases\n",
+      "    assert candidate([1,2,3,4,5,1],[1,2,3,4,2,-2])==[0,0,0,0,3,3], \"This prints if this assert fails 1 (good for debugging!)\"\n",
+      "    assert candidate([0,0,0,0,0,0],[0,0,0,0,0,0])==[0,0,0,0,0,0], \"This prints if this assert fails 1 (good for debugging!)\"\n",
+      "    assert candidate([1,2,3],[-1,-2,-3])==[2,4,6], \"This prints if this assert fails 1 (good for debugging!)\"\n",
+      "    assert candidate([1,2,3,5],[-1,2,3,4])==[2,0,0,1], \"This prints if this assert fails 1 (good for debugging!)\"\n",
+      "\n",
+      "    # Check some edge cases that are easy to work out by hand.\n",
+      "    assert True, \"This prints if this assert fails 2 (also good for debugging!)\"\n",
+      "\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(tune_data[1][\"test\"])"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Define Success Metric\n",
+    "\n",
+    "Before we start tuning, we need to define the success metric we want to optimize. For each code generation task, if one of the returned responses can pass the test, we consider the task successfully solved. Then we can define the mean success rate of a collection of tasks.\n",
+    "\n",
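+    "As a note on the `expected_success` metric used below (a reasoning sketch, not part of the benchmark): if $s$ of the $n$ returned responses pass the test, and each response is modeled as passing independently with probability $s/n$, then the probability that at least one of $n$ sampled responses passes is\n",
+    "\n",
+    "$$1 - (1 - s/n)^n,$$\n",
+    "\n",
+    "which is exactly the `expected_success` value computed in the evaluation function defined later.\n",
+    "\n",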
+    "### Define a code executor\n",
+    "\n",
+    "First, we write a simple code executor. The code executor takes the generated code and the test code as input, and executes them with a timer."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-02-13T23:40:54.619618Z",
+     "iopub.status.busy": "2023-02-13T23:40:54.619218Z",
+     "iopub.status.idle": "2023-02-13T23:40:54.624272Z",
+     "shell.execute_reply": "2023-02-13T23:40:54.623664Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import signal\n",
+    "import subprocess\n",
+    "import sys\n",
+    "\n",
+    "def timeout_handler(signum, frame):\n",
+    "    raise TimeoutError(\"Timed out!\")\n",
+    "\n",
+    "signal.signal(signal.SIGALRM, timeout_handler)\n",
+    "max_exec_time = 3  # seconds\n",
+    "\n",
+    "def execute_code(code):\n",
+    "    code = code.strip()\n",
+    "    with open(\"codetest.py\", \"w\") as fout:\n",
+    "        fout.write(code)\n",
+    "    try:\n",
+    "        signal.alarm(max_exec_time)\n",
+    "        result = subprocess.run(\n",
+    "            [sys.executable, \"codetest.py\"],\n",
+    "            stdout=subprocess.DEVNULL,\n",
+    "            stderr=subprocess.PIPE,\n",
+    "        )\n",
+    "        signal.alarm(0)\n",
+    "    except TimeoutError:\n",
+    "        return 0\n",
+    "    return int(result.returncode == 0)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This function will create a temp file \"codetest.py\" and execute it in a separate process. It allows the code 3 seconds to finish.\n",
+    "\n",
+    "### Define a function to evaluate the success for a given program synthesis task"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-02-13T23:40:54.626998Z",
+     "iopub.status.busy": "2023-02-13T23:40:54.626593Z",
+     "iopub.status.idle": "2023-02-13T23:40:54.631383Z",
+     "shell.execute_reply": "2023-02-13T23:40:54.630770Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "def success_metrics(responses, prompt, test, entry_point):\n",
+    "    \"\"\"Check if the task is successful.\n",
+    "\n",
+    "    Args:\n",
+    "        responses (list): The list of responses.\n",
+    "        prompt (str): The input prompt.\n",
+    "        test (str): The test code.\n",
+    "        entry_point (str): The name of the function.\n",
+    "\n",
+    "    Returns:\n",
+    "        dict: The success metrics.\n",
+    "    \"\"\"\n",
+    "    success_list = []\n",
+    "    n = len(responses)\n",
+    "    for i in range(n):\n",
+    "        response = responses[i]\n",
+    "        code = f\"{prompt}{response}\\n{test}\\ncheck({entry_point})\"\n",
+    "        succeed = execute_code(code)\n",
+    "        success_list.append(succeed)\n",
+    "    return {\n",
+    "        \"expected_success\": 1 - pow(1 - sum(success_list) / n, n),\n",
+    "        \"success\": any(s for s in success_list),\n",
+    "    }\n"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {
+    "slideshow": {
+     "slide_type": "slide"
+    }
+   },
+   "source": [
+    "## Use the tuning data to find a good configuration\n",
+    "\n",
+    "### Import the oai and tune subpackages from flaml.\n",
+    "\n",
+    "FLAML provides an API for hyperparameter optimization of OpenAI ChatGPT completions, `oai.ChatCompletion.tune`, and an API for making a request with the tuned config, `oai.ChatCompletion.create`. First, we import oai from flaml:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-02-13T23:40:54.634335Z",
+     "iopub.status.busy": "2023-02-13T23:40:54.633929Z",
+     "iopub.status.idle": "2023-02-13T23:40:56.105700Z",
+     "shell.execute_reply": "2023-02-13T23:40:56.105085Z"
+    },
+    "slideshow": {
+     "slide_type": "slide"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from flaml import oai"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For (local) reproducibility and cost efficiency, we cache responses from OpenAI."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-02-13T23:40:56.109177Z",
+     "iopub.status.busy": "2023-02-13T23:40:56.108624Z",
+     "iopub.status.idle": "2023-02-13T23:40:56.112651Z",
+     "shell.execute_reply": "2023-02-13T23:40:56.112076Z"
+    },
+    "slideshow": {
+     "slide_type": "slide"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "oai.ChatCompletion.set_cache(seed)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This will create a disk cache in \".cache/{seed}\". You can change `cache_path` in `set_cache()`. Caches for different seeds are stored separately.\n",
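+    "For example, the following hypothetical call (the directory name is arbitrary; `seed` and `cache_path` are the parameters of `set_cache` in this version) stores the cache under a custom path:\n",
+    "\n",
+    "```python\n",
+    "# store the disk cache under /tmp/flaml_cache/{seed} instead of .cache/{seed}\n",
+    "oai.ChatCompletion.set_cache(seed=41, cache_path=\"/tmp/flaml_cache\")\n",
+    "```\n",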
+    "\n",
+    "### Perform tuning\n",
+    "\n",
+    "The tuning will take a while to finish, depending on the optimization budget (~5 mins for the budget used in the following example). The tuning will be performed under the specified optimization budgets.\n",
+    "\n",
+    "* `inference_budget` is the target average inference budget per instance in the benchmark. For example, 0.002 means the target inference budget is 0.002 dollars, which translates to 1000 tokens (input + output combined) if the gpt-3.5-turbo model is used.\n",
+    "* `optimization_budget` is the total budget allowed to perform the tuning. For example, 0.1 means 0.1 dollars are allowed in total, which translates to 50K tokens for the gpt-3.5-turbo model.\n",
+    "* `num_samples` is the number of different hyperparameter configurations allowed to try. The tuning will stop after either `num_samples` trials are completed or `optimization_budget` dollars are spent, whichever happens first. -1 means no hard restriction on the number of trials; the actual number is decided by `optimization_budget`.\n",
+    "\n",
+    "Users can specify tuning data, optimization metric, optimization mode, evaluation function, search spaces, etc. The default search space is:\n",
+    "\n",
+    "```python\n",
+    "price1K = {\n",
+    "    \"gpt-3.5-turbo\": 0.002,\n",
+    "}\n",
+    "\n",
+    "default_search_space = {\n",
+    "    \"model\": tune.choice(list(price1K.keys())),\n",
+    "    \"temperature_or_top_p\": tune.choice(\n",
+    "        [\n",
+    "            {\"temperature\": tune.uniform(0, 1)},\n",
+    "            {\"top_p\": tune.uniform(0, 1)},\n",
+    "        ]\n",
+    "    ),\n",
+    "    \"max_tokens\": tune.lograndint(50, 1000),\n",
+    "    \"n\": tune.randint(1, 100),\n",
+    "    \"prompt\": \"{prompt}\",\n",
+    "}\n",
+    "```\n",
+    "\n",
+    "The default search space can be overridden by users' input.\n",
+    "For example, the following code specifies four choices for the prompt and a fixed stop sequence and number of completions. For hyperparameters which don't appear in users' input, the default search space will be used.\n",
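+    "\n",
+    "As an aside, a minimal sketch that overrides only the temperature range and keeps every other default might look like the following (an illustration only, assuming `tune` is imported from flaml as in the search space above; the budget values are arbitrary):\n",
+    "\n",
+    "```python\n",
+    "from flaml import tune\n",
+    "\n",
+    "# search temperature in a narrower range; hyperparameters not listed here\n",
+    "# (max_tokens, n, prompt, ...) keep the default search space shown above\n",
+    "config, analysis = oai.ChatCompletion.tune(\n",
+    "    data=tune_data,\n",
+    "    metric=\"expected_success\",\n",
+    "    mode=\"max\",\n",
+    "    eval_func=success_metrics,\n",
+    "    inference_budget=0.002,\n",
+    "    optimization_budget=0.05,\n",
+    "    num_samples=-1,\n",
+    "    temperature=tune.uniform(0.2, 0.8),\n",
+    ")\n",
+    "```"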
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:56.115383Z", + "iopub.status.busy": "2023-02-13T23:40:56.114975Z", + "iopub.status.idle": "2023-02-13T23:41:55.045654Z", + "shell.execute_reply": "2023-02-13T23:41:55.044973Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[32m[I 2023-03-04 03:34:16,379]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[flaml.tune.tune: 03-04 03:34:16] {811} INFO - trial 1 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.36280922847807595}, 'max_tokens': 347, 'n': 1, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-04 03:34:17] {215} INFO - result: {'expected_success': 0.25, 'success': 0.25, 'total_cost': 0.00971, 'cost': 0.00971, 'inference_cost': 0.0004855, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.36280922847807595}, 'max_tokens': 347, 'n': 1, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'top_p': 0.36280922847807595}, 'config/max_tokens': 347, 'config/n': 1, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.5723528861999512}\n", + "[flaml.tune.tune: 03-04 03:34:17] {811} INFO - trial 2 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.6336482349262754}, 'max_tokens': 470, 'prompt': 3, 'stop': 0, 'n': 1}\n", + "[flaml.tune.tune: 03-04 03:34:18] {215} INFO - result: {'expected_success': 0.6, 'success': 0.6, 'total_cost': 0.019959999999999995, 'cost': 0.01025, 'inference_cost': 0.0005124999999999999, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.6336482349262754}, 'max_tokens': 470, 'prompt': 3, 'stop': 0, 'n': 1}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.6336482349262754}, 'config/max_tokens': 470, 'config/prompt': 3, 'config/stop': 0, 'config/n': 1, 'experiment_tag': 'exp', 'time_total_s': 0.8377933502197266}\n", + "[flaml.tune.tune: 03-04 03:34:18] {811} INFO - trial 3 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.6853598183677972}, 'max_tokens': 869, 'prompt': 2, 'stop': 0, 'n': 1}\n", + "[flaml.tune.tune: 03-04 03:34:18] {215} INFO - result: {'expected_success': 0.75, 'success': 0.75, 'total_cost': 0.02982599999999999, 'cost': 0.009866, 'inference_cost': 0.0004933, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.6853598183677972}, 'max_tokens': 869, 'prompt': 2, 'stop': 0, 'n': 1}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.6853598183677972}, 'config/max_tokens': 869, 'config/prompt': 2, 'config/stop': 0, 'config/n': 1, 'experiment_tag': 'exp', 'time_total_s': 0.5643947124481201}\n", + "[flaml.tune.tune: 03-04 03:34:18] {811} INFO - trial 4 config: {'model': 'gpt-3.5-turbo', 'max_tokens': 879, 'prompt': 2, 'stop': 0, 'n': 1, 'temperature_or_top_p': {'temperature': 0.7857528212930487}}\n", + "[flaml.tune.tune: 03-04 03:34:19] {215} INFO - result: {'expected_success': 0.7, 'success': 0.7, 'total_cost': 0.039289999999999985, 'cost': 0.009464, 'inference_cost': 0.0004732, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'max_tokens': 879, 'prompt': 2, 'stop': 0, 'n': 1, 'temperature_or_top_p': {'temperature': 
0.7857528212930487}}, 'config/model': 'gpt-3.5-turbo', 'config/max_tokens': 879, 'config/prompt': 2, 'config/stop': 0, 'config/n': 1, 'config/temperature_or_top_p': {'temperature': 0.7857528212930487}, 'experiment_tag': 'exp', 'time_total_s': 0.615419864654541}\n", + "[flaml.tune.tune: 03-04 03:34:19] {811} INFO - trial 5 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9177741225129434}, 'max_tokens': 424, 'prompt': 3, 'stop': 0, 'n': 1}\n", + "[flaml.tune.tune: 03-04 03:34:20] {215} INFO - result: {'expected_success': 0.65, 'success': 0.65, 'total_cost': 0.049551999999999985, 'cost': 0.010261999999999999, 'inference_cost': 0.0005131, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9177741225129434}, 'max_tokens': 424, 'prompt': 3, 'stop': 0, 'n': 1}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.9177741225129434}, 'config/max_tokens': 424, 'config/prompt': 3, 'config/stop': 0, 'config/n': 1, 'experiment_tag': 'exp', 'time_total_s': 0.968498945236206}\n", + "[flaml.tune.tune: 03-04 03:34:20] {811} INFO - trial 6 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.6177669784693172}, 'max_tokens': 231, 'prompt': 2, 'stop': 0, 'n': 1}\n", + "[flaml.tune.tune: 03-04 03:34:20] {215} INFO - result: {'expected_success': 0.75, 'success': 0.75, 'total_cost': 0.059558, 'cost': 0.010006000000000001, 'inference_cost': 0.0005003, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.6177669784693172}, 'max_tokens': 231, 'prompt': 2, 'stop': 0, 'n': 1}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.6177669784693172}, 'config/max_tokens': 231, 'config/prompt': 2, 'config/stop': 0, 'config/n': 1, 'experiment_tag': 'exp', 'time_total_s': 0.6833891868591309}\n", + "[flaml.tune.tune: 03-04 03:34:20] {811} INFO - trial 7 config: {'model': 'gpt-3.5-turbo', 'max_tokens': 342, 'prompt': 2, 'stop': 0, 'n': 1, 'temperature_or_top_p': {'temperature': 0.4957816798898031}}\n", + "[flaml.tune.tune: 03-04 03:34:21] {215} INFO - result: {'expected_success': 0.75, 'success': 0.75, 'total_cost': 0.069498, 'cost': 0.009940000000000001, 'inference_cost': 0.000497, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'max_tokens': 342, 'prompt': 2, 'stop': 0, 'n': 1, 'temperature_or_top_p': {'temperature': 0.4957816798898031}}, 'config/model': 'gpt-3.5-turbo', 'config/max_tokens': 342, 'config/prompt': 2, 'config/stop': 0, 'config/n': 1, 'config/temperature_or_top_p': {'temperature': 0.4957816798898031}, 'experiment_tag': 'exp', 'time_total_s': 0.5958354473114014}\n", + "[flaml.tune.tune: 03-04 03:34:21] {811} INFO - trial 8 config: {'model': 'gpt-3.5-turbo', 'max_tokens': 156, 'prompt': 2, 'stop': 0, 'n': 1, 'temperature_or_top_p': {'temperature': 0.7397522770488312}}\n", + "[flaml.tune.tune: 03-04 03:34:22] {215} INFO - result: {'expected_success': 0.6, 'success': 0.6, 'total_cost': 0.07878399999999999, 'cost': 0.009286000000000003, 'inference_cost': 0.00046430000000000006, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'max_tokens': 156, 'prompt': 2, 'stop': 0, 'n': 1, 'temperature_or_top_p': {'temperature': 0.7397522770488312}}, 'config/model': 'gpt-3.5-turbo', 'config/max_tokens': 156, 'config/prompt': 2, 'config/stop': 0, 'config/n': 1, 'config/temperature_or_top_p': {'temperature': 0.7397522770488312}, 'experiment_tag': 'exp', 'time_total_s': 0.5807592868804932}\n", 
+ "[flaml.tune.tune: 03-04 03:34:22] {811} INFO - trial 9 config: {'model': 'gpt-3.5-turbo', 'max_tokens': 201, 'prompt': 3, 'stop': 0, 'n': 1, 'temperature_or_top_p': {'temperature': 0.6833066621306901}}\n", + "[flaml.tune.tune: 03-04 03:34:22] {215} INFO - result: {'expected_success': 0.7, 'success': 0.7, 'total_cost': 0.088372, 'cost': 0.009588000000000001, 'inference_cost': 0.0004794, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'max_tokens': 201, 'prompt': 3, 'stop': 0, 'n': 1, 'temperature_or_top_p': {'temperature': 0.6833066621306901}}, 'config/model': 'gpt-3.5-turbo', 'config/max_tokens': 201, 'config/prompt': 3, 'config/stop': 0, 'config/n': 1, 'config/temperature_or_top_p': {'temperature': 0.6833066621306901}, 'experiment_tag': 'exp', 'time_total_s': 0.756892204284668}\n", + "[flaml.tune.tune: 03-04 03:34:22] {811} INFO - trial 10 config: {'model': 'gpt-3.5-turbo', 'max_tokens': 266, 'prompt': 1, 'stop': 0, 'n': 1, 'temperature_or_top_p': {'temperature': 0.5522272948079442}}\n", + "[flaml.tune.tune: 03-04 03:34:23] {215} INFO - result: {'expected_success': 0.35, 'success': 0.35, 'total_cost': 0.09748600000000002, 'cost': 0.009114, 'inference_cost': 0.0004557, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'max_tokens': 266, 'prompt': 1, 'stop': 0, 'n': 1, 'temperature_or_top_p': {'temperature': 0.5522272948079442}}, 'config/model': 'gpt-3.5-turbo', 'config/max_tokens': 266, 'config/prompt': 1, 'config/stop': 0, 'config/n': 1, 'config/temperature_or_top_p': {'temperature': 0.5522272948079442}, 'experiment_tag': 'exp', 'time_total_s': 0.5654494762420654}\n", + "[flaml.tune.tune: 03-04 03:34:23] {811} INFO - trial 11 config: {'model': 'gpt-3.5-turbo', 'max_tokens': 218, 'prompt': 3, 'stop': 0, 'n': 1, 'temperature_or_top_p': {'temperature': 0.6240777950749403}}\n", + "[flaml.tune.tune: 03-04 03:34:23] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.10044800000000001, 'cost': 0.0029620000000000002, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'max_tokens': 218, 'prompt': 3, 'stop': 0, 'n': 1, 'temperature_or_top_p': {'temperature': 0.6240777950749403}}, 'config/model': 'gpt-3.5-turbo', 'config/max_tokens': 218, 'config/prompt': 3, 'config/stop': 0, 'config/n': 1, 'config/temperature_or_top_p': {'temperature': 0.6240777950749403}, 'experiment_tag': 'exp', 'time_total_s': 0.003355741500854492}\n", + "[flaml.tune.tune: 03-04 03:34:23] {834} WARNING - fail to sample a trial for 100 times in a row, stopping.\n" + ] + } + ], + "source": [ + "import logging\n", + "\n", + "config, analysis = oai.ChatCompletion.tune(\n", + " data=tune_data, # the data for tuning\n", + " metric=\"expected_success\", # the metric to optimize\n", + " mode=\"max\", # the optimization mode\n", + " eval_func=success_metrics, # the evaluation function to return the success metrics\n", + " # log_file_name=\"logs/humaneval.log\", # the log file name\n", + " inference_budget=0.002, # the inference budget (dollar)\n", + " optimization_budget=0.1, # the optimization budget (dollar)\n", + " # num_samples can further limit the number of trials for different hyperparameter configurations;\n", + " # -1 means decided by the optimization budget only\n", + " num_samples=-1,\n", + " prompt=[\n", + " \"{prompt}\",\n", + " \"# Python 3{prompt}\",\n", + " \"Complete the following Python function:{prompt}\",\n", + " \"Complete the following Python function while including necessary import statements inside the function:{prompt}\",\n", + " ], # the prompt templates to 
choose from\n", + " stop=[\"\\nprint\"], # the stop sequence\n", + " logging_level=logging.INFO, # the logging level\n", + " n=1, # the number of responses to generate\n", + ")\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Output tuning results\n", + "\n", + "After the tuning, we can print out the config and the result found by FLAML:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:41:55.049204Z", + "iopub.status.busy": "2023-02-13T23:41:55.048871Z", + "iopub.status.idle": "2023-02-13T23:41:55.053284Z", + "shell.execute_reply": "2023-02-13T23:41:55.052574Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "optimized config {'model': 'gpt-3.5-turbo', 'max_tokens': 869, 'prompt': 'Complete the following Python function:{prompt}', 'stop': ['\\nprint'], 'n': 1, 'temperature': 0.6853598183677972}\n", + "best result on tuning data {'expected_success': 0.75, 'success': 0.75, 'total_cost': 0.02982599999999999, 'cost': 0.009866, 'inference_cost': 0.0004933, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.6853598183677972}, 'max_tokens': 869, 'prompt': 2, 'stop': 0, 'n': 1}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.6853598183677972}, 'config/max_tokens': 869, 'config/prompt': 2, 'config/stop': 0, 'config/n': 1, 'experiment_tag': 'exp', 'time_total_s': 0.5643947124481201}\n" + ] + } + ], + "source": [ + "print(\"optimized config\", config)\n", + "print(\"best result on tuning data\", analysis.best_result)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### Make a request with the tuned config\n", + "\n", + "We can apply the tuned config on the request for an example task:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:41:55.056205Z", + "iopub.status.busy": "2023-02-13T23:41:55.055631Z", + "iopub.status.idle": "2023-02-13T23:41:56.039259Z", + "shell.execute_reply": "2023-02-13T23:41:56.038427Z" + }, + "slideshow": { + "slide_type": "subslide" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"choices\": [\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 0,\n", + " \"message\": {\n", + " \"content\": \"\\n\\ndef compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n if game[i] == guess[i]:\\n result.append(0)\\n else:\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\\n\\n\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " }\n", + " ],\n", + " \"created\": 1677891117,\n", + " \"id\": \"chatcmpl-6qAPt1bjNEM80fK4JDOT3RqP3POjA\",\n", + " \"model\": \"gpt-3.5-turbo-0301\",\n", + " \"object\": \"chat.completion\",\n", + " \"usage\": {\n", + " \"completion_tokens\": 52,\n", + " \"prompt_tokens\": 237,\n", + " \"total_tokens\": 289\n", + " }\n", + "}\n", + "{'expected_success': 1.0, 'success': True}\n" + ] + } + ], + "source": [ + "responses = oai.ChatCompletion.create(context=tune_data[1], **config)\n", + "print(responses)\n", + "print(success_metrics([response[\"message\"][\"content\"].rstrip() for response in responses[\"choices\"]], **tune_data[1]))\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "### Evaluate the success rate on the test data\n", + "\n", + "You can use flaml's `oai.ChatCompletion.eval` to evaluate the performance of an entire dataset with the tuned config. To do that you need to set `oai.ChatCompletion.data` to the data to evaluate. The following code will take a while to evaluate all the 144 test data instances." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:41:56.042764Z", + "iopub.status.busy": "2023-02-13T23:41:56.042086Z", + "iopub.status.idle": "2023-02-13T23:53:05.597643Z", + "shell.execute_reply": "2023-02-13T23:53:05.596603Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'expected_success': 0.7152777777777778, 'success': 0.7152777777777778, 'total_cost': 0.17079400000000003, 'cost': 0.07034599999999996, 'inference_cost': 0.0004885138888888889}\n" + ] + } + ], + "source": [ + "oai.ChatCompletion.data = test_data\n", + "result = oai.ChatCompletion.eval(analysis.best_config, prune=False, eval_only=True)\n", + "print(result)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.15 (main, Oct 26 2022, 03:47:43) \n[GCC 10.2.1 20210110]" + }, + "vscode": { + "interpreter": { + "hash": "949777d72b0d2535278d3dc13498b2535136f6dfe0678499012e853ee9abcab1" + } + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": { + "2d910cfd2d2a4fc49fc30fbbdc5576a7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "454146d0f7224f038689031002906e6f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_e4ae2b6f5a974fd4bafb6abb9d12ff26", + 
"IPY_MODEL_577e1e3cc4db4942b0883577b3b52755", + "IPY_MODEL_b40bdfb1ac1d4cffb7cefcb870c64d45" + ], + "layout": "IPY_MODEL_dc83c7bff2f241309537a8119dfc7555", + "tabbable": null, + "tooltip": null + } + }, + "577e1e3cc4db4942b0883577b3b52755": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_2d910cfd2d2a4fc49fc30fbbdc5576a7", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_74a6ba0c3cbc4051be0a83e152fe1e62", + "tabbable": null, + "tooltip": null, + "value": 1 + } + }, + "6086462a12d54bafa59d3c4566f06cb2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "74a6ba0c3cbc4051be0a83e152fe1e62": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "7d3f3d9e15894d05a4d188ff4f466554": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "HTMLStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "StyleView", + "background": null, + "description_width": "", + "font_size": null, + "text_color": null + } + }, + "b40bdfb1ac1d4cffb7cefcb870c64d45": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "HTMLModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "HTMLView", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_f1355871cc6f4dd4b50d9df5af20e5c8", + "placeholder": "​", + "style": "IPY_MODEL_ca245376fd9f4354af6b2befe4af4466", + "tabbable": null, + "tooltip": null, + "value": " 1/1 [00:00<00:00, 44.69it/s]" + } + }, + "ca245376fd9f4354af6b2befe4af4466": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "HTMLStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "StyleView", + "background": null, + "description_width": "", + "font_size": null, + "text_color": null + } + }, + "dc83c7bff2f241309537a8119dfc7555": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e4ae2b6f5a974fd4bafb6abb9d12ff26": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "HTMLView", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_6086462a12d54bafa59d3c4566f06cb2", + "placeholder": "​", + "style": "IPY_MODEL_7d3f3d9e15894d05a4d188ff4f466554", + "tabbable": null, + "tooltip": null, + "value": "100%" + } + }, + "f1355871cc6f4dd4b50d9df5af20e5c8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + 
"grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + } + }, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebook/integrate_chatgpt_math.ipynb b/notebook/integrate_chatgpt_math.ipynb new file mode 100644 index 000000000..66392efc6 --- /dev/null +++ b/notebook/integrate_chatgpt_math.ipynb @@ -0,0 +1,1386 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Copyright (c) Microsoft Corporation. All rights reserved. \n", + "\n", + "Licensed under the MIT License.\n", + "\n", + "# Use FLAML to Tune ChatGPT\n", + "\n", + "In this notebook, we tune OpenAI ChatGPT model for math problem solving. We use [the MATH benchmark](https://crfm.stanford.edu/helm/latest/?group=math_chain_of_thought) for measuring mathematical problem solving on competition math problems with chain-of-thoughts style reasoning. \n", + "\n", + "## Requirements\n", + "\n", + "FLAML requires `Python>=3.7`. To run this notebook example, please install flaml with the [openai] option:\n", + "```bash\n", + "pip install flaml[openai]==1.2.0\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:52.317406Z", + "iopub.status.busy": "2023-02-13T23:40:52.316561Z", + "iopub.status.idle": "2023-02-13T23:40:52.321193Z", + "shell.execute_reply": "2023-02-13T23:40:52.320628Z" + } + }, + "outputs": [], + "source": [ + "# %pip install flaml[openai]==1.2.0 datasets" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set your OpenAI key:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:52.324240Z", + "iopub.status.busy": "2023-02-13T23:40:52.323783Z", + "iopub.status.idle": "2023-02-13T23:40:52.330570Z", + "shell.execute_reply": "2023-02-13T23:40:52.329750Z" + } + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "if \"OPENAI_API_KEY\" not in os.environ:\n", + " os.environ[\"OPENAI_API_KEY\"] = \"\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When ChatGPT is available in Azure OpenAI, uncomment the following to use Azure OpenAI:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:52.333547Z", + "iopub.status.busy": "2023-02-13T23:40:52.333249Z", + "iopub.status.idle": "2023-02-13T23:40:52.336508Z", + "shell.execute_reply": "2023-02-13T23:40:52.335858Z" + } + }, + "outputs": [], + "source": [ + "# openai.api_type = \"azure\"\n", + "# openai.api_base = \"https://.openai.azure.com/\"\n", + "# openai.api_version = \"2023-3-01\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load dataset\n", + "\n", + 
"First, we load the competition_math dataset. The dataset contains 457 \"Level 1\" examples. We use a random sample of 20 examples for tuning the generation hyperparameters and the remaining for evaluation. We use one demonstration example in the prompt." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:52.339977Z", + "iopub.status.busy": "2023-02-13T23:40:52.339556Z", + "iopub.status.idle": "2023-02-13T23:40:54.603349Z", + "shell.execute_reply": "2023-02-13T23:40:54.602630Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Found cached dataset competition_math (/home/vscode/.cache/huggingface/datasets/competition_math/default/1.0.0/2a2a2995c2847186883ecd64f69be7d602b8a6f6b51950624d4dc2263f93333b)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "79ced88ccf474030bda228436813e94b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/2 [00:00 Optional[str]:\n", + " \"\"\"Source: https://github.com/hendrycks/math\n", + " Extract the text within a \\\\boxed{...} environment.\n", + " Example:\n", + " >>> remove_boxed(\\\\boxed{\\\\frac{2}{3}})\n", + " \\\\frac{2}{3}\n", + " \"\"\"\n", + " left = \"\\\\boxed{\"\n", + " try:\n", + " assert string[: len(left)] == left\n", + " assert string[-1] == \"}\"\n", + " return string[len(left) : -1]\n", + " except Exception:\n", + " return None\n", + "\n", + "\n", + "def last_boxed_only_string(string: str) -> Optional[str]:\n", + " \"\"\"Source: https://github.com/hendrycks/math\n", + " Extract the last \\\\boxed{...} or \\\\fbox{...} element from a string.\n", + " \"\"\"\n", + " idx = string.rfind(\"\\\\boxed\")\n", + " if idx < 0:\n", + " idx = string.rfind(\"\\\\fbox\")\n", + " if idx < 0:\n", + " return None\n", + "\n", + " i = idx\n", + " right_brace_idx = None\n", + " num_left_braces_open = 0\n", + " while i < len(string):\n", + " if string[i] == \"{\":\n", + " num_left_braces_open += 1\n", + " if string[i] == \"}\":\n", + " num_left_braces_open -= 1\n", + " if num_left_braces_open == 0:\n", + " right_brace_idx = i\n", + " break\n", + " i += 1\n", + "\n", + " if right_brace_idx is None:\n", + " retval = None\n", + " else:\n", + " retval = string[idx : right_brace_idx + 1]\n", + "\n", + " return retval\n", + "\n", + "\n", + "def _fix_fracs(string: str) -> str:\n", + " \"\"\"Source: https://github.com/hendrycks/math\n", + " Reformat fractions.\n", + " Examples:\n", + " >>> _fix_fracs(\"\\\\frac1b\")\n", + " \\frac{1}{b}\n", + " >>> _fix_fracs(\"\\\\frac12\")\n", + " \\frac{1}{2}\n", + " >>> _fix_fracs(\"\\\\frac1{72}\")\n", + " \\frac{1}{72}\n", + " \"\"\"\n", + " substrs = string.split(\"\\\\frac\")\n", + " new_str = substrs[0]\n", + " if len(substrs) > 1:\n", + " substrs = substrs[1:]\n", + " for substr in substrs:\n", + " new_str += \"\\\\frac\"\n", + " if substr[0] == \"{\":\n", + " new_str += substr\n", + " else:\n", + " try:\n", + " assert len(substr) >= 2\n", + " except Exception:\n", + " return string\n", + " a = substr[0]\n", + " b = substr[1]\n", + " if b != \"{\":\n", + " if len(substr) > 2:\n", + " post_substr = substr[2:]\n", + " new_str += \"{\" + a + \"}{\" + b + \"}\" + post_substr\n", + " else:\n", + " new_str += \"{\" + a + \"}{\" + b + \"}\"\n", + " else:\n", + " if len(substr) > 2:\n", + " post_substr = substr[2:]\n", + " new_str += \"{\" + a + \"}\" + b + post_substr\n", + " else:\n", + " new_str += \"{\" + a + \"}\" + b\n", + " string = 
new_str\n", + " return string\n", + "\n", + "\n", + "def _fix_a_slash_b(string: str) -> str:\n", + " \"\"\"Source: https://github.com/hendrycks/math\n", + " Reformat fractions formatted as a/b to \\\\frac{a}{b}.\n", + " Example:\n", + " >>> _fix_a_slash_b(\"2/3\")\n", + " \\frac{2}{3}\n", + " \"\"\"\n", + " if len(string.split(\"/\")) != 2:\n", + " return string\n", + " a_str = string.split(\"/\")[0]\n", + " b_str = string.split(\"/\")[1]\n", + " try:\n", + " a = int(a_str)\n", + " b = int(b_str)\n", + " assert string == \"{}/{}\".format(a, b)\n", + " new_string = \"\\\\frac{\" + str(a) + \"}{\" + str(b) + \"}\"\n", + " return new_string\n", + " except Exception:\n", + " return string\n", + "\n", + "\n", + "def _remove_right_units(string: str) -> str:\n", + " \"\"\"Source: https://github.com/hendrycks/math\n", + " Remove units (on the right).\n", + " \"\\\\text{ \" only ever occurs (at least in the val set) when describing units.\n", + " \"\"\"\n", + " if \"\\\\text{ \" in string:\n", + " splits = string.split(\"\\\\text{ \")\n", + " assert len(splits) == 2\n", + " return splits[0]\n", + " else:\n", + " return string\n", + "\n", + "\n", + "def _fix_sqrt(string: str) -> str:\n", + " \"\"\"Source: https://github.com/hendrycks/math\n", + " Reformat square roots.\n", + " Example:\n", + " >>> _fix_sqrt(\"\\\\sqrt3\")\n", + " \\sqrt{3}\n", + " \"\"\"\n", + " if \"\\\\sqrt\" not in string:\n", + " return string\n", + " splits = string.split(\"\\\\sqrt\")\n", + " new_string = splits[0]\n", + " for split in splits[1:]:\n", + " if split[0] != \"{\":\n", + " a = split[0]\n", + " new_substr = \"\\\\sqrt{\" + a + \"}\" + split[1:]\n", + " else:\n", + " new_substr = \"\\\\sqrt\" + split\n", + " new_string += new_substr\n", + " return new_string\n", + "\n", + "\n", + "def _strip_string(string: str) -> str:\n", + " \"\"\"Source: https://github.com/hendrycks/math\n", + " Apply the reformatting helper functions above.\n", + " \"\"\"\n", + " # linebreaks\n", + " string = string.replace(\"\\n\", \"\")\n", + " # print(string)\n", + "\n", + " # remove inverse spaces\n", + " string = string.replace(\"\\\\!\", \"\")\n", + " # print(string)\n", + "\n", + " # replace \\\\ with \\\n", + " string = string.replace(\"\\\\\\\\\", \"\\\\\")\n", + " # print(string)\n", + "\n", + " # replace tfrac and dfrac with frac\n", + " string = string.replace(\"tfrac\", \"frac\")\n", + " string = string.replace(\"dfrac\", \"frac\")\n", + " # print(string)\n", + "\n", + " # remove \\left and \\right\n", + " string = string.replace(\"\\\\left\", \"\")\n", + " string = string.replace(\"\\\\right\", \"\")\n", + " # print(string)\n", + "\n", + " # Remove circ (degrees)\n", + " string = string.replace(\"^{\\\\circ}\", \"\")\n", + " string = string.replace(\"^\\\\circ\", \"\")\n", + "\n", + " # remove dollar signs\n", + " string = string.replace(\"\\\\$\", \"\")\n", + "\n", + " # remove units (on the right)\n", + " string = _remove_right_units(string)\n", + "\n", + " # remove percentage\n", + " string = string.replace(\"\\\\%\", \"\")\n", + " string = string.replace(\"\\%\", \"\")\n", + "\n", + " # \" 0.\" equivalent to \" .\" and \"{0.\" equivalent to \"{.\" Alternatively, add \"0\" if \".\" is the start of the string\n", + " string = string.replace(\" .\", \" 0.\")\n", + " string = string.replace(\"{.\", \"{0.\")\n", + " # if empty, return empty string\n", + " if len(string) == 0:\n", + " return string\n", + " if string[0] == \".\":\n", + " string = \"0\" + string\n", + "\n", + " # to consider: get rid of e.g. 
\"k = \" or \"q = \" at beginning\n", + " if len(string.split(\"=\")) == 2:\n", + " if len(string.split(\"=\")[0]) <= 2:\n", + " string = string.split(\"=\")[1]\n", + "\n", + " # fix sqrt3 --> sqrt{3}\n", + " string = _fix_sqrt(string)\n", + "\n", + " # remove spaces\n", + " string = string.replace(\" \", \"\")\n", + "\n", + " # \\frac1b or \\frac12 --> \\frac{1}{b} and \\frac{1}{2}, etc.\n", + " # Even works with \\frac1{72} (but not \\frac{72}1).\n", + " # Also does a/b --> \\\\frac{a}{b}\n", + " string = _fix_fracs(string)\n", + "\n", + " # manually change 0.5 --> \\frac{1}{2}\n", + " if string == \"0.5\":\n", + " string = \"\\\\frac{1}{2}\"\n", + "\n", + " # NOTE: X/Y changed to \\frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y\n", + " string = _fix_a_slash_b(string)\n", + "\n", + " return string\n", + "\n", + "\n", + "def get_answer(solution: Optional[str]) -> Optional[str]:\n", + " if solution is None:\n", + " return None\n", + " last_boxed = last_boxed_only_string(solution)\n", + " if last_boxed is None:\n", + " return None\n", + " answer = remove_boxed(last_boxed)\n", + " if answer is None:\n", + " return None\n", + " return answer\n", + "\n", + "\n", + "def is_equiv(str1: Optional[str], str2: Optional[str]) -> float:\n", + " \"\"\"Returns (as a float) whether two strings containing math are equivalent up to differences of formatting in\n", + " - units\n", + " - fractions\n", + " - square roots\n", + " - superfluous LaTeX.\n", + " Source: https://github.com/hendrycks/math\n", + " \"\"\"\n", + " if str1 is None and str2 is None:\n", + " print(\"WARNING: Both None\")\n", + " return 1.0\n", + " if str1 is None or str2 is None:\n", + " return 0.0\n", + "\n", + " try:\n", + " ss1 = _strip_string(str1)\n", + " ss2 = _strip_string(str2)\n", + " return float(ss1 == ss2)\n", + " except Exception:\n", + " return float(str1 == str2)\n", + "\n", + "\n", + "def is_equiv_chain_of_thought(str1: str, str2: str) -> float:\n", + " \"\"\"Strips the solution first before calling `is_equiv`.\"\"\"\n", + " ans1 = get_answer(str1)\n", + " ans2 = get_answer(str2)\n", + "\n", + " return is_equiv(ans1, ans2)\n", + "\n", + "\n", + "def success_metrics(responses, solution, **args):\n", + " \"\"\"Check if each response is correct.\n", + " \n", + " Args:\n", + " responses (list): The list of responses.\n", + " solution (str): The canonical solution.\n", + " \n", + " Returns:\n", + " dict: The success metrics.\n", + " \"\"\"\n", + " success_list = []\n", + " n = len(responses)\n", + " for i in range(n):\n", + " response = responses[i]\n", + " succeed = is_equiv_chain_of_thought(response, solution)\n", + " success_list.append(succeed)\n", + " return {\n", + " \"expected_success\": 1 - pow(1 - sum(success_list) / n, n),\n", + " \"success\": any(s for s in success_list),\n", + " }\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Use the tuning data to find a good configuration\n", + "\n", + "### Import the oai and tune subpackages from flaml.\n", + "\n", + "FLAML has provided an API for hyperparameter optimization of OpenAI ChatGPT models: `oai.ChatCompletion.tune` and to make a request with the tuned config: `oai.ChatCompletion.create`. 
First, we import oai from flaml:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:54.634335Z", + "iopub.status.busy": "2023-02-13T23:40:54.633929Z", + "iopub.status.idle": "2023-02-13T23:40:56.105700Z", + "shell.execute_reply": "2023-02-13T23:40:56.105085Z" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "from flaml import oai" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For (local) reproducibility and cost efficiency, we cache responses from OpenAI." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:56.109177Z", + "iopub.status.busy": "2023-02-13T23:40:56.108624Z", + "iopub.status.idle": "2023-02-13T23:40:56.112651Z", + "shell.execute_reply": "2023-02-13T23:40:56.112076Z" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "oai.ChatCompletion.set_cache(seed)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This will create a disk cache in \".cache/{seed}\". You can change `cache_path` in `set_cache()`. The caches for different seeds are stored separately.\n", + "\n", + "### Perform tuning\n", + "\n", + "The tuning is performed under the specified optimization budgets and may take a while to finish, depending on those budgets.\n", + "\n", + "* `inference_budget` is the target average inference budget per instance in the benchmark. For example, 0.002 means the target inference budget is 0.002 dollars, which translates to 1000 tokens (input + output combined) if the gpt-3.5-turbo model is used.\n", + "* `optimization_budget` is the total budget allowed to perform the tuning. For example, 0.5 means 0.5 dollars are allowed in total, which translates to 250K tokens for the gpt-3.5-turbo model.\n", + "* `num_samples` is the number of different hyperparameter configurations to try. The tuning will stop after either num_samples trials or after optimization_budget dollars are spent, whichever happens first. -1 means no hard restriction on the number of trials; the actual number is decided by `optimization_budget`.\n", + "\n", + "Users can specify tuning data, the optimization metric, the optimization mode, the evaluation function, search spaces, etc. The default search space is:\n", + "\n", + "```python\n", + "price1K = {\n", + " \"gpt-3.5-turbo\": 0.002,\n", + "}\n", + "\n", + "default_search_space = {\n", + " \"model\": tune.choice(list(price1K.keys())),\n", + " \"temperature_or_top_p\": tune.choice(\n", + " [\n", + " {\"temperature\": tune.uniform(0, 1)},\n", + " {\"top_p\": tune.uniform(0, 1)},\n", + " ]\n", + " ),\n", + " \"max_tokens\": tune.lograndint(50, 1000),\n", + " \"n\": tune.randint(1, 100),\n", + " \"prompt\": \"{prompt}\",\n", + "}\n", + "```\n", + "\n", + "The default search space can be overridden by users' input: any hyperparameter passed as a keyword argument to `tune` replaces the corresponding entry of the default search space, as sketched below.\n",
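+ "As a minimal sketch of such an override (an illustration only; this snippet is not executed in the notebook, and the budget values are arbitrary), one could fix `n` to a constant and search `temperature` alone:\n", + "\n", + "```python\n", + "from flaml import oai, tune\n", + "\n", + "# Any keyword below overrides the corresponding entry of the default search space:\n", + "# a constant fixes that hyperparameter; a tune sampler redefines its range.\n", + "config, analysis = oai.ChatCompletion.tune(\n", + "    data=tune_data,  # the tuning data prepared earlier\n", + "    metric=\"expected_success\",\n", + "    mode=\"max\",\n", + "    eval_func=success_metrics,\n", + "    inference_budget=0.002,\n", + "    optimization_budget=0.1,  # small budget, for illustration\n", + "    num_samples=-1,\n", + "    n=1,  # fixed instead of searched\n", + "    temperature_or_top_p={\"temperature\": tune.uniform(0, 1)},  # search temperature only\n", + "    prompt=prompts,  # the prompt templates prepared earlier\n", + ")\n", + "```\n", + "\n", + "The following code then specifies the prompt templates to choose from and a stop sequence. For hyperparameters which don't appear in users' input, the default search space will be used."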
+ ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:56.115383Z", + "iopub.status.busy": "2023-02-13T23:40:56.114975Z", + "iopub.status.idle": "2023-02-13T23:41:55.045654Z", + "shell.execute_reply": "2023-02-13T23:41:55.044973Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[32m[I 2023-03-05 05:01:24,381]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[flaml.tune.tune: 03-05 05:01:24] {811} INFO - trial 1 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.36280922847807595}, 'max_tokens': 347, 'n': 10, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:01:24] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.011049999999999999, 'cost': 0.011049999999999999, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.36280922847807595}, 'max_tokens': 347, 'n': 10, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'top_p': 0.36280922847807595}, 'config/max_tokens': 347, 'config/n': 10, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0027980804443359375}\n", + "[flaml.tune.tune: 03-05 05:01:24] {811} INFO - trial 2 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.6336482349262754}, 'max_tokens': 470, 'n': 50, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:01:24] {215} INFO - result: {'inference_cost': inf, 'expected_success': -inf, 'cost': 0, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.6336482349262754}, 'max_tokens': 470, 'n': 50, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.6336482349262754}, 'config/max_tokens': 470, 'config/n': 50, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0004801750183105469}\n", + "[flaml.tune.tune: 03-05 05:01:24] {811} INFO - trial 3 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.7605307121989587}, 'max_tokens': 82, 'n': 9, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:01:24] {215} INFO - result: {'expected_success': 0.5308234838865221, 'success': 0.6, 'total_cost': 0.043492, 'cost': 0.032442, 'inference_cost': 0.0016220999999999998, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.7605307121989587}, 'max_tokens': 82, 'n': 9, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.7605307121989587}, 'config/max_tokens': 82, 'config/n': 9, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0066220760345458984}\n", + "[flaml.tune.tune: 03-05 05:01:24] {811} INFO - trial 4 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.003948266327914451}, 'max_tokens': 231, 'n': 81, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:01:24] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.049, 'cost': 0.005508, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.003948266327914451}, 'max_tokens': 231, 'n': 81, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'top_p': 0.003948266327914451}, 'config/max_tokens': 231, 
'config/n': 81, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0020475387573242188}\n", + "[flaml.tune.tune: 03-05 05:01:24] {811} INFO - trial 5 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.29187606817063316}, 'max_tokens': 781, 'n': 71, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:01:24] {215} INFO - result: {'inference_cost': inf, 'expected_success': -inf, 'cost': 0, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.29187606817063316}, 'max_tokens': 781, 'n': 71, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'top_p': 0.29187606817063316}, 'config/max_tokens': 781, 'config/n': 71, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0005230903625488281}\n", + "[flaml.tune.tune: 03-05 05:01:24] {811} INFO - trial 6 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.3733407600514692}, 'max_tokens': 375, 'n': 44, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:01:24] {215} INFO - result: {'inference_cost': inf, 'expected_success': -inf, 'cost': 0, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.3733407600514692}, 'max_tokens': 375, 'n': 44, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.3733407600514692}, 'config/max_tokens': 375, 'config/n': 44, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.000446319580078125}\n", + "[flaml.tune.tune: 03-05 05:01:24] {811} INFO - trial 7 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.5131382425543909}, 'max_tokens': 350, 'n': 60, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:01:24] {215} INFO - result: {'inference_cost': inf, 'expected_success': -inf, 'cost': 0, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.5131382425543909}, 'max_tokens': 350, 'n': 60, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'top_p': 0.5131382425543909}, 'config/max_tokens': 350, 'config/n': 60, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.00055694580078125}\n", + "[flaml.tune.tune: 03-05 05:01:24] {811} INFO - trial 8 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9086488808086682}, 'max_tokens': 129, 'n': 9, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:01:24] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.08172600000000001, 'cost': 0.032726000000000005, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9086488808086682}, 'max_tokens': 129, 'n': 9, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.9086488808086682}, 'config/max_tokens': 129, 'config/n': 9, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.004898548126220703}\n", + "[flaml.tune.tune: 03-05 05:01:24] {811} INFO - trial 9 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.8286813263076767}, 'max_tokens': 57, 'n': 63, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:01:24] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.09077800000000001, 'cost': 0.009052000000000001, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 
'temperature_or_top_p': {'temperature': 0.8286813263076767}, 'max_tokens': 57, 'n': 63, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.8286813263076767}, 'config/max_tokens': 57, 'config/n': 63, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0021355152130126953}\n", + "[flaml.tune.tune: 03-05 05:01:24] {811} INFO - trial 10 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.1989475396788123}, 'max_tokens': 650, 'n': 35, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:01:24] {215} INFO - result: {'inference_cost': inf, 'expected_success': -inf, 'cost': 0, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.1989475396788123}, 'max_tokens': 650, 'n': 35, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'top_p': 0.1989475396788123}, 'config/max_tokens': 650, 'config/n': 35, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0006568431854248047}\n", + "[flaml.tune.tune: 03-05 05:01:24] {811} INFO - trial 11 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.8839364795611863}, 'max_tokens': 132, 'n': 17, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:01:24] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.09582600000000001, 'cost': 0.005048, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.8839364795611863}, 'max_tokens': 132, 'n': 17, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.8839364795611863}, 'config/max_tokens': 132, 'config/n': 17, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.009762048721313477}\n", + "[flaml.tune.tune: 03-05 05:01:24] {811} INFO - trial 12 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.8211056578369285}, 'max_tokens': 78, 'n': 39, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:01:24] {215} INFO - result: {'inference_cost': inf, 'expected_success': -inf, 'cost': 0, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.8211056578369285}, 'max_tokens': 78, 'n': 39, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.8211056578369285}, 'config/max_tokens': 78, 'config/n': 39, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.007121086120605469}\n", + "[flaml.tune.tune: 03-05 05:01:24] {811} INFO - trial 13 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.0422875090290305}, 'max_tokens': 56, 'n': 3, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:01:35] {215} INFO - result: {'expected_success': 0.15, 'success': 0.15, 'total_cost': 0.10778599999999998, 'cost': 0.011960000000000002, 'inference_cost': 0.000598, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.0422875090290305}, 'max_tokens': 56, 'n': 3, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.0422875090290305}, 'config/max_tokens': 56, 'config/n': 3, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 10.761135816574097}\n", + "[flaml.tune.tune: 03-05 05:01:35] {811} INFO - trial 14 config: {'model': 'gpt-3.5-turbo', 
'temperature_or_top_p': {'temperature': 0.11030610637969397}, 'max_tokens': 52, 'n': 3, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:01:52] {215} INFO - result: {'expected_success': 0.1, 'success': 0.1, 'total_cost': 0.11931399999999996, 'cost': 0.011528, 'inference_cost': 0.0005764, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.11030610637969397}, 'max_tokens': 52, 'n': 3, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.11030610637969397}, 'config/max_tokens': 52, 'config/n': 3, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 17.322299242019653}\n", + "[flaml.tune.tune: 03-05 05:01:52] {811} INFO - trial 15 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.5632321190691856}, 'max_tokens': 89, 'n': 22, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:01:52] {215} INFO - result: {'inference_cost': inf, 'expected_success': -inf, 'cost': 0, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.5632321190691856}, 'max_tokens': 89, 'n': 22, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.5632321190691856}, 'config/max_tokens': 89, 'config/n': 22, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0008306503295898438}\n", + "[flaml.tune.tune: 03-05 05:01:52] {811} INFO - trial 16 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.04561271084264061}, 'max_tokens': 51, 'n': 98, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:01:54] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.12412799999999996, 'cost': 0.004814, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.04561271084264061}, 'max_tokens': 51, 'n': 98, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.04561271084264061}, 'config/max_tokens': 51, 'config/n': 98, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 1.575875997543335}\n", + "[flaml.tune.tune: 03-05 05:01:54] {811} INFO - trial 17 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.5087240651577944}, 'max_tokens': 95, 'n': 1, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:02:20] {215} INFO - result: {'expected_success': 0.3, 'success': 0.3, 'total_cost': 0.13279399999999997, 'cost': 0.008666, 'inference_cost': 0.0004333, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.5087240651577944}, 'max_tokens': 95, 'n': 1, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.5087240651577944}, 'config/max_tokens': 95, 'config/n': 1, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 26.14193034172058}\n", + "[flaml.tune.tune: 03-05 05:02:20] {811} INFO - trial 18 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.6040740802039921}, 'max_tokens': 129, 'n': 25, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:02:20] {215} INFO - result: {'inference_cost': inf, 'expected_success': -inf, 'cost': 0, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.6040740802039921}, 'max_tokens': 129, 'n': 25, 'prompt': 0, 'stop': 0}, 
'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.6040740802039921}, 'config/max_tokens': 129, 'config/n': 25, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0008137226104736328}\n", + "[flaml.tune.tune: 03-05 05:02:20] {811} INFO - trial 19 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.3754115138138923}, 'max_tokens': 86, 'n': 12, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:02:33] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.149274, 'cost': 0.01648, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.3754115138138923}, 'max_tokens': 86, 'n': 12, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.3754115138138923}, 'config/max_tokens': 86, 'config/n': 12, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 13.519219398498535}\n", + "[flaml.tune.tune: 03-05 05:02:33] {811} INFO - trial 20 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.6887263877538047}, 'max_tokens': 173, 'n': 28, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:02:33] {215} INFO - result: {'inference_cost': inf, 'expected_success': -inf, 'cost': 0, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.6887263877538047}, 'max_tokens': 173, 'n': 28, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.6887263877538047}, 'config/max_tokens': 173, 'config/n': 28, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0005598068237304688}\n", + "[flaml.tune.tune: 03-05 05:02:33] {811} INFO - trial 21 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.40706161658517775}, 'max_tokens': 217, 'n': 5, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:03:20] {215} INFO - result: {'expected_success': 0.739152, 'success': 0.8, 'total_cost': 0.17876000000000006, 'cost': 0.029486000000000002, 'inference_cost': 0.0014743, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.40706161658517775}, 'max_tokens': 217, 'n': 5, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.40706161658517775}, 'config/max_tokens': 217, 'config/n': 5, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 47.16692495346069}\n", + "[flaml.tune.tune: 03-05 05:03:20] {811} INFO - trial 22 config: {'model': 'gpt-3.5-turbo', 'max_tokens': 174, 'n': 2, 'prompt': 0, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.27048488009754645}}\n", + "[flaml.tune.tune: 03-05 05:04:01] {215} INFO - result: {'expected_success': 0.5125, 'success': 0.55, 'total_cost': 0.19355200000000006, 'cost': 0.014792000000000003, 'inference_cost': 0.0007396000000000001, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'max_tokens': 174, 'n': 2, 'prompt': 0, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.27048488009754645}}, 'config/model': 'gpt-3.5-turbo', 'config/max_tokens': 174, 'config/n': 2, 'config/prompt': 0, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.27048488009754645}, 'experiment_tag': 'exp', 'time_total_s': 40.51927351951599}\n", + "[flaml.tune.tune: 03-05 05:04:01] {811} INFO - trial 23 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': 
{'temperature': 0.3413175996734835}, 'max_tokens': 275, 'n': 52, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:04:01] {215} INFO - result: {'inference_cost': inf, 'expected_success': -inf, 'cost': 0, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.3413175996734835}, 'max_tokens': 275, 'n': 52, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.3413175996734835}, 'config/max_tokens': 275, 'config/n': 52, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0007867813110351562}\n", + "[flaml.tune.tune: 03-05 05:04:01] {811} INFO - trial 24 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.2645495244555195}, 'max_tokens': 499, 'n': 12, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:04:01] {215} INFO - result: {'inference_cost': inf, 'expected_success': -inf, 'cost': 0, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.2645495244555195}, 'max_tokens': 499, 'n': 12, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.2645495244555195}, 'config/max_tokens': 499, 'config/n': 12, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0006549358367919922}\n", + "[flaml.tune.tune: 03-05 05:04:01] {811} INFO - trial 25 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.48492162197022287}, 'max_tokens': 174, 'n': 2, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:04:40] {215} INFO - result: {'expected_success': 0.55, 'success': 0.6, 'total_cost': 0.2079620000000001, 'cost': 0.01441, 'inference_cost': 0.0007205, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.48492162197022287}, 'max_tokens': 174, 'n': 2, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.48492162197022287}, 'config/max_tokens': 174, 'config/n': 2, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 38.88523626327515}\n", + "[flaml.tune.tune: 03-05 05:04:40] {811} INFO - trial 26 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.7008948011018361}, 'max_tokens': 188, 'n': 2, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:05:20] {215} INFO - result: {'expected_success': 0.6375, 'success': 0.65, 'total_cost': 0.22241600000000009, 'cost': 0.014454, 'inference_cost': 0.0007227000000000001, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.7008948011018361}, 'max_tokens': 188, 'n': 2, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.7008948011018361}, 'config/max_tokens': 188, 'config/n': 2, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 40.07520294189453}\n", + "[flaml.tune.tune: 03-05 05:05:20] {811} INFO - trial 27 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.45563880608336627}, 'max_tokens': 181, 'n': 1, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:05:54] {215} INFO - result: {'expected_success': 0.55, 'success': 0.55, 'total_cost': 0.23225200000000013, 'cost': 0.009836000000000001, 'inference_cost': 0.0004918, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 
0.45563880608336627}, 'max_tokens': 181, 'n': 1, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.45563880608336627}, 'config/max_tokens': 181, 'config/n': 1, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 34.365720987319946}\n", + "[flaml.tune.tune: 03-05 05:05:54] {811} INFO - trial 28 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.21155867162942757}, 'max_tokens': 183, 'n': 17, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:05:57] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.23748400000000014, 'cost': 0.005232, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.21155867162942757}, 'max_tokens': 183, 'n': 17, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.21155867162942757}, 'config/max_tokens': 183, 'config/n': 17, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 2.9915997982025146}\n", + "[flaml.tune.tune: 03-05 05:05:57] {811} INFO - trial 29 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.652909170066013}, 'max_tokens': 285, 'n': 31, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:05:57] {215} INFO - result: {'inference_cost': inf, 'expected_success': -inf, 'cost': 0, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.652909170066013}, 'max_tokens': 285, 'n': 31, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.652909170066013}, 'config/max_tokens': 285, 'config/n': 31, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0005283355712890625}\n", + "[flaml.tune.tune: 03-05 05:05:57] {811} INFO - trial 30 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.9990495004030453}, 'max_tokens': 219, 'n': 18, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:06:02] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.24319000000000013, 'cost': 0.005706, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.9990495004030453}, 'max_tokens': 219, 'n': 18, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'top_p': 0.9990495004030453}, 'config/max_tokens': 219, 'config/n': 18, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 5.099469184875488}\n", + "[flaml.tune.tune: 03-05 05:06:02] {811} INFO - trial 31 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.4467837016610728}, 'max_tokens': 404, 'n': 1, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:06:50] {215} INFO - result: {'expected_success': 0.6, 'success': 0.6, 'total_cost': 0.25467800000000024, 'cost': 0.011488, 'inference_cost': 0.0005744, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.4467837016610728}, 'max_tokens': 404, 'n': 1, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.4467837016610728}, 'config/max_tokens': 404, 'config/n': 1, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 47.18360900878906}\n", + "[flaml.tune.tune: 03-05 05:06:50] {811} INFO - trial 32 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 
0.7150017857658078}, 'max_tokens': 469, 'n': 9, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:06:50] {215} INFO - result: {'inference_cost': inf, 'expected_success': -inf, 'cost': 0, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.7150017857658078}, 'max_tokens': 469, 'n': 9, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.7150017857658078}, 'config/max_tokens': 469, 'config/n': 9, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.000614166259765625}\n", + "[flaml.tune.tune: 03-05 05:06:50] {811} INFO - trial 33 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.2594708806296415}, 'max_tokens': 352, 'n': 7, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:07:35] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.29123200000000016, 'cost': 0.036554, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.2594708806296415}, 'max_tokens': 352, 'n': 7, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.2594708806296415}, 'config/max_tokens': 352, 'config/n': 7, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 45.43464660644531}\n", + "[flaml.tune.tune: 03-05 05:07:35] {811} INFO - trial 34 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.5691158455115929}, 'max_tokens': 520, 'n': 22, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:07:35] {215} INFO - result: {'inference_cost': inf, 'expected_success': -inf, 'cost': 0, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.5691158455115929}, 'max_tokens': 520, 'n': 22, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.5691158455115929}, 'config/max_tokens': 520, 'config/n': 22, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0005013942718505859}\n", + "[flaml.tune.tune: 03-05 05:07:35] {811} INFO - trial 35 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.4357505186889488}, 'max_tokens': 153, 'n': 1, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:08:11] {215} INFO - result: {'expected_success': 0.6, 'success': 0.6, 'total_cost': 0.3012180000000001, 'cost': 0.009986, 'inference_cost': 0.0004993, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.4357505186889488}, 'max_tokens': 153, 'n': 1, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.4357505186889488}, 'config/max_tokens': 153, 'config/n': 1, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 36.294803857803345}\n", + "[flaml.tune.tune: 03-05 05:08:11] {811} INFO - trial 36 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.43174456068612144}, 'max_tokens': 244, 'n': 1, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:08:50] {215} INFO - result: {'expected_success': 0.45, 'success': 0.45, 'total_cost': 0.3115360000000001, 'cost': 0.010318, 'inference_cost': 0.0005159, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.43174456068612144}, 'max_tokens': 244, 'n': 1, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 
'config/temperature_or_top_p': {'temperature': 0.43174456068612144}, 'config/max_tokens': 244, 'config/n': 1, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 38.782007455825806}\n", + "[flaml.tune.tune: 03-05 05:08:50] {811} INFO - trial 37 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.31174598735063297}, 'max_tokens': 152, 'n': 93, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:08:50] {215} INFO - result: {'inference_cost': inf, 'expected_success': -inf, 'cost': 0, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.31174598735063297}, 'max_tokens': 152, 'n': 93, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.31174598735063297}, 'config/max_tokens': 152, 'config/n': 93, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.000728607177734375}\n", + "[flaml.tune.tune: 03-05 05:08:50] {811} INFO - trial 38 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.9998765149838305}, 'max_tokens': 968, 'n': 13, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:08:50] {215} INFO - result: {'inference_cost': inf, 'expected_success': -inf, 'cost': 0, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.9998765149838305}, 'max_tokens': 968, 'n': 13, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'top_p': 0.9998765149838305}, 'config/max_tokens': 968, 'config/n': 13, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0006527900695800781}\n", + "[flaml.tune.tune: 03-05 05:08:50] {811} INFO - trial 39 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.4077967938262427}, 'max_tokens': 208, 'n': 6, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:09:37] {215} INFO - result: {'expected_success': 0.8148458933470506, 'success': 0.85, 'total_cost': 0.344804, 'cost': 0.03326799999999999, 'inference_cost': 0.0016634000000000002, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.4077967938262427}, 'max_tokens': 208, 'n': 6, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.4077967938262427}, 'config/max_tokens': 208, 'config/n': 6, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 46.54340124130249}\n", + "[flaml.tune.tune: 03-05 05:09:37] {811} INFO - trial 40 config: {'model': 'gpt-3.5-turbo', 'max_tokens': 340, 'n': 1, 'prompt': 0, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.4404342494313882}}\n", + "[flaml.tune.tune: 03-05 05:10:23] {215} INFO - result: {'expected_success': 0.75, 'success': 0.75, 'total_cost': 0.356122, 'cost': 0.011318000000000002, 'inference_cost': 0.0005658999999999999, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'max_tokens': 340, 'n': 1, 'prompt': 0, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.4404342494313882}}, 'config/model': 'gpt-3.5-turbo', 'config/max_tokens': 340, 'config/n': 1, 'config/prompt': 0, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.4404342494313882}, 'experiment_tag': 'exp', 'time_total_s': 45.89974808692932}\n", + "[flaml.tune.tune: 03-05 05:10:23] {811} INFO - trial 41 config: {'model': 'gpt-3.5-turbo', 'max_tokens': 127, 'n': 16, 'prompt': 0, 'stop': 0, 'temperature_or_top_p': 
{'temperature': 0.37515933822109715}}\n", + "[flaml.tune.tune: 03-05 05:10:26] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.361062, 'cost': 0.00494, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'max_tokens': 127, 'n': 16, 'prompt': 0, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.37515933822109715}}, 'config/model': 'gpt-3.5-turbo', 'config/max_tokens': 127, 'config/n': 16, 'config/prompt': 0, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.37515933822109715}, 'experiment_tag': 'exp', 'time_total_s': 3.5503623485565186}\n", + "[flaml.tune.tune: 03-05 05:10:26] {811} INFO - trial 42 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.996156173020253}, 'max_tokens': 107, 'n': 7, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:11:06] {215} INFO - result: {'expected_success': 0.646968646445905, 'success': 0.7, 'total_cost': 0.39229600000000003, 'cost': 0.031234, 'inference_cost': 0.0015617, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.996156173020253}, 'max_tokens': 107, 'n': 7, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.996156173020253}, 'config/max_tokens': 107, 'config/n': 7, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 40.09834337234497}\n", + "[flaml.tune.tune: 03-05 05:11:06] {811} INFO - trial 43 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.712309746815617}, 'max_tokens': 112, 'n': 77, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:11:06] {215} INFO - result: {'inference_cost': inf, 'expected_success': -inf, 'cost': 0, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.712309746815617}, 'max_tokens': 112, 'n': 77, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'top_p': 0.712309746815617}, 'config/max_tokens': 112, 'config/n': 77, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0007219314575195312}\n", + "[flaml.tune.tune: 03-05 05:11:06] {811} INFO - trial 44 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.7694213309158455}, 'max_tokens': 226, 'n': 8, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:11:55] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.42729200000000006, 'cost': 0.034996, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.7694213309158455}, 'max_tokens': 226, 'n': 8, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.7694213309158455}, 'config/max_tokens': 226, 'config/n': 8, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 48.949331283569336}\n", + "[flaml.tune.tune: 03-05 05:11:55] {811} INFO - trial 45 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9557646172390091}, 'max_tokens': 293, 'n': 45, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:11:55] {215} INFO - result: {'inference_cost': inf, 'expected_success': -inf, 'cost': 0, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9557646172390091}, 'max_tokens': 293, 'n': 45, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.9557646172390091}, 'config/max_tokens': 293, 'config/n': 
45, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0007379055023193359}\n", + "[flaml.tune.tune: 03-05 05:11:55] {811} INFO - trial 46 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9767564075397783}, 'max_tokens': 65, 'n': 16, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:12:03] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.436042, 'cost': 0.008749999999999999, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9767564075397783}, 'max_tokens': 65, 'n': 16, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.9767564075397783}, 'config/max_tokens': 65, 'config/n': 16, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 8.102897882461548}\n", + "[flaml.tune.tune: 03-05 05:12:03] {811} INFO - trial 47 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.3783227519390696}, 'max_tokens': 111, 'n': 6, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:12:39] {215} INFO - result: {'expected_success': 0.5908468364197531, 'success': 0.65, 'total_cost': 0.46333, 'cost': 0.027288, 'inference_cost': 0.0013644, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.3783227519390696}, 'max_tokens': 111, 'n': 6, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.3783227519390696}, 'config/max_tokens': 111, 'config/n': 6, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 35.84658098220825}\n", + "[flaml.tune.tune: 03-05 05:12:39] {811} INFO - trial 48 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.5239740220006481}, 'max_tokens': 150, 'n': 10, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:12:49] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.47180400000000006, 'cost': 0.008474, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.5239740220006481}, 'max_tokens': 150, 'n': 10, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.5239740220006481}, 'config/max_tokens': 150, 'config/n': 10, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 9.35022783279419}\n", + "[flaml.tune.tune: 03-05 05:12:49] {811} INFO - trial 49 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.4090242730676276}, 'max_tokens': 198, 'n': 6, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-05 05:13:30] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.500916, 'cost': 0.029112000000000002, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.4090242730676276}, 'max_tokens': 198, 'n': 6, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.4090242730676276}, 'config/max_tokens': 198, 'config/n': 6, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 40.903329372406006}\n", + "[flaml.tune.tune: 03-05 05:13:30] {834} WARNING - fail to sample a trial for 100 times in a row, stopping.\n" + ] + } + ], + "source": [ + "import logging\n", + "\n", + "config, analysis = oai.ChatCompletion.tune(\n", + " data=tune_data, # the data for tuning\n", + " metric=\"expected_success\", # the metric to optimize\n", + " 
mode=\"max\", # the optimization mode\n", + " eval_func=success_metrics, # the evaluation function to return the success metrics\n", + " # log_file_name=\"logs/math.log\", # the log file name\n", + " inference_budget=0.002, # the inference budget (dollar)\n", + " optimization_budget=0.5, # the optimization budget (dollar)\n", + " # num_samples can further limit the number of trials for different hyperparameter configurations;\n", + " # -1 means decided by the optimization budget only\n", + " num_samples=-1,\n", + " prompt=prompts, # the prompt templates to choose from\n", + " stop=\"###\", # the stop sequence\n", + " logging_level=logging.INFO, # the logging level\n", + ")\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Output tuning results\n", + "\n", + "After the tuning, we can print out the config and the result found by FLAML:" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:41:55.049204Z", + "iopub.status.busy": "2023-02-13T23:41:55.048871Z", + "iopub.status.idle": "2023-02-13T23:41:55.053284Z", + "shell.execute_reply": "2023-02-13T23:41:55.052574Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "optimized config {'model': 'gpt-3.5-turbo', 'max_tokens': 208, 'n': 6, 'prompt': at 0x7f80e405b430>, 'stop': '###', 'temperature': 0.4077967938262427}\n", + "best result on tuning data {'expected_success': 0.8148458933470506, 'success': 0.85, 'total_cost': 0.344804, 'cost': 0.03326799999999999, 'inference_cost': 0.0016634000000000002, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.4077967938262427}, 'max_tokens': 208, 'n': 6, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.4077967938262427}, 'config/max_tokens': 208, 'config/n': 6, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 46.54340124130249}\n" + ] + } + ], + "source": [ + "print(\"optimized config\", config)\n", + "print(\"best result on tuning data\", analysis.best_result)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### Make a request with the tuned config\n", + "\n", + "We can apply the tuned config on the request for an example task:" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:41:55.056205Z", + "iopub.status.busy": "2023-02-13T23:41:55.055631Z", + "iopub.status.idle": "2023-02-13T23:41:56.039259Z", + "shell.execute_reply": "2023-02-13T23:41:56.038427Z" + }, + "slideshow": { + "slide_type": "subslide" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"choices\": [\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 0,\n", + " \"message\": {\n", + " \"content\": \"\\n\\nAnswer: Using the logarithmic identity $\\\\log_{a}(b\\\\cdot c)=\\\\log_{a}(b)+\\\\log_{a}(c)$, we can simplify the expression as follows: $$\\\\log_{10} 40 +\\\\log_{10} 25=\\\\log_{10}(40\\\\cdot 25)=\\\\log_{10}(1000)=\\\\boxed{3}.$$\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": null,\n", + " \"index\": 1,\n", + " \"message\": {\n", + " \"content\": \"\\n\\nAnswer: Using the logarithmic property $\\\\log_a b + \\\\log_a c = 
\\\\log_a (bc)$, we can combine the two logarithms to get $\\\\log_{10} 40 \\\\cdot 25$. Simplifying, we get $\\\\log_{10} 1000$. Since $10^3 = 1000$, we have $\\\\log_{10} 1000 = \\\\boxed{3}$.\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 2,\n", + " \"message\": {\n", + " \"content\": \"\\n\\nAnswer: Using the logarithmic property $\\\\log_a b + \\\\log_a c = \\\\log_a (bc)$, we can simplify the expression as follows: $$\\\\log_{10} 40 + \\\\log_{10} 25 = \\\\log_{10} (40 \\\\cdot 25) = \\\\log_{10} 1000$$ Since $1000$ is equal to $10^3$, we have $\\\\log_{10} 1000 = \\\\boxed{3}$.\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 3,\n", + " \"message\": {\n", + " \"content\": \"\\n\\nAnswer: Using the logarithmic identity $\\\\log_{a}(b\\\\cdot c) = \\\\log_{a}(b) + \\\\log_{a}(c)$, we can simplify the expression as follows:\\n\\n$$\\\\log_{10} 40 +\\\\log_{10} 25 = \\\\log_{10} (40\\\\cdot 25) = \\\\log_{10} 1000$$\\n\\nSince $1000 = 10^3$, we have $\\\\log_{10} 1000 = \\\\boxed{3}$.\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 4,\n", + " \"message\": {\n", + " \"content\": \"\\n\\nAnswer: Using the logarithmic property $\\\\log_{a}(b) + \\\\log_{a}(c) = \\\\log_{a}(bc)$, we can simplify the expression to $\\\\log_{10}(40 \\\\cdot 25)$. Multiplying $40$ and $25$ gives us $1000$. Therefore, the expression simplifies to $\\\\log_{10}1000$. Since $10^3=1000$, we have $\\\\log_{10}1000 = \\\\boxed{3}$.\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 5,\n", + " \"message\": {\n", + " \"content\": \"\\n\\nAnswer: Using the logarithmic identity $\\\\log_{a}(b) + \\\\log_{a}(c) = \\\\log_{a}(bc)$, we can simplify the expression to $\\\\log_{10}(40\\\\cdot25)$. Evaluating $40\\\\cdot25$ gives us $1000$, so our final answer is $\\\\log_{10}(1000) = \\\\boxed{3}$.\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " }\n", + " ],\n", + " \"created\": 1677992931,\n", + " \"id\": \"chatcmpl-6qau3onXVENQuWDXUttbTe3rJ27vH\",\n", + " \"model\": \"gpt-3.5-turbo-0301\",\n", + " \"object\": \"chat.completion\",\n", + " \"usage\": {\n", + " \"completion_tokens\": 575,\n", + " \"prompt_tokens\": 112,\n", + " \"total_tokens\": 687\n", + " }\n", + "}\n", + "{'expected_success': 1.0, 'success': True}\n" + ] + } + ], + "source": [ + "responses = oai.ChatCompletion.create(context=tune_data[1], **config)\n", + "print(responses)\n", + "print(success_metrics([response[\"message\"][\"content\"].rstrip() for response in responses[\"choices\"]], **tune_data[1]))\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Evaluate the success rate on the test data\n", + "\n", + "You can use flaml's `oai.ChatCompletion.eval` to evaluate the performance of the tuned config on an entire dataset. To do that, set `oai.ChatCompletion.data` to the data to evaluate. The following code will take a while to run, as it evaluates all 438 test instances.\n",
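+ "\n", + "Before the full run, a quick sanity check can preview the behavior and cost on a small slice (a sketch; it assumes `test_data` supports slicing, and it is not executed in this notebook):\n", + "\n", + "```python\n", + "# sketch: evaluate only the first 20 test instances to preview cost\n", + "oai.ChatCompletion.data = test_data[:20]\n", + "print(oai.ChatCompletion.eval(analysis.best_config, prune=False, eval_only=True))\n", + "```\n", + "\n", + "The cell below then evaluates the tuned config on the whole test set."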
+ ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:41:56.042764Z", + "iopub.status.busy": "2023-02-13T23:41:56.042086Z", + "iopub.status.idle": "2023-02-13T23:53:05.597643Z", + "shell.execute_reply": "2023-02-13T23:53:05.596603Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'expected_success': 0.7719714162844925, 'success': 0.8123569794050344, 'total_cost': 1.1100199999999998, 'cost': 0.6091040000000002, 'inference_cost': 0.001393830663615561}\n" + ] + } + ], + "source": [ + "oai.ChatCompletion.data = test_data\n", + "result = oai.ChatCompletion.eval(analysis.best_config, prune=False, eval_only=True)\n", + "print(result)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + }, + "vscode": { + "interpreter": { + "hash": "949777d72b0d2535278d3dc13498b2535136f6dfe0678499012e853ee9abcab1" + } + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": { + "2d910cfd2d2a4fc49fc30fbbdc5576a7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "454146d0f7224f038689031002906e6f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_e4ae2b6f5a974fd4bafb6abb9d12ff26", + "IPY_MODEL_577e1e3cc4db4942b0883577b3b52755", + "IPY_MODEL_b40bdfb1ac1d4cffb7cefcb870c64d45" + ], + "layout": "IPY_MODEL_dc83c7bff2f241309537a8119dfc7555", + "tabbable": null, + "tooltip": null + } + }, + "577e1e3cc4db4942b0883577b3b52755": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": 
"@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_2d910cfd2d2a4fc49fc30fbbdc5576a7", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_74a6ba0c3cbc4051be0a83e152fe1e62", + "tabbable": null, + "tooltip": null, + "value": 1 + } + }, + "6086462a12d54bafa59d3c4566f06cb2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "74a6ba0c3cbc4051be0a83e152fe1e62": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "7d3f3d9e15894d05a4d188ff4f466554": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "HTMLStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "StyleView", + "background": null, + "description_width": "", + "font_size": null, + "text_color": null + } + }, + "b40bdfb1ac1d4cffb7cefcb870c64d45": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "HTMLView", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_f1355871cc6f4dd4b50d9df5af20e5c8", + "placeholder": "​", + "style": "IPY_MODEL_ca245376fd9f4354af6b2befe4af4466", + "tabbable": null, + "tooltip": null, + "value": " 1/1 [00:00<00:00, 44.69it/s]" + } + }, + 
"ca245376fd9f4354af6b2befe4af4466": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "HTMLStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "StyleView", + "background": null, + "description_width": "", + "font_size": null, + "text_color": null + } + }, + "dc83c7bff2f241309537a8119dfc7555": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e4ae2b6f5a974fd4bafb6abb9d12ff26": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "HTMLView", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_6086462a12d54bafa59d3c4566f06cb2", + "placeholder": "​", + "style": "IPY_MODEL_7d3f3d9e15894d05a4d188ff4f466554", + "tabbable": null, + "tooltip": null, + "value": "100%" + } + }, + "f1355871cc6f4dd4b50d9df5af20e5c8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, 
+ "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + } + }, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebook/integrate_openai.ipynb b/notebook/integrate_openai.ipynb index 5d74ae552..8edf39635 100644 --- a/notebook/integrate_openai.ipynb +++ b/notebook/integrate_openai.ipynb @@ -458,6 +458,7 @@ " \"code-davinci-002\": 0.1,\n", " \"text-davinci-002\": 0.02,\n", " \"text-davinci-003\": 0.02,\n", + " \"gpt-3.5-turbo\": 0.002,\n", "}\n", "\n", "default_search_space = {\n", diff --git a/notebook/research/acl2021.ipynb b/notebook/research/acl2021.ipynb index 53cf9b247..9b099b1c9 100644 --- a/notebook/research/acl2021.ipynb +++ b/notebook/research/acl2021.ipynb @@ -804,5 +804,4 @@ }, "nbformat": 4, "nbformat_minor": 1 - } diff --git a/setup.py b/setup.py index 56bb955a8..c627a8f53 100644 --- a/setup.py +++ b/setup.py @@ -120,7 +120,7 @@ setuptools.setup( "pytorch-forecasting>=0.9.0", ], "benchmark": ["catboost>=0.26", "psutil==5.8.0", "xgboost==1.3.3"], - "openai": ["openai==0.23.1", "diskcache", "optuna==2.8.0"], + "openai": ["openai==0.27.0", "diskcache", "optuna==2.8.0"], "synapse": ["joblibspark>=0.5.0", "optuna==2.8.0", "pyspark>=3.0.0"], }, classifiers=[ diff --git a/test/openai/test_completion.py b/test/openai/test_completion.py index 3717faa00..191f8be1e 100644 --- a/test/openai/test_completion.py +++ b/test/openai/test_completion.py @@ -80,13 +80,36 @@ def test_humaneval(num_samples=1): oai.Completion.set_cache(seed) try: # a minimal tuning example - oai.Completion.tune( + config, _ = oai.Completion.tune( data=tune_data, metric="success", mode="max", eval_func=success_metrics, n=1, ) + responses = oai.Completion.create(context=test_data[0], **config) + # a minimal tuning example for tuning chat completion models using the Completion class + config, _ = oai.Completion.tune( + data=tune_data, + metric="success", + mode="max", + eval_func=success_metrics, + n=1, + model="gpt-3.5-turbo", + ) + responses = oai.Completion.create(context=test_data[0], **config) + # a minimal tuning example for tuning chat completion models using the Completion class + config, _ = oai.ChatCompletion.tune( + data=tune_data, + metric="success", + mode="max", + eval_func=success_metrics, + n=1, + messages=[{"role": "user", "content": "{prompt}"}], + ) + responses = oai.ChatCompletion.create(context=test_data[0], **config) + print(responses) + return # a more comprehensive tuning example config, analysis = oai.Completion.tune( data=tune_data, @@ -94,8 +117,8 @@ def test_humaneval(num_samples=1): mode="max", eval_func=success_metrics, log_file_name="logs/humaneval.log", - inference_budget=0.02, - optimization_budget=5, + inference_budget=0.002, + optimization_budget=2, num_samples=num_samples, prompt=[ "{prompt}", diff --git a/test/openai/test_notebook.py b/test/openai/test_notebook.py index dc2fef32b..e8cb51588 100644 --- a/test/openai/test_notebook.py +++ b/test/openai/test_notebook.py @@ -38,5 +38,13 @@ def test_integrate_openai(save=False): run_notebook("integrate_openai.ipynb", save=save) +@pytest.mark.skipif( + skip, + reason="do not run openai test if openai is not installed", +) +def test_integrate_chatgpt(save=False): + run_notebook("integrate_chatgpt_math.ipynb", save=save) + + if __name__ == "__main__": - test_integrate_openai(save=True) + test_integrate_chatgpt(save=True) diff --git 
a/website/docs/Examples/Integrate - OpenAI.md b/website/docs/Examples/Integrate - OpenAI.md index f9b2b3287..65e8bfa06 100644 --- a/website/docs/Examples/Integrate - OpenAI.md +++ b/website/docs/Examples/Integrate - OpenAI.md @@ -1,10 +1,10 @@ -FLAML has integrated the OpenAI's completion API. In this example, we will tune several hyperparameters including the temperature, prompt and n to optimize the inference performance of OpenAI's completion API for a code generation task. Our study shows that tuning hyperparameters can significantly affect the utility of the OpenAI API. +FLAML offers a cost-effective hyperparameter optimization technique [EcoOptiGen](https://arxiv.org/abs/2303.04673). In this example, we will tune several hyperparameters for OpenAI's completion API, including the temperature, prompt, and n (number of completions), to optimize the inference performance for a code generation task. Our study shows that tuning hyperparameters can significantly affect the utility of the OpenAI API. ### Prerequisites -Install the [openai] option. The option is available in flaml since version 1.1.3. This feature is subject to change in future versions. +Install the [openai] option. The OpenAI integration is in preview. ChatGPT support is available since version 1.2.0. ```bash -pip install "flaml[openai]==1.1.3" +pip install "flaml[openai]==1.2.0" ``` diff --git a/website/docs/Research.md b/website/docs/Research.md index c672c30f4..7c1d9ceb4 100644 --- a/website/docs/Research.md +++ b/website/docs/Research.md @@ -91,3 +91,14 @@ year={2023}, url={https://openreview.net/forum?id=0Ij9_q567Ma} } ``` + +* [Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference](https://arxiv.org/abs/2303.04673). Chi Wang, Susan Xueqing Liu, Ahmed H. Awadallah. ArXiv preprint arXiv:2303.04673 (2023). + +```bibtex
+@inproceedings{wang2023EcoOptiGen,
+ title={Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference},
+ author={Chi Wang and Susan Xueqing Liu and Ahmed H. Awadallah},
+ year={2023},
+ booktitle={ArXiv preprint arXiv:2303.04673},
+}
+```
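A closing note on the cost accounting behind the budgets used throughout this patch: prices are quoted in dollars per 1K tokens (e.g., 0.002 for gpt-3.5-turbo in the notebook diff above), so the cost of a single response follows directly from the token usage the API reports. Below is a minimal sketch under that assumption; `response_cost` is an illustrative helper, not a flaml API.

```python
# Dollars per 1K tokens, matching the entry added for gpt-3.5-turbo in this patch.
price1K = {"gpt-3.5-turbo": 0.002}

def response_cost(model: str, usage: dict) -> float:
    # Illustrative helper (not a flaml API): the price is per 1K tokens,
    # so scale the reported total token count accordingly.
    return price1K[model] * usage["total_tokens"] / 1000

# The example chat completion earlier in the notebook reports 687 total tokens:
print(response_cost("gpt-3.5-turbo", {"total_tokens": 687}))  # 0.001374
```

At these prices, an inference budget of $0.002 per instance, as used in the math notebook above, corresponds to roughly 1K tokens of headroom per request.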