From 47168a9cb933d92b287cc819d2f3e015e25ef52f Mon Sep 17 00:00:00 2001 From: "a.zemerov" Date: Mon, 22 Jul 2024 16:30:29 +0300 Subject: [PATCH 1/6] Change default parameters for judge config --- config/judge_config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config/judge_config.yaml b/config/judge_config.yaml index 5090d2a6..8de9a367 100644 --- a/config/judge_config.yaml +++ b/config/judge_config.yaml @@ -3,12 +3,12 @@ name: judgment config file for Arena Hard bench_name: arena-hard-v0.1 # Arena Hard default -judge_model: gpt-4-1106-preview +judge_model: gpt-4o reference: False # Optional ref_model: null baseline: True -baseline_model: gpt-4-0314 +baseline_model: gpt-4-0613 pairwise: True temperature: 0 From 5430faa4d65a07ac7c4403569dd253332b8de399 Mon Sep 17 00:00:00 2001 From: "a.zemerov" Date: Tue, 23 Jul 2024 18:46:16 +0300 Subject: [PATCH 2/6] add new metrics and api types --- gen_answer.py | 72 +++++++++++++++++++++---------------------------- gen_judgment.py | 14 ++++++---- show_result.py | 23 +++++++++++----- utils.py | 8 ++++-- 4 files changed, 62 insertions(+), 55 deletions(-) diff --git a/gen_answer.py b/gen_answer.py index e33b6231..c5c0dcd6 100644 --- a/gen_answer.py +++ b/gen_answer.py @@ -27,6 +27,7 @@ chat_completion_yandex, chat_completion_sber, detect_language, + detect_repetitions, OPENAI_MODEL_LIST, temperature_config, ) @@ -53,54 +54,42 @@ def get_answer( turns = [] for j in range(len(question["turns"])): conv.append({"role": "user", "content": question["turns"][j]["content"]}) - if api_type == "anthropic": - output = chat_completion_anthropic(model=endpoint_info["model_name"], - messages=conv, - temperature=temperature, - max_tokens=max_tokens) - elif api_type == "mistral": - output = chat_completion_mistral(model=endpoint_info["model_name"], - messages=conv, - temperature=temperature, - max_tokens=max_tokens) - elif api_type == "gemini": - output = chat_completion_gemini(model=endpoint_info["model_name"], - messages=question["turns"][j]["content"], - temperature=temperature, - max_tokens=max_tokens) - elif api_type == "azure": - output = chat_completion_openai_azure(model=endpoint_info["model_name"], - messages=conv, - temperature=temperature, - max_tokens=max_tokens, - api_dict=api_dict) - elif api_type == "cohere": - output = chat_completion_cohere(model=endpoint_info["model_name"], - messages=conv, - temperature=temperature, - max_tokens=max_tokens) - elif api_type == "yandex": + if api_type == "yandex": for reply in conv: # Rename key name for compatibility with yandex api reply["text"] = reply.pop("content") - output = chat_completion_yandex(model=endpoint_info["model_name"], - messages=conv, - temperature=temperature, - max_tokens=max_tokens) + output, prompt_tokens, completion_tokens = chat_completion_yandex( + model=endpoint_info["model_name"], + messages=conv, + temperature=temperature, + max_tokens=max_tokens + ) elif api_type == "sber": - output = chat_completion_sber(model=endpoint_info["model_name"], - messages=conv, - temperature=temperature, - max_tokens=max_tokens) + output, prompt_tokens, completion_tokens = chat_completion_sber( + model=endpoint_info["model_name"], + messages=conv, + temperature=temperature, + max_tokens=max_tokens + ) else: - output = chat_completion_openai(model=endpoint_info["model_name"], - messages=conv, - temperature=temperature, - max_tokens=max_tokens, - api_dict=api_dict) + output, prompt_tokens, completion_tokens = chat_completion_openai( + model=endpoint_info["model_name"], + messages=conv, 
+ temperature=temperature, + max_tokens=max_tokens, + api_dict=api_dict + ) conv.append({"role": "assistant", "content": output}) - turns.append({"content": output, "token_len": len(encoding.encode(output))}) + turns.append( + { + "content": output, + "token_len": prompt_tokens + completion_tokens, + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens + } + ) + choices.append({"index": i, "turns": turns}) # Dump answers @@ -194,3 +183,4 @@ def get_answer( future.result() detect_language(answer_file) + detect_repetitions(answer_file) diff --git a/gen_judgment.py b/gen_judgment.py index 839c2892..ecd42df2 100644 --- a/gen_judgment.py +++ b/gen_judgment.py @@ -33,7 +33,7 @@ def get_score(judgment, pattern, pairwise=True): # get answer from model -def get_answer(model, conv, temperature, max_tokens, endpoint_dict=None): +def get_answer(model, conv, temperature, max_tokens, endpoint_dict=None) -> (str, int, int): api_dict = get_endpoint(endpoint_dict["endpoints"]) if endpoint_dict["api_type"] == "anthropic": @@ -41,8 +41,10 @@ def get_answer(model, conv, temperature, max_tokens, endpoint_dict=None): elif endpoint_dict["api_type"] == "azure": output = chat_completion_openai_azure(model, conv, temperature, max_tokens, api_dict) else: - output = chat_completion_openai(model, conv, temperature, max_tokens, api_dict) - return output + output, prompt_tokens, completion_tokens = chat_completion_openai( + model, conv, temperature, max_tokens, api_dict) + + return output, prompt_tokens, completion_tokens def judgment(**args): @@ -96,7 +98,7 @@ def judgment(**args): judgment = "" for _ in range(2): - new_judgment = get_answer( + new_judgment, prompt_tokens, completion_tokens = get_answer( model, conv, configs["temperature"], @@ -118,7 +120,9 @@ def judgment(**args): result = { "user_prompt": conv[1]["content"], "judgment": judgment, - "score":score + "score": score, + "completion_tokens": completion_tokens, + "prompt_tokens": prompt_tokens, } output["games"].append(result) diff --git a/show_result.py b/show_result.py index d0872d5d..d8f6c090 100644 --- a/show_result.py +++ b/show_result.py @@ -275,17 +275,26 @@ def get_battles_from_judgment(judge_name, first_game_only=False, WEIGHT=3, basel else: decimal = 0 stats = stats.astype({"score" : int, "lower" : int, "upper" : int}) - + + stats["repetition_openai"] = stats["repetition_openai"].apply(lambda x: f"{round(x * 100, 1)}%") + stats["repetitions"] = stats["repetitions"].apply(lambda x: f"{round(x * 100, 1)}%") + stats["ru"] = stats["ru"].apply(lambda x: f"{round(x * 100, 1)}%") + stats["interval"] = stats.apply( + lambda x: str((round(x['lower'] - x['score'], decimal), round(x['upper'] - x['score'], decimal))), + axis=1 + ) + stats.sort_values(by="score", ascending=False, inplace=True) for _, row in stats.iterrows(): - interval = str((round(row['lower'] - row['score'], decimal), round(row['upper'] - row['score'], decimal))) - print(f"{row['model'] : <30} | score: {round(row['score'], decimal) : ^5} | 95% CI: {interval : ^12} | " - f"repetition_openai: {round(row['repetition_openai'] * 100, 1) : ^3}% | " - f"repetitions: {round(row['repetitions'] * 100, 1) : ^3}% | " + print(f"{row['model'] : <50} | score: {round(row['score'], decimal) : ^5} | 95% CI: {row['interval'] : ^12} | " + f"repetition_openai: {row['repetition_openai'] : ^5} | " + f"repetitions: {row['repetitions'] : ^5} | " f"average #tokens: {int(row['avg_tokens']) : ^5} | " - f"ru: {round(row['ru'] * 100, 1)}%") + f"ru: {row['ru']}") + stats = 
stats.drop(columns=["results"]) if args.output: cur_date = datetime.datetime.now() date_str = cur_date.strftime("%Y%m%d") - stats.to_json(f"arena_hard_leaderboard_{date_str}.json", orient="records", indent=4) \ No newline at end of file + stats.to_json(f"data/arena-hard-v0.1/_leaderboard_{date_str}.json", orient="records", indent=4) + stats.to_csv(f"data/arena-hard-v0.1/arena_hard_leaderboard_{date_str}.csv", index=False) diff --git a/utils.py b/utils.py index 3d4ea99e..de410199 100644 --- a/utils.py +++ b/utils.py @@ -107,7 +107,7 @@ def make_config(config_file: str) -> dict: return config_kwargs -def chat_completion_openai(model, messages, temperature, max_tokens, api_dict=None): +def chat_completion_openai(model, messages, temperature, max_tokens, api_dict=None) -> (str, int, int): import openai if api_dict: client = openai.OpenAI( @@ -118,6 +118,8 @@ def chat_completion_openai(model, messages, temperature, max_tokens, api_dict=No client = openai.OpenAI() output = API_ERROR_OUTPUT + prompt_tokens = 0 + completion_tokens = 0 for _ in range(API_MAX_RETRY): try: # print(messages) @@ -128,6 +130,8 @@ def chat_completion_openai(model, messages, temperature, max_tokens, api_dict=No max_tokens=max_tokens ) output = completion.choices[0].message.content + prompt_tokens = completion.usage.prompt_tokens + completion_tokens = completion.usage.completion_tokens break except openai.RateLimitError as e: print(type(e), e) @@ -144,7 +148,7 @@ def chat_completion_openai(model, messages, temperature, max_tokens, api_dict=No except Exception as e: print(type(e), repr(e)) - return output + return output, prompt_tokens, completion_tokens def chat_completion_openai_azure(model, messages, temperature, max_tokens, api_dict=None): From a7fa1553ddf8da4f32237429411d9d97d3a80973 Mon Sep 17 00:00:00 2001 From: "a.zemerov" Date: Thu, 25 Jul 2024 17:51:54 +0300 Subject: [PATCH 3/6] Change Yandex auth --- utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/utils.py b/utils.py index de410199..831a78b4 100644 --- a/utils.py +++ b/utils.py @@ -365,8 +365,7 @@ def chat_completion_yandex(model, messages, temperature, max_tokens, api_dict=No # Set up the headers headers = { "Content-Type": "application/json", - "Authorization": f"Bearer {IAM_TOKEN}", - "x-folder-id": FOLDER_ID + "Authorization": f"Api-Key {IAM_TOKEN}", } output: str = API_ERROR_OUTPUT From 4d385f529380e5f283ee44f876ef8fd2d2d9ba8b Mon Sep 17 00:00:00 2001 From: "y.shakhvalieva" Date: Thu, 25 Jul 2024 22:55:07 +0300 Subject: [PATCH 4/6] Add calculation of mean number of api errors. 
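
The metric added below is the share of a model's answers whose first turn still
contains the API error sentinel (utils.API_ERROR_OUTPUT), i.e. calls that never
returned a real completion. As a minimal sketch of the same computation,
assuming the {"choices": [{"turns": [...]}]} answer layout used by gen_answer.py
(the helper name api_error_rate is illustrative, not part of this patch):

    from typing import Dict

    import utils  # repo module; API_ERROR_OUTPUT is the sentinel written when an API call fails

    def api_error_rate(answers: Dict[str, dict]) -> float:
        """Fraction of answers whose first turn still contains the error sentinel."""
        flags = [
            utils.API_ERROR_OUTPUT in row["choices"][0]["turns"][0]["content"]
            for row in answers.values()
        ]
        return sum(flags) / len(flags) if flags else 0.0

The leaderboard then reports this fraction per model as a percentage, alongside
the repetition and language columns.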
--- show_result.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/show_result.py b/show_result.py index d8f6c090..88fd41d5 100644 --- a/show_result.py +++ b/show_result.py @@ -13,6 +13,8 @@ from sklearn.linear_model import LogisticRegression from collections import defaultdict + +import utils from utils import load_model_answers, REPETITION_OUTPUT RU_LANG_LABEL = "__label__rus_Cyrl" @@ -262,6 +264,14 @@ def get_battles_from_judgment(judge_name, first_game_only=False, WEIGHT=3, basel repetitions.append(turn["repetition"]) stats.at[i, "repetitions"] = sum(repetitions) / len(repetitions) if len(repetitions) > 0 else 0 + # Calculate mean number of api errors + errors = [] + if model in model_answers: + for _, row in model_answers[model].items(): + turn = row["choices"][0]["turns"][0] + errors.append(utils.API_ERROR_OUTPUT in turn["content"]) + stats.at[i, "errors"] = sum(errors) / len(errors) if len(errors) > 0 else 0 + stats.at[i, "results"] = bootstrap_elo_lu[model].tolist() stats.at[i, "repetition_openai"] = repetition_scores[model] if model in repetition_scores else 0 @@ -278,6 +288,7 @@ def get_battles_from_judgment(judge_name, first_game_only=False, WEIGHT=3, basel stats["repetition_openai"] = stats["repetition_openai"].apply(lambda x: f"{round(x * 100, 1)}%") stats["repetitions"] = stats["repetitions"].apply(lambda x: f"{round(x * 100, 1)}%") + stats["errors"] = stats["errors"].apply(lambda x: f"{round(x * 100, 1)}%") stats["ru"] = stats["ru"].apply(lambda x: f"{round(x * 100, 1)}%") stats["interval"] = stats.apply( lambda x: str((round(x['lower'] - x['score'], decimal), round(x['upper'] - x['score'], decimal))), @@ -289,6 +300,7 @@ def get_battles_from_judgment(judge_name, first_game_only=False, WEIGHT=3, basel print(f"{row['model'] : <50} | score: {round(row['score'], decimal) : ^5} | 95% CI: {row['interval'] : ^12} | " f"repetition_openai: {row['repetition_openai'] : ^5} | " f"repetitions: {row['repetitions'] : ^5} | " + f"errors: {row['errors'] : ^5} | " f"average #tokens: {int(row['avg_tokens']) : ^5} | " f"ru: {row['ru']}") From baaee8d7550ed3eff3180df298d5f1a553ff3455 Mon Sep 17 00:00:00 2001 From: "y.shakhvalieva" Date: Thu, 25 Jul 2024 23:16:56 +0300 Subject: [PATCH 5/6] Add answer regeneration in case of error in existing answer. 
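
With this change an existing answer is reused only if it does not contain the
API error sentinel; otherwise the question is resubmitted on the next run. A
rough equivalent of the new skip condition, under the same cached-answer layout
as above (the helper name can_reuse is illustrative, the patch inlines this
check in gen_answer.py):

    import utils  # repo module; API_ERROR_OUTPUT marks answers that failed with an API error

    def can_reuse(existing_answer: dict, model: str, question_id: str) -> bool:
        """True if a cached answer exists for this question and did not end in an API error."""
        try:
            content = existing_answer[model][question_id]["choices"][0]["turns"][0]["content"]
        except KeyError:
            return False
        return utils.API_ERROR_OUTPUT not in content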
--- gen_answer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/gen_answer.py b/gen_answer.py index c5c0dcd6..1fdf0a5f 100644 --- a/gen_answer.py +++ b/gen_answer.py @@ -13,6 +13,7 @@ import shortuuid import tqdm +import utils from utils import ( load_questions, load_model_answers, @@ -160,7 +161,9 @@ def get_answer( futures = [] count = 0 for index, question in enumerate(questions): - if model in existing_answer and question["question_id"] in existing_answer[model]: + if model in existing_answer and question["question_id"] in existing_answer[model] \ + and utils.API_ERROR_OUTPUT not in \ + existing_answer[model][question["question_id"]]['choices'][0]['turns'][0]['content']: count += 1 continue future = executor.submit( From 2e6bc14f5a98286df65054f76ffeeee3967fe679 Mon Sep 17 00:00:00 2001 From: "y.shakhvalieva" Date: Fri, 2 Aug 2024 11:33:45 +0300 Subject: [PATCH 6/6] Upload benchmark automation --- automation/.gen_answer_ascii.txt | 7 + automation/.gen_judgment_ascii.txt | 8 ++ automation/.get_token_ascii.txt | 8 ++ automation/.results_ascii.txt | 7 + automation/.run_vllm_ascii.txt | 7 + automation/config_constants.py | 18 +++ automation/default_hyperparameters.txt | 14 ++ automation/make_yaml_config.py | 143 ++++++++++++++++++++ automation/utils.sh | 180 +++++++++++++++++++++++++ start.sh | 26 ++++ 10 files changed, 418 insertions(+) create mode 100644 automation/.gen_answer_ascii.txt create mode 100644 automation/.gen_judgment_ascii.txt create mode 100644 automation/.get_token_ascii.txt create mode 100644 automation/.results_ascii.txt create mode 100644 automation/.run_vllm_ascii.txt create mode 100644 automation/config_constants.py create mode 100644 automation/default_hyperparameters.txt create mode 100644 automation/make_yaml_config.py create mode 100644 automation/utils.sh create mode 100755 start.sh diff --git a/automation/.gen_answer_ascii.txt b/automation/.gen_answer_ascii.txt new file mode 100644 index 00000000..958bc3da --- /dev/null +++ b/automation/.gen_answer_ascii.txt @@ -0,0 +1,7 @@ + + ____ ____ ____ ____ _____ ______ _____ _____ + / __ `/ _ \/ __ \ / __ `/ __ \/ ___/ | /| / / _ \/ ___/ + / /_/ / __/ / / / / /_/ / / / (__ )| |/ |/ / __/ / + \__, /\___/_/ /_/ \__,_/_/ /_/____/ |__/|__/\___/_/ +/____/ + diff --git a/automation/.gen_judgment_ascii.txt b/automation/.gen_judgment_ascii.txt new file mode 100644 index 00000000..61cb4580 --- /dev/null +++ b/automation/.gen_judgment_ascii.txt @@ -0,0 +1,8 @@ + + _ __ __ + ____ ____ ____ (_)_ ______/ /___ _____ ___ ___ ____ / /_ + / __ `/ _ \/ __ \ / / / / / __ / __ `/ __ `__ \/ _ \/ __ \/ __/ + / /_/ / __/ / / / / / /_/ / /_/ / /_/ / / / / / / __/ / / / /_ + \__, /\___/_/ /_/ __/ /\__,_/\__,_/\__, /_/ /_/ /_/\___/_/ /_/\__/ +/____/ /___/ /____/ + diff --git a/automation/.get_token_ascii.txt b/automation/.get_token_ascii.txt new file mode 100644 index 00000000..178c0410 --- /dev/null +++ b/automation/.get_token_ascii.txt @@ -0,0 +1,8 @@ + + __ __ __ + ____ ____ / /_ / /_____ / /_____ ____ + / __ `/ _ \/ __/ / __/ __ \/ //_/ _ \/ __ \ + / /_/ / __/ /_ / /_/ /_/ / ,< / __/ / / / + \__, /\___/\__/ \__/\____/_/|_|\___/_/ /_/ +/____/ + diff --git a/automation/.results_ascii.txt b/automation/.results_ascii.txt new file mode 100644 index 00000000..ad693f71 --- /dev/null +++ b/automation/.results_ascii.txt @@ -0,0 +1,7 @@ + + ____ + ________ _______ __/ / /______ + / ___/ _ \/ ___/ / / / / __/ ___/ + / / / __(__ ) /_/ / / /_(__ ) +/_/ \___/____/\__,_/_/\__/____/ + diff --git a/automation/.run_vllm_ascii.txt 
b/automation/.run_vllm_ascii.txt new file mode 100644 index 00000000..b23dd0bf --- /dev/null +++ b/automation/.run_vllm_ascii.txt @@ -0,0 +1,7 @@ + + ____ + _______ ______ _ __/ / /___ ___ + / ___/ / / / __ \ | | / / / / __ `__ \ + / / / /_/ / / / / | |/ / / / / / / / / +/_/ \__,_/_/ /_/ |___/_/_/_/ /_/ /_/ + diff --git a/automation/config_constants.py b/automation/config_constants.py new file mode 100644 index 00000000..f9cb2233 --- /dev/null +++ b/automation/config_constants.py @@ -0,0 +1,18 @@ +# api_config.yaml +gpt_4o_parallel = 250 +gpt_4_0613_parallel = 50 +yandex_parallel = 1 +gigachat_parallel = 1 +hf_parallel = 1 + +question_file_name = 'question_ru.jsonl' + +# judge_config.yaml +prompt_template = ["<|User Prompt|>\n{question_1}\n\n<|The Start of Assistant A's Answer|>\n{answer_1}\n<|The End of Assistant A's Answer|>\n\n<|The Start of Assistant B's Answer|>\n{answer_2}\n<|The End of Assistant B's Answer|>"] +system_prompt = "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\n\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\n\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\n\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\n\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\n\nAfter providing your explanation, you must output only one of the following choices as your final verdict with a label:\n\n1. Assistant A is significantly better: [[A>>B]]\n2. Assistant A is slightly better: [[A>B]]\n3. Tie, relatively the same: [[A=B]]\n4. Assistant B is slightly better: [[B>A]]\n5. Assistant B is significantly better: [[B>>A]]\n\nExample output: \"My final verdict is tie: [[A=B]]\"." 
+regex_pattern = f"\[\[([AB<>=]+)\]\]" + +reference = False +ref_model = None +baseline = True +pairwise = True diff --git a/automation/default_hyperparameters.txt b/automation/default_hyperparameters.txt new file mode 100644 index 00000000..dac2cdfe --- /dev/null +++ b/automation/default_hyperparameters.txt @@ -0,0 +1,14 @@ +vllm_port = 8880 +hf_parallel = 1 + +bench_name = arena-hard-v0.1 +judge_model = gpt-4o +baseline_model = gpt-4-0613 + +gen_answer_temperature = 0.0 +gen_answer_max_tokens = 4096 +gen_answer_num_choices = 1 + +judge_config_temperature = 0 +judge_config_max_tokens = 4096 + diff --git a/automation/make_yaml_config.py b/automation/make_yaml_config.py new file mode 100644 index 00000000..8244600c --- /dev/null +++ b/automation/make_yaml_config.py @@ -0,0 +1,143 @@ +# -*- coding: utf-8 -*- +import sys +import yaml +import config_constants + + +class QuotedString(str): + pass + + +class CustomDumper(yaml.Dumper): + def represent_data(self, data): + if type(data) == QuotedString: + return self.represent_scalar('tag:yaml.org,2002:str', data, style='"') + + return super(CustomDumper, self).represent_data(data) + + +def save_yaml(data_name, data): + file_name = 'config/' + data_name + '.yaml' + + with open(file_name, 'w', encoding='utf-8') as outfile: + yaml.dump(data, outfile, + default_flow_style=False, + encoding='utf-8', + width=2000, + allow_unicode=True, + Dumper=CustomDumper) + + +def save_api_config(hyperparameters): + api_config = { + 'gpt-4o': { + 'model_name': 'gpt-4o', + 'endpoints': None, + 'api_type': 'openai', + 'parallel': config_constants.gpt_4o_parallel, + }, + 'gpt-4-0613': { + 'model_name': 'gpt-4-0613', + 'endpoints': None, + 'api_type': 'openai', + 'parallel': config_constants.gpt_4_0613_parallel, + } + } + + if hyperparameters['model_type'] == 'yandex': + model_config = { + 'model_name': hyperparameters['original_model_name'], + 'system_prompt': QuotedString('Ты полезный AI-ассистент.'), + 'endpoints': None, + 'api_type': 'yandex', + 'parallel': config_constants.yandex_parallel, + } + elif hyperparameters['model_type'] == 'gigachat': + model_config = { + 'model_name': hyperparameters['original_model_name'], + 'system_prompt': QuotedString('Ты полезный AI-ассистент.'), + 'endpoints': None, + 'api_type': 'sber', + 'parallel': config_constants.gigachat_parallel, + } + elif hyperparameters['model_type'] == 'hf': + model_config = { + 'model_name': hyperparameters['original_model_name'], + 'endpoints': [ + { + 'api_base': f"http://{hyperparameters['hostname']}:{hyperparameters['vllm_port']}/v1", + 'api_key': 'default-token', + }, + ], + 'api_type': 'openai', + 'parallel': hyperparameters['hf_parallel'], + } + else: + raise ValueError('Incorrect model type') + + api_config[hyperparameters['model_alias']] = model_config + save_yaml('api_config', api_config) + + +def save_gen_answer_config(hyperparameters): + gen_answer_config = { + 'name': f'config of answer generation for {hyperparameters["bench_name"]}', + 'bench_name': hyperparameters['bench_name'], + 'temperature': hyperparameters['gen_answer_temperature'], + 'max_tokens': hyperparameters['gen_answer_max_tokens'], + 'num_choices': hyperparameters['gen_answer_num_choices'], + 'question_file': QuotedString(config_constants.question_file_name), + 'model_list': [hyperparameters['baseline_model'], hyperparameters['model_alias']], + } + + save_yaml('gen_answer_config', gen_answer_config) + + +def save_judge_config(hyperparameters): + judge_config = { + 'name': f'judgment config file for 
{hyperparameters["bench_name"]}', + 'bench_name': hyperparameters['bench_name'], + 'judge_model': hyperparameters['judge_model'], + 'reference': config_constants.reference, + 'ref_model': config_constants.ref_model, + 'baseline': config_constants.baseline, + 'baseline_model': hyperparameters['baseline_model'], + 'pairwise': config_constants.pairwise, + 'temperature': hyperparameters['judge_config_temperature'], + 'max_tokens': hyperparameters['judge_config_max_tokens'], + 'regex_pattern': config_constants.regex_pattern, + 'system_prompt': QuotedString(config_constants.system_prompt), + 'prompt_template': config_constants.prompt_template, + 'question_file': QuotedString(config_constants.question_file_name), + 'model_list': [hyperparameters['model_alias']], + } + + save_yaml('judge_config', judge_config) + + +def correct_type(arg): + try: + return int(arg) + except ValueError: + try: + return float(arg) + except ValueError: + return arg + + +def main(args): + data = [correct_type(arg) for arg in args[1:] if arg != "="] + hyperparameters = dict() + + for i in range(0, len(data), 2): + var_name = data[i] + var_value = data[i + 1] + hyperparameters[var_name] = var_value + + save_api_config(hyperparameters) + save_gen_answer_config(hyperparameters) + save_judge_config(hyperparameters) + + +if __name__ == '__main__': + main(sys.argv) diff --git a/automation/utils.sh b/automation/utils.sh new file mode 100644 index 00000000..91cd1a3d --- /dev/null +++ b/automation/utils.sh @@ -0,0 +1,180 @@ +#!/bin/bash + +function get_param_from_string { + echo "$1" | grep "$2" | cut -d '=' -f 2 | sed 's/ //g' +} + +function read_original_model_name { + read -p "Enter model name. + +Examples: + - deepvk/gemma-2b-sft + - yandexgpt-lite + - GigaChat + +" original_model_name; + + echo "${original_model_name//["'\""]/}" +} + +function resolve_model_type { + if [[ "$1" = 'yandex' ]]; + then + echo 'yandex' + elif [[ "$1" = 'gigachat' ]]; + then + echo 'gigachat' + else + echo 'hf' + fi +} + +function check_export_vars { + vars_count=0 + + for var_name in ${!1}; + do + if ! [[ -n "${!var_name}" ]]; + then + vars_count+=0 + echo "You should export $var_name" + else + echo "OK $var_name" + fi + done + + if ! [[ ($vars_count == 0) ]]; + then + exit 1 + fi +} + +function get_var_values { + read -r -p "---------------------------------------- +$var_values +---------------------------------------- +Would you like to use this hyperparameters? [y/N] " response + + case "$response" in + [yY][eE][sS]|[yY]) + echo "$var_values" + ;; + *) + read -p " +Enter hyperparameters you would like to change: + ... 
+" new_values + + array=($new_values) + for ((i=0;i< ${#array[@]} ;i+=2)); + do + var_name=${array[i]} + var_value=${array[i+1]} + var_values=$(echo "$var_values" | sed "s/\($var_name = \([^=]*\)\)/$var_name = $var_value/g") + done + var_values=$(get_var_values) + echo "$var_values" + ;; + esac +} + +function make_yaml_configs { + hostname="$(hostname -f)" + + python automation/make_yaml_config.py $var_values \ + "original_model_name" "$original_model_name" \ + "model_alias" "$model_alias" \ + "model_type" "$model_type" \ + "hostname" "$hostname" +} + +function get_gigachat_token { + cat "automation/.get_token_ascii.txt" + + response=$(curl -k -s -L -X POST 'https://ngw.devices.sberbank.ru:9443/api/v2/oauth' \ + -H 'Content-Type: application/x-www-form-urlencoded' \ + -H 'Accept: application/json' \ + -H 'RqUID: '"$(uuidgen)"'' \ + -H 'Authorization: Basic '"$1"'' \ + --data-urlencode 'scope=GIGACHAT_API_PERS') + echo "$response" | sed -E 's/.*"access_token":"(.*)",.*/\1/' +} + +function run_vllm { + cat "automation/.run_vllm_ascii.txt" + + clean_env "$model_type" + + vllm_port=$(get_param_from_string "$var_values" "vllm_port") + hf_cache="/nfs/$(whoami)/hf_cache" + + export TOKEN="$HF_TOKEN" + export MODEL="$original_model_name" + export PORT="$vllm_port" + export HF_CACHE="$hf_cache" + + docker run --runtime nvidia --gpus '"device=0,1,2,3,4,5,6,7"' \ + -v "$HF_CACHE:/root/.cache/huggingface" \ + -dit \ + --name "script_vllm_container" \ + --env "HUGGING_FACE_HUB_TOKEN=${TOKEN}" \ + --env VLLM_ATTENTION_BACKEND=FLASHINFER \ + --ipc=host \ + --net=host \ + --uts=host \ + vllm/vllm-openai:latest \ + --model "$MODEL" \ + --api-key default-token \ + --dtype auto \ + --port "$PORT" + + while docker logs "script_vllm_container" | grep -q "Application startup complete"; + do + sleep 0.1; + done +} + +function clean_env { + if [[ "$1" == 'hf' ]]; + then + container_id=$(docker ps -a | grep "script_vllm_container" | cut -d " " -f1) + + if ! [ -z "$container_id" ]; + then + docker kill "$container_id" + docker rm -f "$container_id" + fi + + fi +} + +function prepare_env { + if [[ "$1" == 'gigachat' ]]; + then + GIGACHAT_TOKEN=$(get_gigachat_token "$GIGACHAT_AUTHORIZATION_DATA") + export GIGACHAT_TOKEN="$GIGACHAT_TOKEN" + + elif [[ "$1" == 'hf' ]]; + then + run_vllm + fi +} + +function gen_answer { + cat "automation/.gen_answer_ascii.txt" + python gen_answer.py +} + +function gen_judgment { + cat "automation/.gen_judgment_ascii.txt" + python gen_judgment.py +} + +function show_result { + cat "automation/.results_ascii.txt" + judge_model=$(get_param_from_string "$var_values" "judge_model") + python show_result.py --judge-name "$judge_model" +} + + + diff --git a/start.sh b/start.sh new file mode 100755 index 00000000..afdc7fd6 --- /dev/null +++ b/start.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +vars_gigachat=("OPENAI_API_KEY" "GIGACHAT_AUTHORIZATION_DATA") +vars_yandex=("OPENAI_API_KEY" "FOLDER_ID" "IAM_TOKEN") +vars_hf=("OPENAI_API_KEY" "HF_TOKEN") + +source automation/utils.sh +original_model_name=$(read_original_model_name) + +original_model_name_lowercase=$(echo "$original_model_name" | awk '{print tolower($0)}') +model_type=$(resolve_model_type "$original_model_name_lowercase") +check_export_vars "vars_${model_type}[@]" + +var_values=$(cat "automation/default_hyperparameters.txt") +var_values=$(get_var_values) + +model_alias="${original_model_name////-}-$(date '+%d-%m-%y')" +make_yaml_configs + +prepare_env "$model_type" + +gen_answer +gen_judgment +show_result "$var_values" + +clean_env "$model_type"