From 47168a9cb933d92b287cc819d2f3e015e25ef52f Mon Sep 17 00:00:00 2001 From: "a.zemerov" Date: Mon, 22 Jul 2024 16:30:29 +0300 Subject: [PATCH 1/6] Change default parameters for judge config --- config/judge_config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config/judge_config.yaml b/config/judge_config.yaml index 5090d2a6..8de9a367 100644 --- a/config/judge_config.yaml +++ b/config/judge_config.yaml @@ -3,12 +3,12 @@ name: judgment config file for Arena Hard bench_name: arena-hard-v0.1 # Arena Hard default -judge_model: gpt-4-1106-preview +judge_model: gpt-4o reference: False # Optional ref_model: null baseline: True -baseline_model: gpt-4-0314 +baseline_model: gpt-4-0613 pairwise: True temperature: 0 From 5430faa4d65a07ac7c4403569dd253332b8de399 Mon Sep 17 00:00:00 2001 From: "a.zemerov" Date: Tue, 23 Jul 2024 18:46:16 +0300 Subject: [PATCH 2/6] add new metrics and api types --- gen_answer.py | 72 +++++++++++++++++++++---------------------------- gen_judgment.py | 14 ++++++---- show_result.py | 23 +++++++++++----- utils.py | 8 ++++-- 4 files changed, 62 insertions(+), 55 deletions(-) diff --git a/gen_answer.py b/gen_answer.py index e33b6231..c5c0dcd6 100644 --- a/gen_answer.py +++ b/gen_answer.py @@ -27,6 +27,7 @@ chat_completion_yandex, chat_completion_sber, detect_language, + detect_repetitions, OPENAI_MODEL_LIST, temperature_config, ) @@ -53,54 +54,42 @@ def get_answer( turns = [] for j in range(len(question["turns"])): conv.append({"role": "user", "content": question["turns"][j]["content"]}) - if api_type == "anthropic": - output = chat_completion_anthropic(model=endpoint_info["model_name"], - messages=conv, - temperature=temperature, - max_tokens=max_tokens) - elif api_type == "mistral": - output = chat_completion_mistral(model=endpoint_info["model_name"], - messages=conv, - temperature=temperature, - max_tokens=max_tokens) - elif api_type == "gemini": - output = chat_completion_gemini(model=endpoint_info["model_name"], - messages=question["turns"][j]["content"], - temperature=temperature, - max_tokens=max_tokens) - elif api_type == "azure": - output = chat_completion_openai_azure(model=endpoint_info["model_name"], - messages=conv, - temperature=temperature, - max_tokens=max_tokens, - api_dict=api_dict) - elif api_type == "cohere": - output = chat_completion_cohere(model=endpoint_info["model_name"], - messages=conv, - temperature=temperature, - max_tokens=max_tokens) - elif api_type == "yandex": + if api_type == "yandex": for reply in conv: # Rename key name for compatibility with yandex api reply["text"] = reply.pop("content") - output = chat_completion_yandex(model=endpoint_info["model_name"], - messages=conv, - temperature=temperature, - max_tokens=max_tokens) + output, prompt_tokens, completion_tokens = chat_completion_yandex( + model=endpoint_info["model_name"], + messages=conv, + temperature=temperature, + max_tokens=max_tokens + ) elif api_type == "sber": - output = chat_completion_sber(model=endpoint_info["model_name"], - messages=conv, - temperature=temperature, - max_tokens=max_tokens) + output, prompt_tokens, completion_tokens = chat_completion_sber( + model=endpoint_info["model_name"], + messages=conv, + temperature=temperature, + max_tokens=max_tokens + ) else: - output = chat_completion_openai(model=endpoint_info["model_name"], - messages=conv, - temperature=temperature, - max_tokens=max_tokens, - api_dict=api_dict) + output, prompt_tokens, completion_tokens = chat_completion_openai( + model=endpoint_info["model_name"], + messages=conv, 
+ temperature=temperature, + max_tokens=max_tokens, + api_dict=api_dict + ) conv.append({"role": "assistant", "content": output}) - turns.append({"content": output, "token_len": len(encoding.encode(output))}) + turns.append( + { + "content": output, + "token_len": prompt_tokens + completion_tokens, + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens + } + ) + choices.append({"index": i, "turns": turns}) # Dump answers @@ -194,3 +183,4 @@ def get_answer( future.result() detect_language(answer_file) + detect_repetitions(answer_file) diff --git a/gen_judgment.py b/gen_judgment.py index 839c2892..ecd42df2 100644 --- a/gen_judgment.py +++ b/gen_judgment.py @@ -33,7 +33,7 @@ def get_score(judgment, pattern, pairwise=True): # get answer from model -def get_answer(model, conv, temperature, max_tokens, endpoint_dict=None): +def get_answer(model, conv, temperature, max_tokens, endpoint_dict=None) -> (str, int, int): api_dict = get_endpoint(endpoint_dict["endpoints"]) if endpoint_dict["api_type"] == "anthropic": @@ -41,8 +41,10 @@ def get_answer(model, conv, temperature, max_tokens, endpoint_dict=None): elif endpoint_dict["api_type"] == "azure": output = chat_completion_openai_azure(model, conv, temperature, max_tokens, api_dict) else: - output = chat_completion_openai(model, conv, temperature, max_tokens, api_dict) - return output + output, prompt_tokens, completion_tokens = chat_completion_openai( + model, conv, temperature, max_tokens, api_dict) + + return output, prompt_tokens, completion_tokens def judgment(**args): @@ -96,7 +98,7 @@ def judgment(**args): judgment = "" for _ in range(2): - new_judgment = get_answer( + new_judgment, prompt_tokens, completion_tokens = get_answer( model, conv, configs["temperature"], @@ -118,7 +120,9 @@ def judgment(**args): result = { "user_prompt": conv[1]["content"], "judgment": judgment, - "score":score + "score": score, + "completion_tokens": completion_tokens, + "prompt_tokens": prompt_tokens, } output["games"].append(result) diff --git a/show_result.py b/show_result.py index d0872d5d..d8f6c090 100644 --- a/show_result.py +++ b/show_result.py @@ -275,17 +275,26 @@ def get_battles_from_judgment(judge_name, first_game_only=False, WEIGHT=3, basel else: decimal = 0 stats = stats.astype({"score" : int, "lower" : int, "upper" : int}) - + + stats["repetition_openai"] = stats["repetition_openai"].apply(lambda x: f"{round(x * 100, 1)}%") + stats["repetitions"] = stats["repetitions"].apply(lambda x: f"{round(x * 100, 1)}%") + stats["ru"] = stats["ru"].apply(lambda x: f"{round(x * 100, 1)}%") + stats["interval"] = stats.apply( + lambda x: str((round(x['lower'] - x['score'], decimal), round(x['upper'] - x['score'], decimal))), + axis=1 + ) + stats.sort_values(by="score", ascending=False, inplace=True) for _, row in stats.iterrows(): - interval = str((round(row['lower'] - row['score'], decimal), round(row['upper'] - row['score'], decimal))) - print(f"{row['model'] : <30} | score: {round(row['score'], decimal) : ^5} | 95% CI: {interval : ^12} | " - f"repetition_openai: {round(row['repetition_openai'] * 100, 1) : ^3}% | " - f"repetitions: {round(row['repetitions'] * 100, 1) : ^3}% | " + print(f"{row['model'] : <50} | score: {round(row['score'], decimal) : ^5} | 95% CI: {row['interval'] : ^12} | " + f"repetition_openai: {row['repetition_openai'] : ^5} | " + f"repetitions: {row['repetitions'] : ^5} | " f"average #tokens: {int(row['avg_tokens']) : ^5} | " - f"ru: {round(row['ru'] * 100, 1)}%") + f"ru: {row['ru']}") + stats = 
stats.drop(columns=["results"]) if args.output: cur_date = datetime.datetime.now() date_str = cur_date.strftime("%Y%m%d") - stats.to_json(f"arena_hard_leaderboard_{date_str}.json", orient="records", indent=4) \ No newline at end of file + stats.to_json(f"data/arena-hard-v0.1/_leaderboard_{date_str}.json", orient="records", indent=4) + stats.to_csv(f"data/arena-hard-v0.1/arena_hard_leaderboard_{date_str}.csv", index=False) diff --git a/utils.py b/utils.py index 3d4ea99e..de410199 100644 --- a/utils.py +++ b/utils.py @@ -107,7 +107,7 @@ def make_config(config_file: str) -> dict: return config_kwargs -def chat_completion_openai(model, messages, temperature, max_tokens, api_dict=None): +def chat_completion_openai(model, messages, temperature, max_tokens, api_dict=None) -> (str, int, int): import openai if api_dict: client = openai.OpenAI( @@ -118,6 +118,8 @@ def chat_completion_openai(model, messages, temperature, max_tokens, api_dict=No client = openai.OpenAI() output = API_ERROR_OUTPUT + prompt_tokens = 0 + completion_tokens = 0 for _ in range(API_MAX_RETRY): try: # print(messages) @@ -128,6 +130,8 @@ def chat_completion_openai(model, messages, temperature, max_tokens, api_dict=No max_tokens=max_tokens ) output = completion.choices[0].message.content + prompt_tokens = completion.usage.prompt_tokens + completion_tokens = completion.usage.completion_tokens break except openai.RateLimitError as e: print(type(e), e) @@ -144,7 +148,7 @@ def chat_completion_openai(model, messages, temperature, max_tokens, api_dict=No except Exception as e: print(type(e), repr(e)) - return output + return output, prompt_tokens, completion_tokens def chat_completion_openai_azure(model, messages, temperature, max_tokens, api_dict=None): From a7fa1553ddf8da4f32237429411d9d97d3a80973 Mon Sep 17 00:00:00 2001 From: "a.zemerov" Date: Thu, 25 Jul 2024 17:51:54 +0300 Subject: [PATCH 3/6] Change Yandex auth --- utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/utils.py b/utils.py index de410199..831a78b4 100644 --- a/utils.py +++ b/utils.py @@ -365,8 +365,7 @@ def chat_completion_yandex(model, messages, temperature, max_tokens, api_dict=No # Set up the headers headers = { "Content-Type": "application/json", - "Authorization": f"Bearer {IAM_TOKEN}", - "x-folder-id": FOLDER_ID + "Authorization": f"Api-Key {IAM_TOKEN}", } output: str = API_ERROR_OUTPUT From 4d385f529380e5f283ee44f876ef8fd2d2d9ba8b Mon Sep 17 00:00:00 2001 From: "y.shakhvalieva" Date: Thu, 25 Jul 2024 22:55:07 +0300 Subject: [PATCH 4/6] Add calculation of mean number of api errors. 
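
The metric added below is the share of a model's answers whose first turn still
contains the API error sentinel (utils.API_ERROR_OUTPUT), i.e. calls that never
returned a real completion. As a minimal sketch of the same computation,
assuming the {"choices": [{"turns": [...]}]} answer layout used by gen_answer.py
(the helper name api_error_rate is illustrative, not part of this patch):

    from typing import Dict

    import utils  # repo module; API_ERROR_OUTPUT is the sentinel written when an API call fails

    def api_error_rate(answers: Dict[str, dict]) -> float:
        """Fraction of answers whose first turn still contains the error sentinel."""
        flags = [
            utils.API_ERROR_OUTPUT in row["choices"][0]["turns"][0]["content"]
            for row in answers.values()
        ]
        return sum(flags) / len(flags) if flags else 0.0

The leaderboard then reports this fraction per model as a percentage, alongside
the repetition and language columns.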
--- show_result.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/show_result.py b/show_result.py index d8f6c090..88fd41d5 100644 --- a/show_result.py +++ b/show_result.py @@ -13,6 +13,8 @@ from sklearn.linear_model import LogisticRegression from collections import defaultdict + +import utils from utils import load_model_answers, REPETITION_OUTPUT RU_LANG_LABEL = "__label__rus_Cyrl" @@ -262,6 +264,14 @@ def get_battles_from_judgment(judge_name, first_game_only=False, WEIGHT=3, basel repetitions.append(turn["repetition"]) stats.at[i, "repetitions"] = sum(repetitions) / len(repetitions) if len(repetitions) > 0 else 0 + # Calculate mean number of api errors + errors = [] + if model in model_answers: + for _, row in model_answers[model].items(): + turn = row["choices"][0]["turns"][0] + errors.append(utils.API_ERROR_OUTPUT in turn["content"]) + stats.at[i, "errors"] = sum(errors) / len(errors) if len(errors) > 0 else 0 + stats.at[i, "results"] = bootstrap_elo_lu[model].tolist() stats.at[i, "repetition_openai"] = repetition_scores[model] if model in repetition_scores else 0 @@ -278,6 +288,7 @@ def get_battles_from_judgment(judge_name, first_game_only=False, WEIGHT=3, basel stats["repetition_openai"] = stats["repetition_openai"].apply(lambda x: f"{round(x * 100, 1)}%") stats["repetitions"] = stats["repetitions"].apply(lambda x: f"{round(x * 100, 1)}%") + stats["errors"] = stats["errors"].apply(lambda x: f"{round(x * 100, 1)}%") stats["ru"] = stats["ru"].apply(lambda x: f"{round(x * 100, 1)}%") stats["interval"] = stats.apply( lambda x: str((round(x['lower'] - x['score'], decimal), round(x['upper'] - x['score'], decimal))), @@ -289,6 +300,7 @@ def get_battles_from_judgment(judge_name, first_game_only=False, WEIGHT=3, basel print(f"{row['model'] : <50} | score: {round(row['score'], decimal) : ^5} | 95% CI: {row['interval'] : ^12} | " f"repetition_openai: {row['repetition_openai'] : ^5} | " f"repetitions: {row['repetitions'] : ^5} | " + f"errors: {row['errors'] : ^5} | " f"average #tokens: {int(row['avg_tokens']) : ^5} | " f"ru: {row['ru']}") From baaee8d7550ed3eff3180df298d5f1a553ff3455 Mon Sep 17 00:00:00 2001 From: "y.shakhvalieva" Date: Thu, 25 Jul 2024 23:16:56 +0300 Subject: [PATCH 5/6] Add answer regeneration in case of error in existing answer. 
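
With this change an existing answer is reused only if it does not contain the
API error sentinel; otherwise the question is resubmitted on the next run. A
rough equivalent of the new skip condition, under the same cached-answer layout
as above (the helper name can_reuse is illustrative, the patch inlines this
check in gen_answer.py):

    import utils  # repo module; API_ERROR_OUTPUT marks answers that failed with an API error

    def can_reuse(existing_answer: dict, model: str, question_id: str) -> bool:
        """True if a cached answer exists for this question and did not end in an API error."""
        try:
            content = existing_answer[model][question_id]["choices"][0]["turns"][0]["content"]
        except KeyError:
            return False
        return utils.API_ERROR_OUTPUT not in content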
--- gen_answer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/gen_answer.py b/gen_answer.py index c5c0dcd6..1fdf0a5f 100644 --- a/gen_answer.py +++ b/gen_answer.py @@ -13,6 +13,7 @@ import shortuuid import tqdm +import utils from utils import ( load_questions, load_model_answers, @@ -160,7 +161,9 @@ def get_answer( futures = [] count = 0 for index, question in enumerate(questions): - if model in existing_answer and question["question_id"] in existing_answer[model]: + if model in existing_answer and question["question_id"] in existing_answer[model] \ + and utils.API_ERROR_OUTPUT not in \ + existing_answer[model][question["question_id"]]['choices'][0]['turns'][0]['content']: count += 1 continue future = executor.submit( From 2e6bc14f5a98286df65054f76ffeeee3967fe679 Mon Sep 17 00:00:00 2001 From: "y.shakhvalieva" Date: Fri, 2 Aug 2024 11:33:45 +0300 Subject: [PATCH 6/6] Upload benchmark automation --- automation/.gen_answer_ascii.txt | 7 + automation/.gen_judgment_ascii.txt | 8 ++ automation/.get_token_ascii.txt | 8 ++ automation/.results_ascii.txt | 7 + automation/.run_vllm_ascii.txt | 7 + automation/config_constants.py | 18 +++ automation/default_hyperparameters.txt | 14 ++ automation/make_yaml_config.py | 143 ++++++++++++++++++++ automation/utils.sh | 180 +++++++++++++++++++++++++ start.sh | 26 ++++ 10 files changed, 418 insertions(+) create mode 100644 automation/.gen_answer_ascii.txt create mode 100644 automation/.gen_judgment_ascii.txt create mode 100644 automation/.get_token_ascii.txt create mode 100644 automation/.results_ascii.txt create mode 100644 automation/.run_vllm_ascii.txt create mode 100644 automation/config_constants.py create mode 100644 automation/default_hyperparameters.txt create mode 100644 automation/make_yaml_config.py create mode 100644 automation/utils.sh create mode 100755 start.sh diff --git a/automation/.gen_answer_ascii.txt b/automation/.gen_answer_ascii.txt new file mode 100644 index 00000000..958bc3da --- /dev/null +++ b/automation/.gen_answer_ascii.txt @@ -0,0 +1,7 @@ + + ____ ____ ____ ____ _____ ______ _____ _____ + / __ `/ _ \/ __ \ / __ `/ __ \/ ___/ | /| / / _ \/ ___/ + / /_/ / __/ / / / / /_/ / / / (__ )| |/ |/ / __/ / + \__, /\___/_/ /_/ \__,_/_/ /_/____/ |__/|__/\___/_/ +/____/ + diff --git a/automation/.gen_judgment_ascii.txt b/automation/.gen_judgment_ascii.txt new file mode 100644 index 00000000..61cb4580 --- /dev/null +++ b/automation/.gen_judgment_ascii.txt @@ -0,0 +1,8 @@ + + _ __ __ + ____ ____ ____ (_)_ ______/ /___ _____ ___ ___ ____ / /_ + / __ `/ _ \/ __ \ / / / / / __ / __ `/ __ `__ \/ _ \/ __ \/ __/ + / /_/ / __/ / / / / / /_/ / /_/ / /_/ / / / / / / __/ / / / /_ + \__, /\___/_/ /_/ __/ /\__,_/\__,_/\__, /_/ /_/ /_/\___/_/ /_/\__/ +/____/ /___/ /____/ + diff --git a/automation/.get_token_ascii.txt b/automation/.get_token_ascii.txt new file mode 100644 index 00000000..178c0410 --- /dev/null +++ b/automation/.get_token_ascii.txt @@ -0,0 +1,8 @@ + + __ __ __ + ____ ____ / /_ / /_____ / /_____ ____ + / __ `/ _ \/ __/ / __/ __ \/ //_/ _ \/ __ \ + / /_/ / __/ /_ / /_/ /_/ / ,< / __/ / / / + \__, /\___/\__/ \__/\____/_/|_|\___/_/ /_/ +/____/ + diff --git a/automation/.results_ascii.txt b/automation/.results_ascii.txt new file mode 100644 index 00000000..ad693f71 --- /dev/null +++ b/automation/.results_ascii.txt @@ -0,0 +1,7 @@ + + ____ + ________ _______ __/ / /______ + / ___/ _ \/ ___/ / / / / __/ ___/ + / / / __(__ ) /_/ / / /_(__ ) +/_/ \___/____/\__,_/_/\__/____/ + diff --git a/automation/.run_vllm_ascii.txt 
b/automation/.run_vllm_ascii.txt new file mode 100644 index 00000000..b23dd0bf --- /dev/null +++ b/automation/.run_vllm_ascii.txt @@ -0,0 +1,7 @@ + + ____ + _______ ______ _ __/ / /___ ___ + / ___/ / / / __ \ | | / / / / __ `__ \ + / / / /_/ / / / / | |/ / / / / / / / / +/_/ \__,_/_/ /_/ |___/_/_/_/ /_/ /_/ + diff --git a/automation/config_constants.py b/automation/config_constants.py new file mode 100644 index 00000000..f9cb2233 --- /dev/null +++ b/automation/config_constants.py @@ -0,0 +1,18 @@ +# api_config.yaml +gpt_4o_parallel = 250 +gpt_4_0613_parallel = 50 +yandex_parallel = 1 +gigachat_parallel = 1 +hf_parallel = 1 + +question_file_name = 'question_ru.jsonl' + +# judge_config.yaml +prompt_template = ["<|User Prompt|>\n{question_1}\n\n<|The Start of Assistant A's Answer|>\n{answer_1}\n<|The End of Assistant A's Answer|>\n\n<|The Start of Assistant B's Answer|>\n{answer_2}\n<|The End of Assistant B's Answer|>"] +system_prompt = "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\n\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\n\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\n\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\n\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\n\nAfter providing your explanation, you must output only one of the following choices as your final verdict with a label:\n\n1. Assistant A is significantly better: [[A>>B]]\n2. Assistant A is slightly better: [[A>B]]\n3. Tie, relatively the same: [[A=B]]\n4. Assistant B is slightly better: [[B>A]]\n5. Assistant B is significantly better: [[B>>A]]\n\nExample output: \"My final verdict is tie: [[A=B]]\"." 
+regex_pattern = f"\[\[([AB<>=]+)\]\]" + +reference = False +ref_model = None +baseline = True +pairwise = True diff --git a/automation/default_hyperparameters.txt b/automation/default_hyperparameters.txt new file mode 100644 index 00000000..dac2cdfe --- /dev/null +++ b/automation/default_hyperparameters.txt @@ -0,0 +1,14 @@ +vllm_port = 8880 +hf_parallel = 1 + +bench_name = arena-hard-v0.1 +judge_model = gpt-4o +baseline_model = gpt-4-0613 + +gen_answer_temperature = 0.0 +gen_answer_max_tokens = 4096 +gen_answer_num_choices = 1 + +judge_config_temperature = 0 +judge_config_max_tokens = 4096 + diff --git a/automation/make_yaml_config.py b/automation/make_yaml_config.py new file mode 100644 index 00000000..8244600c --- /dev/null +++ b/automation/make_yaml_config.py @@ -0,0 +1,143 @@ +# -*- coding: utf-8 -*- +import sys +import yaml +import config_constants + + +class QuotedString(str): + pass + + +class CustomDumper(yaml.Dumper): + def represent_data(self, data): + if type(data) == QuotedString: + return self.represent_scalar('tag:yaml.org,2002:str', data, style='"') + + return super(CustomDumper, self).represent_data(data) + + +def save_yaml(data_name, data): + file_name = 'config/' + data_name + '.yaml' + + with open(file_name, 'w', encoding='utf-8') as outfile: + yaml.dump(data, outfile, + default_flow_style=False, + encoding='utf-8', + width=2000, + allow_unicode=True, + Dumper=CustomDumper) + + +def save_api_config(hyperparameters): + api_config = { + 'gpt-4o': { + 'model_name': 'gpt-4o', + 'endpoints': None, + 'api_type': 'openai', + 'parallel': config_constants.gpt_4o_parallel, + }, + 'gpt-4-0613': { + 'model_name': 'gpt-4-0613', + 'endpoints': None, + 'api_type': 'openai', + 'parallel': config_constants.gpt_4_0613_parallel, + } + } + + if hyperparameters['model_type'] == 'yandex': + model_config = { + 'model_name': hyperparameters['original_model_name'], + 'system_prompt': QuotedString('Ты полезный AI-ассистент.'), + 'endpoints': None, + 'api_type': 'yandex', + 'parallel': config_constants.yandex_parallel, + } + elif hyperparameters['model_type'] == 'gigachat': + model_config = { + 'model_name': hyperparameters['original_model_name'], + 'system_prompt': QuotedString('Ты полезный AI-ассистент.'), + 'endpoints': None, + 'api_type': 'sber', + 'parallel': config_constants.gigachat_parallel, + } + elif hyperparameters['model_type'] == 'hf': + model_config = { + 'model_name': hyperparameters['original_model_name'], + 'endpoints': [ + { + 'api_base': f"http://{hyperparameters['hostname']}:{hyperparameters['vllm_port']}/v1", + 'api_key': 'default-token', + }, + ], + 'api_type': 'openai', + 'parallel': hyperparameters['hf_parallel'], + } + else: + raise ValueError('Incorrect model type') + + api_config[hyperparameters['model_alias']] = model_config + save_yaml('api_config', api_config) + + +def save_gen_answer_config(hyperparameters): + gen_answer_config = { + 'name': f'config of answer generation for {hyperparameters["bench_name"]}', + 'bench_name': hyperparameters['bench_name'], + 'temperature': hyperparameters['gen_answer_temperature'], + 'max_tokens': hyperparameters['gen_answer_max_tokens'], + 'num_choices': hyperparameters['gen_answer_num_choices'], + 'question_file': QuotedString(config_constants.question_file_name), + 'model_list': [hyperparameters['baseline_model'], hyperparameters['model_alias']], + } + + save_yaml('gen_answer_config', gen_answer_config) + + +def save_judge_config(hyperparameters): + judge_config = { + 'name': f'judgment config file for 
{hyperparameters["bench_name"]}', + 'bench_name': hyperparameters['bench_name'], + 'judge_model': hyperparameters['judge_model'], + 'reference': config_constants.reference, + 'ref_model': config_constants.ref_model, + 'baseline': config_constants.baseline, + 'baseline_model': hyperparameters['baseline_model'], + 'pairwise': config_constants.pairwise, + 'temperature': hyperparameters['judge_config_temperature'], + 'max_tokens': hyperparameters['judge_config_max_tokens'], + 'regex_pattern': config_constants.regex_pattern, + 'system_prompt': QuotedString(config_constants.system_prompt), + 'prompt_template': config_constants.prompt_template, + 'question_file': QuotedString(config_constants.question_file_name), + 'model_list': [hyperparameters['model_alias']], + } + + save_yaml('judge_config', judge_config) + + +def correct_type(arg): + try: + return int(arg) + except ValueError: + try: + return float(arg) + except ValueError: + return arg + + +def main(args): + data = [correct_type(arg) for arg in args[1:] if arg != "="] + hyperparameters = dict() + + for i in range(0, len(data), 2): + var_name = data[i] + var_value = data[i + 1] + hyperparameters[var_name] = var_value + + save_api_config(hyperparameters) + save_gen_answer_config(hyperparameters) + save_judge_config(hyperparameters) + + +if __name__ == '__main__': + main(sys.argv) diff --git a/automation/utils.sh b/automation/utils.sh new file mode 100644 index 00000000..91cd1a3d --- /dev/null +++ b/automation/utils.sh @@ -0,0 +1,180 @@ +#!/bin/bash + +function get_param_from_string { + echo "$1" | grep "$2" | cut -d '=' -f 2 | sed 's/ //g' +} + +function read_original_model_name { + read -p "Enter model name. + +Examples: + - deepvk/gemma-2b-sft + - yandexgpt-lite + - GigaChat + +" original_model_name; + + echo "${original_model_name//["'\""]/}" +} + +function resolve_model_type { + if [[ "$1" = 'yandex' ]]; + then + echo 'yandex' + elif [[ "$1" = 'gigachat' ]]; + then + echo 'gigachat' + else + echo 'hf' + fi +} + +function check_export_vars { + vars_count=0 + + for var_name in ${!1}; + do + if ! [[ -n "${!var_name}" ]]; + then + vars_count+=0 + echo "You should export $var_name" + else + echo "OK $var_name" + fi + done + + if ! [[ ($vars_count == 0) ]]; + then + exit 1 + fi +} + +function get_var_values { + read -r -p "---------------------------------------- +$var_values +---------------------------------------- +Would you like to use this hyperparameters? [y/N] " response + + case "$response" in + [yY][eE][sS]|[yY]) + echo "$var_values" + ;; + *) + read -p " +Enter hyperparameters you would like to change: + ... 
+" new_values + + array=($new_values) + for ((i=0;i< ${#array[@]} ;i+=2)); + do + var_name=${array[i]} + var_value=${array[i+1]} + var_values=$(echo "$var_values" | sed "s/\($var_name = \([^=]*\)\)/$var_name = $var_value/g") + done + var_values=$(get_var_values) + echo "$var_values" + ;; + esac +} + +function make_yaml_configs { + hostname="$(hostname -f)" + + python automation/make_yaml_config.py $var_values \ + "original_model_name" "$original_model_name" \ + "model_alias" "$model_alias" \ + "model_type" "$model_type" \ + "hostname" "$hostname" +} + +function get_gigachat_token { + cat "automation/.get_token_ascii.txt" + + response=$(curl -k -s -L -X POST 'https://ngw.devices.sberbank.ru:9443/api/v2/oauth' \ + -H 'Content-Type: application/x-www-form-urlencoded' \ + -H 'Accept: application/json' \ + -H 'RqUID: '"$(uuidgen)"'' \ + -H 'Authorization: Basic '"$1"'' \ + --data-urlencode 'scope=GIGACHAT_API_PERS') + echo "$response" | sed -E 's/.*"access_token":"(.*)",.*/\1/' +} + +function run_vllm { + cat "automation/.run_vllm_ascii.txt" + + clean_env "$model_type" + + vllm_port=$(get_param_from_string "$var_values" "vllm_port") + hf_cache="/nfs/$(whoami)/hf_cache" + + export TOKEN="$HF_TOKEN" + export MODEL="$original_model_name" + export PORT="$vllm_port" + export HF_CACHE="$hf_cache" + + docker run --runtime nvidia --gpus '"device=0,1,2,3,4,5,6,7"' \ + -v "$HF_CACHE:/root/.cache/huggingface" \ + -dit \ + --name "script_vllm_container" \ + --env "HUGGING_FACE_HUB_TOKEN=${TOKEN}" \ + --env VLLM_ATTENTION_BACKEND=FLASHINFER \ + --ipc=host \ + --net=host \ + --uts=host \ + vllm/vllm-openai:latest \ + --model "$MODEL" \ + --api-key default-token \ + --dtype auto \ + --port "$PORT" + + while docker logs "script_vllm_container" | grep -q "Application startup complete"; + do + sleep 0.1; + done +} + +function clean_env { + if [[ "$1" == 'hf' ]]; + then + container_id=$(docker ps -a | grep "script_vllm_container" | cut -d " " -f1) + + if ! [ -z "$container_id" ]; + then + docker kill "$container_id" + docker rm -f "$container_id" + fi + + fi +} + +function prepare_env { + if [[ "$1" == 'gigachat' ]]; + then + GIGACHAT_TOKEN=$(get_gigachat_token "$GIGACHAT_AUTHORIZATION_DATA") + export GIGACHAT_TOKEN="$GIGACHAT_TOKEN" + + elif [[ "$1" == 'hf' ]]; + then + run_vllm + fi +} + +function gen_answer { + cat "automation/.gen_answer_ascii.txt" + python gen_answer.py +} + +function gen_judgment { + cat "automation/.gen_judgment_ascii.txt" + python gen_judgment.py +} + +function show_result { + cat "automation/.results_ascii.txt" + judge_model=$(get_param_from_string "$var_values" "judge_model") + python show_result.py --judge-name "$judge_model" +} + + + diff --git a/start.sh b/start.sh new file mode 100755 index 00000000..afdc7fd6 --- /dev/null +++ b/start.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +vars_gigachat=("OPENAI_API_KEY" "GIGACHAT_AUTHORIZATION_DATA") +vars_yandex=("OPENAI_API_KEY" "FOLDER_ID" "IAM_TOKEN") +vars_hf=("OPENAI_API_KEY" "HF_TOKEN") + +source automation/utils.sh +original_model_name=$(read_original_model_name) + +original_model_name_lowercase=$(echo "$original_model_name" | awk '{print tolower($0)}') +model_type=$(resolve_model_type "$original_model_name_lowercase") +check_export_vars "vars_${model_type}[@]" + +var_values=$(cat "automation/default_hyperparameters.txt") +var_values=$(get_var_values) + +model_alias="${original_model_name////-}-$(date '+%d-%m-%y')" +make_yaml_configs + +prepare_env "$model_type" + +gen_answer +gen_judgment +show_result "$var_values" + +clean_env "$model_type"