From 08c75fbee18bfa1da071322f4d2f376a5f924c64 Mon Sep 17 00:00:00 2001
From: P01son6415
Date: Sun, 3 Dec 2023 22:07:49 +0800
Subject: [PATCH] Add a LLaMA TencentPretrain-to-HuggingFace conversion script
 and an instruction template for SFT data

---
 ...ama_from_tencentpretrain_to_huggingface.py | 122 ++++++++++++++++++
 tencentpretrain/utils/dataset.py              |   2 +-
 2 files changed, 123 insertions(+), 1 deletion(-)
 create mode 100644 scripts/convert_llama_from_tencentpretrain_to_huggingface.py

diff --git a/scripts/convert_llama_from_tencentpretrain_to_huggingface.py b/scripts/convert_llama_from_tencentpretrain_to_huggingface.py
new file mode 100644
index 0000000..dc1587a
--- /dev/null
+++ b/scripts/convert_llama_from_tencentpretrain_to_huggingface.py
@@ -0,0 +1,122 @@
+import argparse
+import collections
+import json
+import os
+
+import torch
+
+
+parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser.add_argument("--input_model_path", type=str, default="models/llama-7b.bin",
+                    help="Path to the TencentPretrain checkpoint file.")
+parser.add_argument("--output_model_path", type=str, default="models/llama-7b/",
+                    help="Directory to write the Hugging Face shards and weight index to.")
+parser.add_argument("--type", choices=["3B", "7B", "13B", "33B", "65B", "70B"], default="7B",
+                    help="Model size, used to look up layer, width, and head counts.")
+
+args = parser.parse_args()
+
+input_model = torch.load(args.input_model_path, map_location="cpu")
+
+output_model = collections.OrderedDict()
+
+# Number of layers, hidden size, and attention heads for each model size.
+model_config = {"3B" : [26, 3200, 32],
+                "7B" : [32, 4096, 32],
+                "13B": [40, 5120, 40],
+                "33B": [60, 6656, 52],
+                "65B": [80, 8192, 64],
+                "70B": [80, 8192, 64]
+                }
+
+layers_num, dim, n_heads = model_config[args.type]
+
+# 70B uses grouped-query attention with 8 key/value heads, so its k/v
+# projections are (dim2, dim) with dim2 < dim; every other size has one
+# key/value head per query head.
+if args.type == "70B":
+    kv_heads = 8
+    dim2 = dim // 8
+else:
+    kv_heads = n_heads
+    dim2 = dim
+
+dims_per_head = dim // n_heads
+
+# TencentPretrain applies rotary embeddings to interleaved channel pairs,
+# while Hugging Face's LLaMA rotates the first and second halves of each
+# head, so the rows of the q/k projection matrices must be reordered.
+def permute_q(w):
+    return w.reshape(n_heads, dims_per_head // 2, 2, dim).transpose(1, 2).reshape(dim, dim)
+
+def permute_k(w):
+    return w.reshape(kv_heads, dims_per_head // 2, 2, dim).transpose(1, 2).reshape(dim2, dim)
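+
+# A cheap self-check of the permutation shapes (a sketch, not needed for the
+# conversion itself): nn.Linear stores weights as (out_features, in_features),
+# so q must stay (dim, dim) and k must stay (dim2, dim) after reordering.
+assert permute_q(torch.zeros(dim, dim)).shape == (dim, dim)
+assert permute_k(torch.zeros(dim2, dim)).shape == (dim2, dim)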
+ str(i) + ".layer_norm_2.weight"] + + output_model["model.layers." + str(i) + ".self_attn.rotary_emb.inv_freq"] = inv_freq + +output_model["model.norm.weight"] = input_model["encoder.layer_norm.weight"] +output_model["lm_head.weight"] = input_model["target.lm.output_layer.weight"] + +os.system('mkdir ' + args.output_model_path) + + +byte_size = 10 * 500000000 + +param_count, file_count, filename_count = 0, 0, 0 +index_dict = {"weight_map": {}} + +state_dict = collections.OrderedDict() +filename = f"pytorch_model-0.bin" +for k, v in output_model.items(): + state_dict[k] = v.bfloat16() + index_dict["weight_map"][k] = filename + param_count += v.numel() + file_count += v.numel() + if file_count > byte_size: + torch.save(state_dict, os.path.join(args.output_model_path, filename)) + state_dict = collections.OrderedDict() + filename_count += 1 + filename = f"pytorch_model-"+str(filename_count)+".bin" + file_count = 0 + +if len(state_dict) > 0: + torch.save(state_dict, os.path.join(args.output_model_path, filename)) + +index_dict["metadata"] = {"total_size": param_count * 2} +with open(os.path.join(args.output_model_path, "pytorch_model.bin.index.json"), "w") as f: + json.dump(index_dict, f) + diff --git a/tencentpretrain/utils/dataset.py b/tencentpretrain/utils/dataset.py index 462fe08..ce42ac3 100755 --- a/tencentpretrain/utils/dataset.py +++ b/tencentpretrain/utils/dataset.py @@ -1010,7 +1010,7 @@ def worker(self, proc_id, start, end): input = data.get("input", "").replace('\\n', '\n') output = data.get("output", "").replace('\\n', '\n') - document_input = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(instruction + input)) + document_input = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(instruction + " ### Instruction:" + input + " ### Response:")) document_output = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(output)) src = [self.vocab.get(CLS_TOKEN)] + document_input