# TODO: automate the update of convert-hf-to-gguf.py
#

+import logging
import os
import requests
import sys
import json

from hashlib import sha256
from enum import IntEnum, auto
+from transformers import AutoTokenizer
+
+logger = logging.getLogger("convert-hf-to-gguf-update")
+

class TOKENIZER_TYPE(IntEnum):
    SPM = auto()
    BPE = auto()
    WPM = auto()

+
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
# will be updated with time - contributions welcome
chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'

if len(sys.argv) == 2:
    token = sys.argv[1]
else:
-    print("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
+    logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
    sys.exit(1)

# TODO: add models here, base models preferred
models = [
-          {"name": "llama-spm",      "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
-          {"name": "llama-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
-          {"name": "phi-3",          "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
-          {"name": "deepseek-llm",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
-          {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
-          {"name": "falcon",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
-          {"name": "bert-bge",       "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
-          {"name": "mpt",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
-          {"name": "starcoder",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
-          {"name": "gpt-2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
-         ]
+    {"name": "llama-spm",      "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
+    {"name": "llama-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
+    {"name": "phi-3",          "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
+    {"name": "deepseek-llm",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
+    {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
+    {"name": "falcon",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
+    {"name": "bert-bge",       "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
+    {"name": "mpt",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
+    {"name": "starcoder",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
+    {"name": "gpt-2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
+]

# make directory "models/tokenizers" if it doesn't exist
if not os.path.exists("models/tokenizers"):
    os.makedirs("models/tokenizers")

+
def download_file_with_auth(url, token, save_path):
    headers = {"Authorization": f"Bearer {token}"}
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        with open(save_path, 'wb') as f:
            f.write(response.content)
-        print(f"File {save_path} downloaded successfully")
+        logger.info(f"File {save_path} downloaded successfully")
    else:
-        print(f"Failed to download file. Status code: {response.status_code}")
+        logger.info(f"Failed to download file. Status code: {response.status_code}")
+

# download the tokenizer models
for model in models:
@@ -81,10 +89,10 @@ def download_file_with_auth(url, token, save_path):
    if not os.path.exists(f"models/tokenizers/{name}"):
        os.makedirs(f"models/tokenizers/{name}")
    else:
-        print(f"Directory models/tokenizers/{name} already exists - skipping")
+        logger.info(f"Directory models/tokenizers/{name} already exists - skipping")
        continue

-    print(f"Downloading {name} to models/tokenizers/{name}")
+    logger.info(f"Downloading {name} to models/tokenizers/{name}")

    url = f"{repo}/raw/main/config.json"
    save_path = f"models/tokenizers/{name}/config.json"
@@ -115,76 +123,76 @@ def download_file_with_auth(url, token, save_path):
        continue

    # create the tokenizer
-    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")

    chktok = tokenizer.encode(chktxt)
    chkhsh = sha256(str(chktok).encode()).hexdigest()

-    print(f"model: {name}")
-    print(f"tokt: {tokt}")
-    print(f"repo: {model['repo']}")
-    print(f"chktok: {chktok}")
-    print(f"chkhsh: {chkhsh}")
+    logger.info(f"model: {name}")
+    logger.info(f"tokt: {tokt}")
+    logger.info(f"repo: {model['repo']}")
+    logger.info(f"chktok: {chktok}")
+    logger.info(f"chkhsh: {chkhsh}")

    # print the "pre_tokenizer" content from the tokenizer.json
    with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
        cfg = json.load(f)
        pre_tokenizer = cfg["pre_tokenizer"]
-        print("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
+        logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))

-    print(f"\n")
+    logger.info("")

    src_ifs += f"        if chkhsh == \"{chkhsh}\":\n"
    src_ifs += f"            # ref: {model['repo']}\n"
    src_ifs += f"            res = \"{name}\"\n"

-src_func = ""
-src_func += "    def get_vocab_base_pre(self, tokenizer) -> str:\n"
-src_func += "        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that\n"
-src_func += "        # is specific for the BPE pre-tokenizer used by the model\n"
-src_func += "        # we will use this unique identifier to write a \"tokenizer.ggml.pre\" entry in the GGUF file which we can\n"
-src_func += "        # use in llama.cpp to implement the same pre-tokenizer\n"
-src_func += "\n"
-src_func += f"        chktxt = {repr(chktxt)}\n"
-src_func += "\n"
-src_func += "        chktok = tokenizer.encode(chktxt)\n"
-src_func += "        chkhsh = sha256(str(chktok).encode()).hexdigest()\n"
-src_func += "\n"
-src_func += "        print(f\"chktok: {chktok}\")\n"
-src_func += "        print(f\"chkhsh: {chkhsh}\")\n"
-src_func += "\n"
-src_func += "        res = None\n"
-src_func += "\n"
-src_func += "        # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script\n"
-src_func += "        #       or pull the latest version of the model from Huggingface\n"
-src_func += "        #       don't edit the hashes manually!\n"
-src_func += f"{src_ifs}\n"
-src_func += "        if res is None:\n"
-src_func += "            print(\"\\n\")\n"
-src_func += "            print(\"**************************************************************************************\")\n"
-src_func += "            print(\"** WARNING: The BPE pre-tokenizer was not recognized!\")\n"
-src_func += "            print(\"**          There are 2 possible reasons for this:\")\n"
-src_func += "            print(\"**          - the model has not been added to convert-hf-to-gguf-update.py yet\")\n"
-src_func += "            print(\"**          - the pre-tokenization config has changed upstream\")\n"
-src_func += "            print(\"**          Check your model files and convert-hf-to-gguf-update.py and update them accordingly.\")\n"
-src_func += "            print(\"** ref:     https://github.com/ggerganov/llama.cpp/pull/6920\")\n"
-src_func += "            print(\"**\")\n"
-src_func += "            print(f\"** chkhsh:  {chkhsh}\")\n"
-src_func += "            print(\"**************************************************************************************\")\n"
-src_func += "            print(\"\\n\")\n"
-src_func += "            raise NotImplementedError(\"BPE pre-tokenizer was not recognized - update get_vocab_base_pre()\")\n"
-src_func += "\n"
-src_func += "        print(f\"tokenizer.ggml.pre: {res}\")\n"
-src_func += "        print(f\"chkhsh: {chkhsh}\")\n"
-src_func += "\n"
-src_func += "        return res\n"
-
-print(src_func)
-
-print("\n")
-print("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
-print("\n")
+src_func = f"""
+    def get_vocab_base_pre(self, tokenizer) -> str:
+        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
+        # is specific for the BPE pre-tokenizer used by the model
+        # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
+        # use in llama.cpp to implement the same pre-tokenizer
+
+        chktxt = {repr(chktxt)}
+
+        chktok = tokenizer.encode(chktxt)
+        chkhsh = sha256(str(chktok).encode()).hexdigest()
+
+        print(f"chktok: {{chktok}}")
+        print(f"chkhsh: {{chkhsh}}")
+
+        res = None
+
+        # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script
+        #       or pull the latest version of the model from Huggingface
+        #       don't edit the hashes manually!
+{src_ifs}
+        if res is None:
+            print("\\n")
+            print("**************************************************************************************")
+            print("** WARNING: The BPE pre-tokenizer was not recognized!")
+            print("**          There are 2 possible reasons for this:")
+            print("**          - the model has not been added to convert-hf-to-gguf-update.py yet")
+            print("**          - the pre-tokenization config has changed upstream")
+            print("**          Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
+            print("** ref:     https://github.com/ggerganov/llama.cpp/pull/6920")
+            print("**")
+            print(f"** chkhsh:  {{chkhsh}}")
+            print("**************************************************************************************")
+            print("\\n")
+            raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
+
+        print(f"tokenizer.ggml.pre: {{repr(res)}}")
+        print(f"chkhsh: {{chkhsh}}")
+
+        return res
+"""
+
+print(src_func)  # noqa: NP100
+
+logger.info("\n")
+logger.info("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
+logger.info("\n")

# generate tests for each tokenizer model

@@ -250,7 +258,6 @@ def download_file_with_auth(url, token, save_path):
    tokt = model["tokt"]

    # create the tokenizer
-    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")

    with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
@@ -265,15 +272,15 @@ def download_file_with_auth(url, token, save_path):
                f.write(f" {r}")
            f.write("\n")

-    print(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")
+    logger.info(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")

# generate commands for creating vocab files

-print("\nRun the following commands to generate the vocab files for testing:\n")
+logger.info("\nRun the following commands to generate the vocab files for testing:\n")

for model in models:
    name = model["name"]

-    print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")
+    logger.info(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")

-print("\n")
+logger.info("\n")
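
For context only (not part of the diff above): the check that the generated get_vocab_base_pre() performs is simply "encode a fixed check string and hash the resulting token ids". Below is a minimal standalone sketch of that fingerprint step, assuming a tokenizer has already been downloaded by this script; the directory name "models/tokenizers/llama-bpe" and the shortened check string are placeholders, the real script uses the full chktxt defined above.

```python
# Minimal sketch (not part of this patch) of the pre-tokenizer fingerprint check
# that convert-hf-to-gguf-update.py bakes into get_vocab_base_pre().
from hashlib import sha256

from transformers import AutoTokenizer

chktxt = "example check string"  # placeholder; the script uses its full chktxt

# assumes the tokenizer files were already fetched into this directory
tokenizer = AutoTokenizer.from_pretrained("models/tokenizers/llama-bpe")

chktok = tokenizer.encode(chktxt)                   # token ids produced by the pre-tokenizer + BPE
chkhsh = sha256(str(chktok).encode()).hexdigest()   # stable fingerprint of that tokenization

print(f"chkhsh: {chkhsh}")  # compared against the per-model hashes written into convert-hf-to-gguf.py
```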