-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclean_and_sync_vocab.py
62 lines (48 loc) · 2.23 KB
/
clean_and_sync_vocab.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# clean_and_sync_vocab.py
import json
def clean_merges_file(original_merges_path, cleaned_merges_path):
    """Copy a merges.txt file, dropping comment and blank lines.

    Any line starting with '#' (e.g. the '#version' header that tokenizer
    exports prepend) and any whitespace-only line is skipped; every other
    line is written through unchanged.

    NOTE(review): this also drops a merge rule whose first token begins
    with '#' — presumably none exist in this vocab; confirm if unsure.
    """
    try:
        with open(original_merges_path, 'r', encoding='utf-8') as src:
            kept = [ln for ln in src if ln.strip() and not ln.startswith("#")]
        with open(cleaned_merges_path, 'w', encoding='utf-8') as dst:
            dst.writelines(kept)
        print(f"Cleaned merges.txt and saved to {cleaned_merges_path}")
    except Exception as e:
        print(f"Error cleaning merges.txt: {e}")
def sync_vocab_with_cleaned_merges(vocab_path, cleaned_merges_path):
    """Ensure every token referenced by the cleaned merges file exists in vocab.json.

    Loads the token -> id mapping from ``vocab_path`` and the cleaned
    merges file, collects every whitespace-separated token appearing in a
    merge rule, and appends any token missing from the vocab with the next
    free id. The updated vocab is written back to ``vocab_path`` in place.
    """
    try:
        # Load vocab.json (token -> integer id)
        with open(vocab_path, 'r', encoding='utf-8') as vocab_file:
            vocab = json.load(vocab_file)
        # Load cleaned merges.txt
        with open(cleaned_merges_path, 'r', encoding='utf-8') as merges_file:
            merges = merges_file.readlines()
        # Extract all tokens from cleaned merges.txt
        merge_tokens = set()
        for line in merges:
            merge_tokens.update(line.strip().split())
        # Tokens referenced by merges but absent from the vocab
        missing_tokens = merge_tokens - set(vocab.keys())
        # default=-1 keeps this working when vocab.json is empty
        # (max() over an empty sequence would raise ValueError).
        current_index = max(vocab.values(), default=-1) + 1
        # Sorted so id assignment is deterministic across runs; plain set
        # iteration would assign different ids on each invocation.
        for token in sorted(missing_tokens):
            vocab[token] = current_index
            current_index += 1
        # Write updated vocab.json back to file
        with open(vocab_path, 'w', encoding='utf-8') as vocab_file:
            json.dump(vocab, vocab_file, ensure_ascii=False, indent=2)
        print(f"Added {len(missing_tokens)} missing tokens to vocab.json.")
        print(f"Missing tokens added: {missing_tokens}")
    except Exception as e:
        print(f"Error updating vocab.json: {e}")
if __name__ == "__main__":
    # Paths for the converted-model tokenizer artifacts.
    merges_in = "./converted_model/merges.txt"
    merges_out = "./converted_model/cleaned_merges.txt"
    vocab_file = "./converted_model/vocab.json"
    # First strip the header/blank lines from merges.txt, then make sure
    # vocab.json contains every token those merge rules reference.
    clean_merges_file(merges_in, merges_out)
    sync_vocab_with_cleaned_merges(vocab_file, merges_out)