ppl_scorer_hf.py
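"""
Perplexity scorer (HF version).

Reads files of <DOC_START>/<DOC_END>-delimited documents, tokenizes them with a
SentencePiece model, groups and pads each document to a fixed context length,
runs a causal LM over the padded batches, and saves the token and loss tensors
per input file.
"""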
import os

import sentencepiece as spm
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM
from huggingface_hub import login

# Authenticate with the Hugging Face Hub (token elided).
login(token='INSERT_TOKEN_HERE')
# SentencePiece tokenizer (trained separately from the HF checkpoint below).
tokenizer_path = '/raid/nlp/pranavg/meet/IndicLLM/IndicGPT/models/ACL24_Multi_8.model'
tokenizer = spm.SentencePieceProcessor(model_file=tokenizer_path)

# Causal LM used for scoring, loaded in eval mode on the GPU.
model = AutoModelForCausalLM.from_pretrained('meetdoshi90/MiniLM-base-en-tiny_4096', trust_remote_code=True)
model = model.eval().to('cuda')
SKIP_K = 10          # unused in this script
CONTEXT_LEN = 4096   # model context length; each document is padded/truncated to this
BATCH_SIZE = 8
FOLDER = './data/syn-EN-gu/'
FOLDER_FILES = reversed(sorted(os.listdir(FOLDER)))
FOLDER_FILES = ['1.txt.gu23-eng_Latn.merged']  # hard-coded override: score only this file
print('Folder files', FOLDER_FILES)
# Output directory for the per-file token/loss tensors.
os.makedirs(FOLDER[:-1] + '-scores/', exist_ok=True)
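# Main loop: read each file, normalize document markers, tokenize, group lines
# into documents, pad to a fixed length, and score with the language model.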
for file in tqdm(FOLDER_FILES):
    print(file)
    with open(FOLDER + file, 'r', encoding="utf-8") as f:
        data = f.readlines()
    print(data[0])
    # Normalize spaced-out variants of the document boundary markers.
    for i in tqdm(range(len(data))):
        data[i] = data[i].strip()
        if data[i] == '< DOC _ START >' or data[i] == '<DOC _ START>':
            data[i] = '<DOC_START>'
        elif data[i] == '< DOC _ END >' or data[i] == '<DOC _ END>':
            data[i] = '<DOC_END>'
    # No-op copy, apparently left over from an earlier step that limited the data.
    limit_data = []
    for i in tqdm(range(len(data))):
        limit_data.append(data[i])
    data = limit_data
    limit_data = []

    # Batch-encode all lines with the SentencePiece tokenizer.
    tokenized_data = tokenizer.encode(data)
    data = []
    # Group token rows into documents, splitting on the SentencePiece encodings
    # of the '<DOC_START>' / '<DOC_END>' marker lines (the hard-coded ID lists
    # below), and wrap each document in BOS (1) / EOS (2).
    tokenized_file = []
    temp = []
    token_count = 0
    doc_count = 0
    for row in tqdm(tokenized_data):
        if row == [48554, 48860, 48682, 13096, 48766, 2806, 21873, 48808] or row == [48554, 48860, 48682, 13096, 48766, 46319, 48808]:
            if temp != []:
                tokenized_file.append([1] + temp + [2])
                doc_count += 1
            temp = []
        else:
            temp.extend(row)
            token_count += len(row)
    if temp != []:
        # Wrap the trailing document too (the original omitted BOS/EOS here).
        tokenized_file.append([1] + temp + [2])
        doc_count += 1
    print('Doc count =', doc_count)
    # Pad (with token ID 3) or truncate every document to CONTEXT_LEN + 1 tokens,
    # so each row yields exactly CONTEXT_LEN shifted prediction targets.
    temp = []
    for row in tqdm(tokenized_file):
        doc = row + [3] * max(0, CONTEXT_LEN + 1 - len(row))
        doc = doc[:CONTEXT_LEN + 1]
        assert len(doc) == CONTEXT_LEN + 1
        temp.append(doc)
    tokenized_file = torch.tensor(temp).long()
    print(tokenized_file.shape)
    temp = []
    # Score each batch without building autograd graphs. reshape(-1) keeps the
    # extend() working whether the (remote-code) model returns per-sample losses
    # or a single scalar mean loss per batch.
    loss = []
    with torch.no_grad():
        for i in tqdm(range(0, tokenized_file.shape[0], BATCH_SIZE)):
            batch_inp = tokenized_file[i:i + BATCH_SIZE, :].to('cuda')
            out = model(batch_inp, labels=batch_inp, return_dict=True)
            loss.extend(out.loss.reshape(-1).tolist())
    loss = torch.tensor(loss)
    print('loss shape', loss.shape)
    # Save the padded token tensor and the loss tensor next to the input data.
    save_name_tokens = FOLDER[:-1] + '-scores/' + file + '.tokens.pt'
    save_name_loss = FOLDER[:-1] + '-scores/' + file + '.loss.pt'
    torch.save(tokenized_file, save_name_tokens)
    torch.save(loss, save_name_loss)
    print(f'{file} done')

    # Free GPU/host memory before the next file.
    del tokenized_data, tokenized_file, loss
    torch.cuda.empty_cache()
    break  # deliberate: only the first file is processed in this run
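# A minimal follow-up sketch (hypothetical, not part of the original script):
# each saved loss entry is a mean token-level cross-entropy, so the
# per-document perplexity is its exponential.
#
#   loss = torch.load(FOLDER[:-1] + '-scores/' + '1.txt.gu23-eng_Latn.merged.loss.pt')
#   ppl = torch.exp(loss)  # per-document perplexity
#   print(ppl.mean().item(), ppl.min().item(), ppl.max().item())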