Skip to content

Commit

Permalink
Fix unnatural tokenizations if possible
Browse files (browse the repository at this point in the history)
  • Loading branch information
KlaudiaTH committed Nov 8, 2023
1 parent b05c53d commit 3e5377f
Showing 1 changed file with 14 additions and 1 deletion.
15 changes: 14 additions & 1 deletion lm_eval/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,20 @@ def loglikelihood(self, requests):
context_enc = self.tok_encode(context)

continuation_enc = self.tok_encode(continuation)
# continuation_enc = self.tok_encode(continuation, is_continuation=True)
ctx_cont_enc = self.tok_encode(context + continuation)

if context_enc + continuation_enc != ctx_cont_enc:
if ctx_cont_enc[: len(context_enc)] == context_enc:
# context_enc is a prefix of the joint encoding, so only continuation_enc disagrees with it; take the remaining joint tokens as the continuation
continuation_enc = ctx_cont_enc[len(context_enc) :]
elif ctx_cont_enc[-len(continuation_enc) :] == continuation_enc:
# continuation_enc is a suffix of the joint encoding, so only context_enc disagrees with it; take the preceding joint tokens as the context
context_enc = ctx_cont_enc[: -len(continuation_enc)]
else:
# Neither separate encoding lines up with the joint encoding; keep both unchanged and only warn
print(
f"WARNING: Unnatural tokenization of concatenated context ...{repr(context[-20:])} and continuation {repr(continuation)}"
)

new_reqs.append(((context, continuation), context_enc, continuation_enc))

Expand Down

0 comments on commit 3e5377f

Please sign in to comment.