Skip to content

Commit

Permalink
Don't fix unnatural tokenization if context is empty
Browse files: browse the repository at this point in the history
  • Loading branch information
KlaudiaTH committed Nov 8, 2023
1 parent 3e5377f commit 0e2749c
Showing 1 changed file with 16 additions and 16 deletions.
32 changes: 16 additions & 16 deletions lm_eval/base.py
Original file line number Diff line number Diff line change
@@ -172,27 +172,27 @@ def _model_call(self, inps):
def loglikelihood(self, requests):
new_reqs = []
for context, continuation in requests:
continuation_enc = self.tok_encode(continuation)

if context == "":
# end of text as context
context_enc = [self.eot_token_id]
else:
context_enc = self.tok_encode(context)

continuation_enc = self.tok_encode(continuation)
ctx_cont_enc = self.tok_encode(context + continuation)

if context_enc + continuation_enc != ctx_cont_enc:
if ctx_cont_enc[: len(context_enc)] == context_enc:
# continuation_enc is incorrect and context_enc is correct
continuation_enc = ctx_cont_enc[len(context_enc) :]
elif ctx_cont_enc[-len(continuation_enc) :] == continuation_enc:
# continuation_enc is correct and context_enc is incorrect
context_enc = ctx_cont_enc[: -len(continuation_enc)]
else:
# Both are incorrect
print(
f"WARNING: Unnatural tokenization of concatenated context ...{repr(context[-20:])} and continuation {repr(continuation)}"
)
ctx_cont_enc = self.tok_encode(context + continuation)

if context_enc + continuation_enc != ctx_cont_enc:
if ctx_cont_enc[: len(context_enc)] == context_enc:
# continuation_enc is incorrect and context_enc is correct
continuation_enc = ctx_cont_enc[len(context_enc) :]
elif ctx_cont_enc[-len(continuation_enc) :] == continuation_enc:
# continuation_enc is correct and context_enc is incorrect
context_enc = ctx_cont_enc[: -len(continuation_enc)]
else:
# Both are incorrect
print(
f"WARNING: Unnatural tokenization of concatenated context ...{repr(context[-20:])} and continuation {repr(continuation)}"
)

new_reqs.append(((context, continuation), context_enc, continuation_enc))

Expand Down

0 comments on commit 0e2749c

Please sign in to comment.