Commit 01be5a4

Use more recent sentencepiece tokenizer API (#696)
1 parent bcfdbd7 commit 01be5a4

File tree

1 file changed: +7 additions, -45 deletions

ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb

Lines changed: 7 additions & 45 deletions
@@ -83,9 +83,9 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"huggingface_hub version: 0.24.7\n",
+"huggingface_hub version: 0.33.0\n",
 "sentencepiece version: 0.2.0\n",
-"torch version: 2.4.1+cu121\n"
+"torch version: 2.6.0\n"
 ]
 }
 ],
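
This first hunk only refreshes the package versions recorded in the notebook's cell output (huggingface_hub 0.24.7 to 0.33.0, torch 2.4.1+cu121 to 2.6.0). For context, a version check that produces output of this shape can be written as below; the exact package list is an assumption, since the source cell is not part of this diff.

# Minimal sketch of a package-version check (package list assumed, not shown in this diff)
from importlib.metadata import version

for pkg in ("huggingface_hub", "sentencepiece", "torch"):
    print(f"{pkg} version: {version(pkg)}")
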
@@ -1097,18 +1097,7 @@
 "id": "3357a230-b678-4691-a238-257ee4e80185",
 "outputId": "768ed6af-ce14-40bc-ca18-117b4b448269"
 },
-"outputs": [
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.\n",
-"Token is valid (permission: read).\n",
-"Your token has been saved to /root/.cache/huggingface/token\n",
-"Login successful\n"
-]
-}
-],
+"outputs": [],
 "source": [
 "from huggingface_hub import login\n",
 "import json\n",
@@ -1155,34 +1144,7 @@
 "id": "69714ea8-b9b8-4687-8392-f3abb8f93a32",
 "outputId": "c230fec9-5c71-4a41-90ab-8a34d114ea01"
 },
-"outputs": [
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n",
-"The secret `HF_TOKEN` does not exist in your Colab secrets.\n",
-"To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n",
-"You will be able to reuse this secret in all of your notebooks.\n",
-"Please note that authentication is recommended but still optional to access public models or datasets.\n",
-" warnings.warn(\n"
-]
-},
-{
-"data": {
-"application/vnd.jupyter.widget-view+json": {
-"model_id": "e6c75a6aa7b942fe84160e286e3acb3d",
-"version_major": 2,
-"version_minor": 0
-},
-"text/plain": [
-"tokenizer.model: 0%| | 0.00/500k [00:00<?, ?B/s]"
-]
-},
-"metadata": {},
-"output_type": "display_data"
-}
-],
+"outputs": [],
 "source": [
 "from huggingface_hub import hf_hub_download\n",
 "\n",
@@ -1222,10 +1184,10 @@
 " self.tokenizer = sp\n",
 "\n",
 " def encode(self, text):\n",
-" return self.tokenizer.encode_as_ids(text)\n",
+" return self.tokenizer.encode(text, out_type=int)\n",
 "\n",
 " def decode(self, ids):\n",
-" return self.tokenizer.decode_pieces(ids)\n",
+" return self.tokenizer.decode(ids)\n",
 "\n",
 "\n",
 "tokenizer = LlamaTokenizer(tokenizer_file)"
@@ -1258,7 +1220,7 @@
 "output_type": "stream",
 "text": [
 "Output text:\n",
-" Every effort movesαllRadius deletingpretcc否']; future eer napulate lackус während inter DES издаSchéon로жа Bass differencespadxsnu ;; ctx始\n"
+" Every effort movesαllRadius deletingpretcc否']; future eer napulate lackус während inter DES издаSchéonkkarto Оryptato#{ningproof eerbye\n"
 ]
 }
 ],
