
Commit 22b3fab

Authored by jiyangzh, rasbt, d-kleine, casinca, and tao-qian
Pull latest from Upstream (#1)
* Add "What's next" section (rasbt#432)
* Add What's next section
* Delete appendix-D/01_main-chapter-code/appendix-D-Copy2.ipynb
* Delete ch03/01_main-chapter-code/ch03-Copy1.ipynb
* Delete appendix-D/01_main-chapter-code/appendix-D-Copy1.ipynb
* Update ch07.ipynb
* Update ch07.ipynb
* Add chapter names
* Add missing device transfer in gpt_generate.py (rasbt#436)
* Add utility to prevent double execution of certain cells (rasbt#437)
* Add flexible padding bonus experiment (rasbt#438)
* Add flexible padding bonus experiment
* fix links
* Fixed command for row 16 additional experiment (rasbt#439)
* fixed command for row 16 experiment
* Update README.md
---------
Co-authored-by: Sebastian Raschka <[email protected]>
* [minor] typo & comments (rasbt#441)
* typo & comment
  - safe -> save
  - commenting code: batch_size, seq_len = in_idx.shape
* comment
  - adding # NEW for assert num_heads % num_kv_groups == 0
* update memory wording
---------
Co-authored-by: rasbt <[email protected]>
* fix misplaced parenthesis and update license (rasbt#466)
* Minor readability improvement in dataloader.ipynb (rasbt#461)
* Minor readability improvement in dataloader.ipynb
  - The tokenizer and encoded_text variables at the root level are unused.
  - The default params for create_dataloader_v1 are confusing, especially the default batch_size 4, which happens to be the same as the max_length.
* readability improvements
---------
Co-authored-by: rasbt <[email protected]>
* typo fixed (rasbt#468)
* typo fixed
* only update plot
---------
Co-authored-by: rasbt <[email protected]>
* Add backup URL for gpt2 weights (rasbt#469)
* Add backup URL for gpt2 weights
* newline
* fix ch07 unit test (rasbt#470)
* adds no-grad context for reference model to DPO (rasbt#473)
* Auto download DPO dataset if not already available in path (rasbt#479)
* Auto download DPO dataset if not already available in path
* update tests to account for latest HF transformers release in unit tests
* pep 8
* fix reward margins plot label in dpo nb
* Print out embeddings for more illustrative learning (rasbt#481)
* print out embeddings for illustrative learning
* suggestion: print embedding contents
---------
Co-authored-by: rasbt <[email protected]>
* Include mathematical breakdown for exercise solution 4.1 (rasbt#483)
* 04_optional-aws-sagemaker-notebook (rasbt#451)
* 04_optional-aws-sagemaker-notebook
* Update setup/04_optional-aws-sagemaker-notebook/cloudformation-template.yml
* Update README.md
---------
Co-authored-by: Sebastian Raschka <[email protected]>
* Implementing the BPE Tokenizer from Scratch (rasbt#487)
* BPE: fixed typo (rasbt#492)
* fixed typo
* use rel path if exists
* mod gitignore and use existing vocab files
---------
Co-authored-by: rasbt <[email protected]>
* fix: preserve newline tokens in BPE encoder (rasbt#495)
* fix: preserve newline tokens in BPE encoder
* further fixes
* more fixes
---------
Co-authored-by: rasbt <[email protected]>
* add GPT2TokenizerFast to BPE comparison (rasbt#498)
* added HF BPE Fast
* update benchmarks
* add note about performance
* revert accidental changes
---------
Co-authored-by: rasbt <[email protected]>
* Bonus material: extending tokenizers (rasbt#496)
* Bonus material: extending tokenizers
* small wording update
* Test for PyTorch 2.6 release candidate (rasbt#500)
* Test for PyTorch 2.6 release candidate
* update
* update
* remove extra added file
* A few cosmetic updates (rasbt#504)
* Fix default argument in ex 7.2 (rasbt#506)
* Alternative weight loading via .safetensors (rasbt#507)
* Test PyTorch nightly releases (rasbt#509)
---------
Co-authored-by: Sebastian Raschka <[email protected]>
Co-authored-by: Daniel Kleine <[email protected]>
Co-authored-by: casinca <[email protected]>
Co-authored-by: Tao Qian <[email protected]>
Co-authored-by: QS <[email protected]>
Co-authored-by: Henry Shi <[email protected]>
Co-authored-by: rvaneijk <[email protected]>
Co-authored-by: Austin Welch <[email protected]>
1 parent 1183fd7 commit 22b3fab

45 files changed: +4115 / -451 lines. Large diffs and some files are hidden by default and are not rendered below.
New GitHub Actions workflow: "Test latest PyTorch nightly / release candidate" (new file, +52)

@@ -0,0 +1,52 @@
+name: Test latest PyTorch nightly / release candidate
+on:
+  push:
+    branches: [ main ]
+    paths:
+      - '**/*.py' # Run workflow for changes in Python files
+      - '**/*.ipynb'
+      - '**/*.yaml'
+      - '**/*.yml'
+      - '**/*.sh'
+  pull_request:
+    branches: [ main ]
+    paths:
+      - '**/*.py'
+      - '**/*.ipynb'
+      - '**/*.yaml'
+      - '**/*.yml'
+      - '**/*.sh'
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.10"
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install pytest nbval
+          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+          pip install -r ch05/07_gpt_to_llama/tests/test-requirements-extra.txt
+          pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
+
+      - name: Test Selected Python Scripts
+        run: |
+          pytest setup/02_installing-python-libraries/tests.py
+          pytest ch04/01_main-chapter-code/tests.py
+          pytest ch05/01_main-chapter-code/tests.py
+          pytest ch05/07_gpt_to_llama/tests/tests.py
+          pytest ch06/01_main-chapter-code/tests.py
+
+      - name: Validate Selected Jupyter Notebooks
+        run: |
+          pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb
+          pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
+          pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb

.github/workflows/check-links.yml (+1, -1)

@@ -29,6 +29,6 @@ jobs:
 
       - name: Check links
        run: |
-          pytest --check-links ./ --check-links-ignore "https://platform.openai.com/*" --check-links-ignore "https://openai.com/*" --check-links-ignore "https://arena.lmsys.org" --check-links-ignore "https://www.reddit.com/r/*" --check-links-ignore "https://code.visualstudio.com/*" --check-links-ignore https://arxiv.org/* --check-links-ignore "https://ai.stanford.edu/~amaas/data/sentiment/"
+          pytest --check-links ./ --check-links-ignore "https://platform.openai.com/*" --check-links-ignore "https://openai.com/*" --check-links-ignore "https://arena.lmsys.org" --check-links-ignore https://unsloth.ai/blog/gradient --check-links-ignore "https://www.reddit.com/r/*" --check-links-ignore "https://code.visualstudio.com/*" --check-links-ignore https://arxiv.org/* --check-links-ignore "https://ai.stanford.edu/~amaas/data/sentiment/"
           # pytest --check-links ./ --check-links-ignore "https://platform.openai.com/*" --check-links-ignore "https://arena.lmsys.org" --retries 2 --retry-delay 5
 

.gitignore (+8)

@@ -31,6 +31,7 @@ appendix-E/01_main-chapter-code/gpt2
 
 ch05/01_main-chapter-code/gpt2/
 ch05/02_alternative_weight_loading/checkpoints
+ch05/02_alternative_weight_loading/*.safetensors
 ch05/01_main-chapter-code/model.pth
 ch05/01_main-chapter-code/model_and_optimizer.pth
 ch05/03_bonus_pretraining_on_gutenberg/model_checkpoints

@@ -101,6 +102,13 @@ ch07/02_dataset-utilities/instruction-examples-modified.json
 ch07/04_preference-tuning-with-dpo/gpt2-medium355M-sft.pth
 ch07/04_preference-tuning-with-dpo/loss-plot.pdf
 
+# Tokenizer files
+ch02/05_bpe-from-scratch/bpe_merges.txt
+ch02/05_bpe-from-scratch/encoder.json
+ch02/05_bpe-from-scratch/vocab.bpe
+ch02/05_bpe-from-scratch/vocab.json
+
+
 # Other
 ch0?/0?_user_interface/.chainlit/
 ch0?/0?_user_interface/chainlit.md

LICENSE.txt (+1, -1)

@@ -189,7 +189,7 @@
       same "printed page" as the copyright notice for easier
       identification within third-party archives.
 
-   Copyright 2023-2024 Sebastian Raschka
+   Copyright 2023-2025 Sebastian Raschka
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.

README.md (+8, -6)

@@ -101,16 +101,17 @@ Several folders contain optional materials as a bonus for interested readers:
   - [Python Setup Tips](setup/01_optional-python-setup-preferences)
   - [Installing Python Packages and Libraries Used In This Book](setup/02_installing-python-libraries)
   - [Docker Environment Setup Guide](setup/03_optional-docker-environment)
-- **Chapter 2:**
+- **Chapter 2: Working with text data**
+  - [Byte Pair Encoding (BPE) Tokenizer From Scratch](ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb)
   - [Comparing Various Byte Pair Encoding (BPE) Implementations](ch02/02_bonus_bytepair-encoder)
   - [Understanding the Difference Between Embedding Layers and Linear Layers](ch02/03_bonus_embedding-vs-matmul)
   - [Dataloader Intuition with Simple Numbers](ch02/04_bonus_dataloader-intuition)
-- **Chapter 3:**
+- **Chapter 3: Coding attention mechanisms**
  - [Comparing Efficient Multi-Head Attention Implementations](ch03/02_bonus_efficient-multihead-attention/mha-implementations.ipynb)
  - [Understanding PyTorch Buffers](ch03/03_understanding-buffers/understanding-buffers.ipynb)
-- **Chapter 4:**
+- **Chapter 4: Implementing a GPT model from scratch**
  - [FLOPS Analysis](ch04/02_performance-analysis/flops-analysis.ipynb)
-- **Chapter 5:**
+- **Chapter 5: Pretraining on unlabeled data:**
  - [Alternative Weight Loading from Hugging Face Model Hub using Transformers](ch05/02_alternative_weight_loading/weight-loading-hf-transformers.ipynb)
  - [Pretraining GPT on the Project Gutenberg Dataset](ch05/03_bonus_pretraining_on_gutenberg)
  - [Adding Bells and Whistles to the Training Loop](ch05/04_learning_rate_schedulers)

@@ -119,11 +120,12 @@ Several folders contain optional materials as a bonus for interested readers:
  - [Converting GPT to Llama](ch05/07_gpt_to_llama)
  - [Llama 3.2 From Scratch](ch05/07_gpt_to_llama/standalone-llama32.ipynb)
  - [Memory-efficient Model Weight Loading](ch05/08_memory_efficient_weight_loading/memory-efficient-state-dict.ipynb)
+  - [Extending the Tiktoken BPE Tokenizer with New Tokens](ch05/09_extending-tokenizers/extend-tiktoken.ipynb)
-- **Chapter 6:**
+- **Chapter 6: Finetuning for classification**
  - [Additional experiments finetuning different layers and using larger models](ch06/02_bonus_additional-experiments)
  - [Finetuning different models on 50k IMDB movie review dataset](ch06/03_bonus_imdb-classification)
  - [Building a User Interface to Interact With the GPT-based Spam Classifier](ch06/04_user_interface)
-- **Chapter 7:**
+- **Chapter 7: Finetuning to follow instructions**
  - [Dataset Utilities for Finding Near Duplicates and Creating Passive Voice Entries](ch07/02_dataset-utilities)
  - [Evaluating Instruction Responses Using the OpenAI API and Ollama](ch07/03_model-evaluation)
  - [Generating a Dataset for Instruction Finetuning](ch07/05_dataset-generation/llama3-ollama.ipynb)

appendix-D/01_main-chapter-code/appendix-D.ipynb (+31, -32): large diff not rendered by default.

appendix-E/01_main-chapter-code/gpt_download.py (+33, -18)

@@ -23,6 +23,7 @@ def download_and_load_gpt2(model_size, models_dir):
     # Define paths
     model_dir = os.path.join(models_dir, model_size)
     base_url = "https://openaipublic.blob.core.windows.net/gpt-2/models"
+    backup_base_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/gpt2"
     filenames = [
         "checkpoint", "encoder.json", "hparams.json",
         "model.ckpt.data-00000-of-00001", "model.ckpt.index",

@@ -33,8 +34,9 @@ def download_and_load_gpt2(model_size, models_dir):
     os.makedirs(model_dir, exist_ok=True)
     for filename in filenames:
         file_url = os.path.join(base_url, model_size, filename)
+        backup_url = os.path.join(backup_base_url, model_size, filename)
         file_path = os.path.join(model_dir, filename)
-        download_file(file_url, file_path)
+        download_file(file_url, file_path, backup_url)
 
     # Load settings and params
     tf_ckpt_path = tf.train.latest_checkpoint(model_dir)

@@ -44,11 +46,9 @@ def download_and_load_gpt2(model_size, models_dir):
     return settings, params
 
 
-def download_file(url, destination):
-    # Send a GET request to download the file
-
-    try:
-        with urllib.request.urlopen(url) as response:
+def download_file(url, destination, backup_url=None):
+    def _attempt_download(download_url):
+        with urllib.request.urlopen(download_url) as response:
             # Get the total file size from headers, defaulting to 0 if not present
             file_size = int(response.headers.get("Content-Length", 0))
 

@@ -57,29 +57,44 @@ def download_file(url, destination):
                 file_size_local = os.path.getsize(destination)
                 if file_size == file_size_local:
                     print(f"File already exists and is up-to-date: {destination}")
-                    return
+                    return True  # Indicate success without re-downloading
 
-            # Define the block size for reading the file
             block_size = 1024  # 1 Kilobyte
 
             # Initialize the progress bar with total file size
-            progress_bar_description = os.path.basename(url)  # Extract filename from URL
+            progress_bar_description = os.path.basename(download_url)
             with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
-                # Open the destination file in binary write mode
                 with open(destination, "wb") as file:
-                    # Read the file in chunks and write to destination
                     while True:
                         chunk = response.read(block_size)
                         if not chunk:
                             break
                         file.write(chunk)
-                        progress_bar.update(len(chunk))  # Update progress bar
-    except urllib.error.HTTPError:
-        s = (
-            f"The specified URL ({url}) is incorrect, the internet connection cannot be established,"
-            "\nor the requested file is temporarily unavailable.\nPlease visit the following website"
-            " for help: https://github.com/rasbt/LLMs-from-scratch/discussions/273")
-        print(s)
+                        progress_bar.update(len(chunk))
+        return True
+
+    try:
+        if _attempt_download(url):
+            return
+    except (urllib.error.HTTPError, urllib.error.URLError):
+        if backup_url is not None:
+            print(f"Primary URL ({url}) failed. Attempting backup URL: {backup_url}")
+            try:
+                if _attempt_download(backup_url):
+                    return
+            except urllib.error.HTTPError:
+                pass
+
+        # If we reach here, both attempts have failed
+        error_message = (
+            f"Failed to download from both primary URL ({url})"
+            f"{' and backup URL (' + backup_url + ')' if backup_url else ''}."
+            "\nCheck your internet connection or the file availability.\n"
+            "For help, visit: https://github.com/rasbt/LLMs-from-scratch/discussions/273"
+        )
+        print(error_message)
+    except Exception as e:
+        print(f"An unexpected error occurred: {e}")
 
 
 # Alternative way using `requests`
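For orientation, this change keeps the public call site of the weight-download helper unchanged; only the fallback behavior inside download_file is new. The snippet below is an illustrative usage sketch, not part of the commit: it assumes gpt_download.py is importable as a module, and the "124M" model size and "gpt2" target directory are example values.

```python
# Illustrative usage sketch (not part of this commit).
# The backup URL is tried automatically if the primary download fails.
from gpt_download import download_and_load_gpt2  # assumes gpt_download.py is on the path

settings, params = download_and_load_gpt2(model_size="124M", models_dir="gpt2")
print(settings)        # hyperparameters read from hparams.json
print(params.keys())   # pretrained GPT-2 weight arrays
```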

ch02/01_main-chapter-code/ch02.ipynb (+19, -5)

@@ -1788,7 +1788,10 @@
    ],
    "source": [
     "token_embeddings = token_embedding_layer(inputs)\n",
-    "print(token_embeddings.shape)"
+    "print(token_embeddings.shape)\n",
+    "\n",
+    "# uncomment & execute the following line to see how the embeddings look like\n",
+    "# print(token_embedding)"
    ]
   },
   {

@@ -1807,7 +1810,10 @@
    "outputs": [],
    "source": [
     "context_length = max_length\n",
-    "pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)"
+    "pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)\n",
+    "\n",
+    "# uncomment & execute the following line to see how the embedding layer weights look like\n",
+    "# print(pos_embedding_layer.weight)"
    ]
   },
   {

@@ -1826,7 +1832,10 @@
    ],
    "source": [
     "pos_embeddings = pos_embedding_layer(torch.arange(max_length))\n",
-    "print(pos_embeddings.shape)"
+    "print(pos_embeddings.shape)\n",
+    "\n",
+    "# uncomment & execute the following line to see how the embeddings look like\n",
+    "# print(pos_embeddings)"
    ]
   },
   {

@@ -1853,7 +1862,10 @@
    ],
    "source": [
     "input_embeddings = token_embeddings + pos_embeddings\n",
-    "print(input_embeddings.shape)"
+    "print(input_embeddings.shape)\n",
+    "\n",
+    "# uncomment & execute the following line to see how the embeddings look like\n",
+    "# print(input_embeddings)"
    ]
   },
   {

@@ -1888,7 +1900,9 @@
    "source": [
     "See the [./dataloader.ipynb](./dataloader.ipynb) code notebook, which is a concise version of the data loader that we implemented in this chapter and will need for training the GPT model in upcoming chapters.\n",
     "\n",
-    "See [./exercise-solutions.ipynb](./exercise-solutions.ipynb) for the exercise solutions."
+    "See [./exercise-solutions.ipynb](./exercise-solutions.ipynb) for the exercise solutions.\n",
+    "\n",
+    "See the [Byte Pair Encoding (BPE) Tokenizer From Scratch](../02_bonus_bytepair-encoder/compare-bpe-tiktoken.ipynb) notebook if you are interested in learning how the GPT-2 tokenizer can be implemented and trained from scratch."
    ]
   }
  ],
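The notebook cells touched above add commented-out print statements so readers can inspect the embedding tensors themselves, not just their shapes. As a rough standalone sketch of what those tensors look like dimensionally (the random token IDs below stand in for the dataloader batch used in the notebook; vocab_size, output_dim, max_length, and the batch size of 8 match the chapter's values):

```python
import torch

torch.manual_seed(123)
vocab_size, output_dim, max_length, batch_size = 50257, 256, 4, 8

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
pos_embedding_layer = torch.nn.Embedding(max_length, output_dim)

# Stand-in for the token-ID batch the notebook gets from its dataloader
inputs = torch.randint(0, vocab_size, (batch_size, max_length))

token_embeddings = token_embedding_layer(inputs)                # shape: [8, 4, 256]
pos_embeddings = pos_embedding_layer(torch.arange(max_length))  # shape: [4, 256]

# Positional embeddings broadcast over the batch dimension
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)                                   # torch.Size([8, 4, 256])
```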

ch02/01_main-chapter-code/dataloader.ipynb (+10, -7)

@@ -103,8 +103,8 @@
     "        return self.input_ids[idx], self.target_ids[idx]\n",
     "\n",
     "\n",
-    "def create_dataloader_v1(txt, batch_size=4, max_length=256, \n",
-    "                         stride=128, shuffle=True, drop_last=True, num_workers=0):\n",
+    "def create_dataloader_v1(txt, batch_size, max_length, stride,\n",
+    "                         shuffle=True, drop_last=True, num_workers=0):\n",
     "    # Initialize the tokenizer\n",
     "    tokenizer = tiktoken.get_encoding(\"gpt2\")\n",
     "\n",

@@ -121,9 +121,6 @@
     "with open(\"the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n",
     "    raw_text = f.read()\n",
     "\n",
-    "tokenizer = tiktoken.get_encoding(\"gpt2\")\n",
-    "encoded_text = tokenizer.encode(raw_text)\n",
-    "\n",
     "vocab_size = 50257\n",
     "output_dim = 256\n",
     "context_length = 1024\n",

@@ -132,8 +129,14 @@
     "token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)\n",
     "pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)\n",
     "\n",
+    "batch_size = 8\n",
     "max_length = 4\n",
-    "dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=max_length, stride=max_length)"
+    "dataloader = create_dataloader_v1(\n",
+    "    raw_text,\n",
+    "    batch_size=batch_size,\n",
+    "    max_length=max_length,\n",
+    "    stride=max_length\n",
+    ")"
    ]
   },
   {

@@ -189,7 +192,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.11.4"
   }
  },
 "nbformat": 4,

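As a quick sanity check of the reworked cell, the dataloader built by create_dataloader_v1 yields (input, target) pairs of token-ID tensors, as the __getitem__ method above shows. The iteration below is an illustrative sketch, not part of the commit, and assumes the cell above has been executed:

```python
# Illustrative only: fetch one batch from the dataloader configured above
inputs, targets = next(iter(dataloader))
print(inputs.shape, targets.shape)  # expected: torch.Size([8, 4]) torch.Size([8, 4])
```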