"""
data/data.py

This file handles data processing, including reading raw text,
splitting into train/validation sets, tokenizing (char-level or GPT-2),
and saving to binary files.
"""

import os
import math
import pickle
import numpy as np
import tiktoken
from multiprocessing import Pool, cpu_count

# We import our global config and integer type definition
from config.default import DEFAULT_CONFIG, IntegerTypes


def get_chunks(text, n):
    """
    Splits the text into 'n' roughly equal chunks for parallel processing.
    :param text: The full text string to split.
    :param n: The number of chunks to split into.
    :return: A list of text chunks.
    """
    chunk_size = math.ceil(len(text) / n)
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

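# Example (illustrative): get_chunks("abcdefghij", 3) -> ["abcd", "efgh", "ij"];
# chunk_size = ceil(10 / 3) = 4, so the last chunk may be shorter and fewer
# than 'n' chunks can come back for very short inputs.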

def get_unique_chars(text):
    """
    Returns a set of unique characters found in the text.
    :param text: A string from which to collect unique characters.
    :return: A set of unique characters.
    """
    return set(text)

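# Example (illustrative): get_unique_chars("banana") -> {'a', 'b', 'n'}
# (set order is arbitrary).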

def encode_text_chunk(chunk, stoi):
    """
    Encodes a chunk of text at the character level using the 'stoi' dictionary.
    :param chunk: A substring of text.
    :param stoi: A dict mapping characters to their integer IDs.
    :return: A list of integer IDs representing the chunk.
    """
    return [stoi.get(ch, 0) for ch in chunk]

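# Example (illustrative): encode_text_chunk("aba", {"a": 0, "b": 1}) -> [0, 1, 0].
# Characters missing from 'stoi' fall back to ID 0, silently aliasing them to
# whichever character owns that ID.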

def encode_gpt2_chunk(chunk, tokenizer):
    """
    Encodes a chunk of text using the GPT-2 tokenizer.
    :param chunk: A substring of text.
    :param tokenizer: A GPT-2 tokenizer from 'tiktoken'.
    :return: A list of token IDs.
    """
    return tokenizer.encode(chunk, allowed_special={"<|endoftext|>"})

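# Example (illustrative): with enc = tiktoken.get_encoding("gpt2"),
# encode_gpt2_chunk("hello <|endoftext|>", enc) keeps "<|endoftext|>" as the
# single special token 50256; without 'allowed_special', tiktoken raises an
# error when special-token text appears in the input.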

def process_data(
    input_text="",
    input_dir="",
    raw_data_dir=DEFAULT_CONFIG["data_process"]["raw_data_dir"],
    processed_data_dir=DEFAULT_CONFIG["data_process"]["processed_data_dir"],
    train_split_ratio=DEFAULT_CONFIG["data_process"]["train_split_ratio"],
    no_validation=DEFAULT_CONFIG["data_process"]["no_validation"],
    use_gpt2_tokenizer=DEFAULT_CONFIG["data_process"]["use_gpt2_tokenizer"],
    num_proc=DEFAULT_CONFIG["data_process"]["num_proc"]
):
    """
    Splits the data into train and val sets (unless 'no_validation' is True),
    supports either GPT-2 or char-level tokenization, and uses multiprocessing
    to encode large datasets.

    :param input_text: Directly provided text to process.
    :param input_dir: Directory containing .txt files if no direct text is given.
    :param raw_data_dir: Where to store the merged raw text.
    :param processed_data_dir: Where to store the processed binary data (train.bin, val.bin).
    :param train_split_ratio: The fraction of data to allocate for training.
    :param no_validation: If True, skip creating a validation set.
    :param use_gpt2_tokenizer: Whether to use the GPT-2 tokenizer or char-level encoding.
    :param num_proc: Number of processes for parallel encoding.
    :return: A dict describing the processed data, including processed_data_dir,
             vocab_size, train_size, and optionally val_size.
    """
    os.makedirs(raw_data_dir, exist_ok=True)
    os.makedirs(processed_data_dir, exist_ok=True)

    data = ""
    # Priority 1: use 'input_text' if provided
    if input_text.strip():
        data = input_text
    # Priority 2: if 'input_dir' is specified, read .txt files from it
    elif input_dir.strip():
        # Sort for a deterministic merge order across platforms
        txt_files = sorted(f for f in os.listdir(input_dir) if f.endswith('.txt'))
        for file_name in txt_files:
            file_path = os.path.join(input_dir, file_name)
            with open(file_path, 'r', encoding='utf-8') as f:
                data += f.read()
    else:
        raise ValueError("No text input or directory provided.")

    # Save raw text for reference
    raw_text_file = os.path.join(raw_data_dir, 'merged_input.txt')
    with open(raw_text_file, 'w', encoding='utf-8') as f:
        f.write(data)

    # Estimate data size in MB to decide whether to use multiprocessing
    data_size = len(data.encode('utf-8')) / (1024 * 1024)
    suggested_proc = min(num_proc, cpu_count())
    # For data under ~100 MB, multiple processes are usually overkill
    actual_proc = suggested_proc if data_size > 100 else 1
    print(f"Using {actual_proc} process(es) for data processing.")

    # ------------------------------
    # GPT-2 Tokenization Workflow
    # ------------------------------
    if use_gpt2_tokenizer:
        enc = tiktoken.get_encoding("gpt2")
        vocab_size = enc.n_vocab

        # Parallel or single-process encoding.
        # NOTE: shipping 'enc' to worker processes assumes it can be transferred
        # (e.g. with a fork-based start method); if that fails, construct the
        # encoding inside the worker instead. Chunk boundaries can also alter
        # BPE merges locally, which is usually acceptable for corpus prep.
        if actual_proc > 1:
            chunks = get_chunks(data, actual_proc)
            with Pool(actual_proc) as pool:
                token_chunks = pool.starmap(encode_gpt2_chunk, [(chunk, enc) for chunk in chunks])
            tokens = [t for chunk in token_chunks for t in chunk]
        else:
            tokens = encode_gpt2_chunk(data, enc)

        # Append the end-of-text token if it's missing
        if tokens and tokens[-1] != enc.eot_token:
            tokens.append(enc.eot_token)

| 135 | + |
| 136 | + # Split train/val |
| 137 | + if not no_validation: |
| 138 | + split_idx = int(len(tokens) * train_split_ratio) |
| 139 | + splits = { |
| 140 | + "train": tokens[:split_idx], |
| 141 | + "val": tokens[split_idx:] |
| 142 | + } |
| 143 | + else: |
| 144 | + splits = {"train": tokens} |
| 145 | + |
| 146 | + # Save to .bin files |
| 147 | + for split, tokens_ in splits.items(): |
| 148 | + filename = os.path.join(processed_data_dir, f'{split}.bin') |
| 149 | + arr = np.array(tokens_, dtype=np.uint32) |
| 150 | + arr.tofile(filename) |
| 151 | + |
        # Save metadata (important for reconstructing the tokenizer state).
        # NOTE: building 'stoi' by decoding one token at a time is lossy:
        # several IDs can decode to the same string (invalid UTF-8 byte
        # sequences all become the replacement character), so colliding keys
        # overwrite one another.
        meta_path = os.path.join(processed_data_dir, 'meta.pkl')
        meta = {
            'vocab_size': vocab_size,
            'itos': {i: enc.decode([i]) for i in range(vocab_size)},
            'stoi': {enc.decode([i]): i for i in range(vocab_size)},
            'tokenizer': 'gpt2'
        }
        with open(meta_path, 'wb') as f:
            pickle.dump(meta, f)

        return {
            "processed_data_dir": processed_data_dir,
            "vocab_size": vocab_size,
            "train_size": len(splits["train"]),
            "val_size": len(splits["val"]) if not no_validation else None
        }

    # ------------------------------
    # Char-level Tokenization
    # ------------------------------
    else:
        # Collect all unique characters
        if actual_proc > 1:
            chunks = get_chunks(data, actual_proc)
            with Pool(actual_proc) as pool:
                char_sets = pool.map(get_unique_chars, chunks)
            chars = sorted(set().union(*char_sets))
        else:
            chars = sorted(set(data))

        vocab_size = len(chars)
        stoi = {ch: i for i, ch in enumerate(chars)}
        itos = {i: ch for i, ch in enumerate(chars)}

        # Encode the data
        if actual_proc > 1:
            chunks = get_chunks(data, actual_proc)
            with Pool(actual_proc) as pool:
                encoded_chunks = pool.starmap(encode_text_chunk, [(chunk, stoi) for chunk in chunks])
            encoded_data = [i for chunk in encoded_chunks for i in chunk]
        else:
            encoded_data = encode_text_chunk(data, stoi)

        # Split train/val
        if not no_validation:
            split_idx = int(len(encoded_data) * train_split_ratio)
            train_ids = np.array(encoded_data[:split_idx], dtype=IntegerTypes)
            val_ids = np.array(encoded_data[split_idx:], dtype=IntegerTypes)
        else:
            train_ids = np.array(encoded_data, dtype=IntegerTypes)
            val_ids = None

| 206 | + |
| 207 | + train_bin_path = os.path.join(processed_data_dir, 'train.bin') |
| 208 | + val_bin_path = os.path.join(processed_data_dir, 'val.bin') |
| 209 | + meta_path = os.path.join(processed_data_dir, 'meta.pkl') |
| 210 | + |
| 211 | + # Write binary files |
| 212 | + train_ids.tofile(train_bin_path) |
| 213 | + if not no_validation and val_ids is not None: |
| 214 | + val_ids.tofile(val_bin_path) |
| 215 | + |
| 216 | + # Save meta info |
| 217 | + meta = { |
| 218 | + 'vocab_size': vocab_size, |
| 219 | + 'itos': itos, |
| 220 | + 'stoi': stoi, |
| 221 | + } |
| 222 | + with open(meta_path, 'wb') as f: |
| 223 | + pickle.dump(meta, f) |
| 224 | + |
        result = {
            "processed_data_dir": processed_data_dir,
            "vocab_size": vocab_size,
            "train_size": len(train_ids),
        }
        if not no_validation:
            result["val_size"] = len(val_ids)
        return result
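

# A minimal usage sketch (hypothetical input; assumes the default config's
# directories are writable): process a tiny corpus at the char level, then
# memory-map the resulting train.bin and reload meta.pkl as a consumer would.
if __name__ == "__main__":
    info = process_data(input_text="hello world, hello data pipeline",
                        use_gpt2_tokenizer=False)
    print(info)

    train_path = os.path.join(info["processed_data_dir"], "train.bin")
    ids = np.memmap(train_path, dtype=IntegerTypes, mode="r")

    with open(os.path.join(info["processed_data_dir"], "meta.pkl"), "rb") as f:
        meta = pickle.load(f)
    # Round-trip check: decode the first few IDs back into characters
    print("".join(meta["itos"][i] for i in ids[:10].tolist()))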