Commit 22a1112

Fix the issue of abnormal inference.
Make the structure more compact.
1 parent 2b43952 commit 22a1112

File tree

8 files changed: +1027 -0 lines changed


data/data.py

+233
@@ -0,0 +1,233 @@
"""
data/data.py

This file handles data processing, including reading raw text,
splitting into train/validation sets, tokenizing (char-level or GPT-2),
and saving to binary files.
"""

import os
import math
import pickle
import numpy as np
import tiktoken
from multiprocessing import Pool, cpu_count

# We import our global config and integer type definition
from config.default import DEFAULT_CONFIG, IntegerTypes


def get_chunks(text, n):
    """
    Splits the text into 'n' roughly equal chunks for parallel processing.
    :param text: The full text string to split.
    :param n: The number of chunks to split into.
    :return: A list of text chunks.
    """
    chunk_size = math.ceil(len(text) / n)
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]


def get_unique_chars(text):
    """
    Returns a set of unique characters found in the text.
    :param text: A string from which to collect unique characters.
    :return: A set of unique characters.
    """
    return set(text)


def encode_text_chunk(chunk, stoi):
    """
    Encodes a chunk of text at the character level using the 'stoi' dictionary.
    :param chunk: A substring of text.
    :param stoi: A dict mapping characters to their integer IDs.
    :return: A list of integer IDs representing the chunk.
    """
    return [stoi.get(ch, 0) for ch in chunk]


def encode_gpt2_chunk(chunk, tokenizer):
    """
    Encodes a chunk of text using GPT-2 tokenizer.
    :param chunk: A substring of text.
    :param tokenizer: A GPT-2 tokenizer from 'tiktoken'.
    :return: A list of token IDs.
    """
    return tokenizer.encode(chunk, allowed_special={"<|endoftext|>"})


def process_data(
    input_text="",
    input_dir="",
    raw_data_dir=DEFAULT_CONFIG["data_process"]["raw_data_dir"],
    processed_data_dir=DEFAULT_CONFIG["data_process"]["processed_data_dir"],
    train_split_ratio=DEFAULT_CONFIG["data_process"]["train_split_ratio"],
    no_validation=DEFAULT_CONFIG["data_process"]["no_validation"],
    use_gpt2_tokenizer=DEFAULT_CONFIG["data_process"]["use_gpt2_tokenizer"],
    num_proc=DEFAULT_CONFIG["data_process"]["num_proc"]
):
"""
71+
Splits data into train and val sets (unless 'no_validation' is True),
72+
supports both GPT-2 or char-level tokenization, and utilizes multiprocessing
73+
for encoding large datasets.
74+
75+
:param input_text: Directly provided text to process.
76+
:param input_dir: Directory containing .txt files if no direct text is given.
77+
:param raw_data_dir: Where to store the merged raw text.
78+
:param processed_data_dir: Where to store the processed binary data (train.bin, val.bin).
79+
:param train_split_ratio: The fraction of data to allocate for training.
80+
:param no_validation: If True, skip creating a validation set.
81+
:param use_gpt2_tokenizer: Whether to use GPT-2 tokenizer or char-level.
82+
:param num_proc: Number of processes for parallel encoding.
83+
:return: A dict containing information about the processed data,
84+
including processed_data_dir, vocab_size, train_size, and optionally val_size.
85+
"""
    os.makedirs(raw_data_dir, exist_ok=True)
    os.makedirs(processed_data_dir, exist_ok=True)

    data = ""
    # Priority 1: use 'input_text' if provided
    if input_text.strip():
        data = input_text
    # Priority 2: if 'input_dir' is specified, read .txt files from it
    elif input_dir.strip():
        txt_files = [f for f in os.listdir(input_dir) if f.endswith('.txt')]
        for file_name in txt_files:
            file_path = os.path.join(input_dir, file_name)
            with open(file_path, 'r', encoding='utf-8') as f:
                data += f.read()
    else:
        raise ValueError("No text input or directory provided.")

    # Save raw text for reference
    raw_text_file = os.path.join(raw_data_dir, 'merged_input.txt')
    with open(raw_text_file, 'w', encoding='utf-8') as f:
        f.write(data)

    # Estimate data size in MB to decide whether to use multiprocessing
    data_size = len(data.encode('utf-8')) / (1024 * 1024)
    suggested_proc = min(num_proc, cpu_count())
    # For smaller data, using multiple processes is often overkill
    actual_proc = suggested_proc if data_size > 100 else 1

    # ------------------------------
    # GPT-2 Tokenization Workflow
    # ------------------------------
    if use_gpt2_tokenizer:
        enc = tiktoken.get_encoding("gpt2")
        vocab_size = enc.n_vocab

        # Parallel or single-process encoding
        if actual_proc > 1:
            chunks = get_chunks(data, actual_proc)
            with Pool(actual_proc) as pool:
                token_chunks = pool.starmap(encode_gpt2_chunk, [(chunk, enc) for chunk in chunks])
            tokens = []
            for chunk in token_chunks:
                tokens.extend(chunk)
        else:
            tokens = encode_gpt2_chunk(data, enc)

        # Append end-of-text token if missing
        if tokens and tokens[-1] != enc.eot_token:
            tokens.append(enc.eot_token)

        # Split train/val
        if not no_validation:
            split_idx = int(len(tokens) * train_split_ratio)
            splits = {
                "train": tokens[:split_idx],
                "val": tokens[split_idx:]
            }
        else:
            splits = {"train": tokens}

        # Save to .bin files
        for split, tokens_ in splits.items():
            filename = os.path.join(processed_data_dir, f'{split}.bin')
            arr = np.array(tokens_, dtype=np.uint32)
            arr.tofile(filename)

        # Save metadata (important for reconstructing the tokenizer state)
        meta_path = os.path.join(processed_data_dir, 'meta.pkl')
        meta = {
            'vocab_size': vocab_size,
            'itos': {i: enc.decode([i]) for i in range(vocab_size)},
            'stoi': {enc.decode([i]): i for i in range(vocab_size)},
            'tokenizer': 'gpt2'
        }
        with open(meta_path, 'wb') as f:
            pickle.dump(meta, f)

        return {
            "processed_data_dir": processed_data_dir,
            "vocab_size": vocab_size,
            "train_size": len(splits["train"]),
            "val_size": len(splits.get("val", "")) if not no_validation else None
        }

    # ------------------------------
    # Char-level Tokenization
    # ------------------------------
    else:
        # Collect all unique characters
        if actual_proc > 1:
            chunks = get_chunks(data, actual_proc)
            with Pool(actual_proc) as pool:
                char_sets = pool.map(get_unique_chars, chunks)
            chars = sorted(list(set().union(*char_sets)))
        else:
            chars = sorted(list(set(data)))

        vocab_size = len(chars)
        stoi = {ch: i for i, ch in enumerate(chars)}
        itos = {i: ch for i, ch in enumerate(chars)}

        # Encode data
        if actual_proc > 1:
            chunks = get_chunks(data, actual_proc)
            with Pool(actual_proc) as pool:
                encoded_chunks = pool.starmap(encode_text_chunk, [(chunk, stoi) for chunk in chunks])
            encoded_data = []
            for chunk in encoded_chunks:
                encoded_data.extend(chunk)
        else:
            encoded_data = encode_text_chunk(data, stoi)

        # Split train/val
        if not no_validation:
            split_idx = int(len(encoded_data) * train_split_ratio)
            train_ids = np.array(encoded_data[:split_idx], dtype=IntegerTypes)
            val_ids = np.array(encoded_data[split_idx:], dtype=IntegerTypes)
        else:
            train_ids = np.array(encoded_data, dtype=IntegerTypes)
            val_ids = None

        train_bin_path = os.path.join(processed_data_dir, 'train.bin')
        val_bin_path = os.path.join(processed_data_dir, 'val.bin')
        meta_path = os.path.join(processed_data_dir, 'meta.pkl')

        # Write binary files
        train_ids.tofile(train_bin_path)
        if not no_validation and val_ids is not None:
            val_ids.tofile(val_bin_path)

        # Save meta info
        meta = {
            'vocab_size': vocab_size,
            'itos': itos,
            'stoi': stoi,
        }
        with open(meta_path, 'wb') as f:
            pickle.dump(meta, f)

        print(f"Used {actual_proc} process(es) for data processing.")
        result = {
            "processed_data_dir": processed_data_dir,
            "vocab_size": vocab_size,
            "train_size": len(train_ids),
        }
        if not no_validation:
            result["val_size"] = len(val_ids)
        return result
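
For orientation, here is a minimal usage sketch of process_data (not part of this commit). It assumes the package root is on the Python path, that config/default.py supplies the DEFAULT_CONFIG defaults used in the signature, and that IntegerTypes is an unsigned NumPy dtype such as np.uint16; adjust the read-back dtype accordingly.

# Hypothetical usage sketch for data.process_data (illustrative, not part of the commit).
import numpy as np
from data.data import process_data

info = process_data(
    input_text="hello world, hello GPT",   # tiny inline sample; small data keeps actual_proc at 1
    use_gpt2_tokenizer=False,              # char-level path writes dtype=IntegerTypes
    train_split_ratio=0.9,
)
print(info)  # {'processed_data_dir': ..., 'vocab_size': ..., 'train_size': ..., 'val_size': ...}

# Reading train.bin back; the dtype must match what was written
# (IntegerTypes for char-level, np.uint32 for the GPT-2 path).
train_ids = np.fromfile(f"{info['processed_data_dir']}/train.bin", dtype=np.uint16)  # np.uint16 is an assumption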

data/processed/meta.pkl

628 Bytes
Binary file not shown.

data/processed/train.bin

15.9 KB
Binary file not shown.

data/processed/val.bin

1.77 KB
Binary file not shown.

data/raw/merged_input.txt

+11
@@ -0,0 +1,11 @@
OpenAI is a pioneering research organization and technology company that has redefined the landscape of artificial intelligence through its innovative suite of products and groundbreaking research. Founded in December 2015, the company has evolved from a non-profit initiative into a capped-profit enterprise, ensuring that its expansive projects receive the investment needed to explore and push the boundaries of AI while keeping ethical considerations at the forefront. OpenAI’s portfolio includes a range of advanced models and tools designed to cater to diverse applications—from natural language processing to computer vision and robotics—each serving a unique function and contributing to the overarching mission of benefiting all of humanity.

At the core of OpenAI’s contributions are the Generative Pre-trained Transformer models. The evolution of these models, starting with GPT-2 and progressing through GPT-3, GPT-3.5, and the latest GPT-4, has set new standards in natural language understanding and generation. These models are capable of producing highly coherent and contextually nuanced text, making them invaluable for tasks such as content creation, summarization, translation, and interactive dialogue. ChatGPT, which leverages these advanced language models, offers users an engaging conversational experience that can simulate human-like interactions, assist with customer support, provide educational tutoring, and even aid in creative writing endeavors.

In addition to its language models, OpenAI has made significant strides in the field of multimodal AI with products like DALL·E. This innovative model translates textual descriptions into detailed, creative images, opening up possibilities for artists, designers, and content creators by providing a new medium for visual expression. DALL·E exemplifies OpenAI’s commitment to merging different forms of data—text and imagery—to produce outputs that were previously unimaginable. Complementing this visual capability is Codex, a model specifically designed to assist programmers by generating and completing code across a variety of programming languages. Codex powers tools like GitHub Copilot, which not only boosts developer productivity but also helps democratize access to coding by lowering the barrier to entry for new programmers.

OpenAI’s work is not limited to static models; it extends into dynamic environments through its investments in reinforcement learning and robotics. The OpenAI Gym provides a comprehensive toolkit for developing and testing reinforcement learning algorithms, allowing researchers to simulate complex scenarios and train agents in controlled yet challenging environments. OpenAI Five, another hallmark project, demonstrated the capacity of AI to learn and strategize in real time by competing at a professional level in the video game Dota 2. These endeavors highlight OpenAI’s commitment to understanding and mastering the intricacies of decision-making processes and dynamic problem-solving in environments that closely mimic real-world challenges.

Underlying all these products is a robust framework dedicated to ensuring the safety, alignment, and ethical deployment of AI technologies. OpenAI conducts rigorous research into AI alignment to ensure that the behavior of its models remains consistent with human values, mitigating potential risks while maximizing societal benefit. The organization collaborates extensively with academic institutions, industry leaders, and policymakers to foster an environment where responsible innovation is prioritized alongside technical advancement. Strategic partnerships, most notably with Microsoft, have not only provided substantial funding but also facilitated the integration of OpenAI’s models into widely used platforms like Azure, thereby extending the reach and applicability of its technologies across various industries.

By continuously refining its models and exploring new applications, OpenAI remains at the cutting edge of artificial intelligence research. Its comprehensive suite of products—spanning advanced language models, image generation systems, code-assistance tools, and reinforcement learning environments—reflects a deep commitment to creating AI systems that are both powerful and safe. As the company pushes forward into the future, it maintains a steadfast focus on ensuring that the transformative potential of AI is harnessed in a manner that is ethical, equitable, and aligned with the broader interests of society.

modules/gpt.py

+125
@@ -0,0 +1,125 @@
"""
modules/gpt.py

This file provides the GPTConfig class for storing model hyperparameters,
and the GPT model class itself. Currently a minimal GPT-like model is implemented.
"""

import torch
import torch.nn as nn
import torch.nn.functional as F


class GPTConfig:
    """
    A configuration class for the GPT model.

    :param vocab_size: The size of the vocabulary.
    :param block_size: The maximum sequence length (context length).
    :param n_layer: Number of transformer layers.
    :param n_head: Number of attention heads.
    :param n_embd: Embedding dimension.
    :param dropout: Dropout rate.
    :param bias: Whether to include bias in linear layers.
    """
    def __init__(
        self,
        vocab_size,
        block_size,
        n_layer,
        n_head,
        n_embd,
        dropout=0.1,
        bias=False
    ):
        self.vocab_size = vocab_size
        self.block_size = block_size
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_embd = n_embd
        self.dropout = dropout
        self.bias = bias


class GPT(nn.Module):
    """
    A minimal GPT-like model consisting of an embedding layer and a linear layer.
    This is primarily for demonstration purposes.
    """
    def __init__(self, config: GPTConfig):
        super().__init__()
        self.config = config
        self.block_size = config.block_size

        # Token embedding layer
        self.token_embedding_table = nn.Embedding(config.vocab_size, config.n_embd)

        # Language modeling head (linear projection to vocab)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=config.bias)

    def forward(self, idx, targets=None):
        """
        Forward pass of the model.

        :param idx: Tensor of shape (batch_size, sequence_length) with token indices.
        :param targets: Optional tensor of the same shape for computing cross-entropy loss.
        :return: (logits, loss) tuple. 'logits' is (batch_size, sequence_length, vocab_size).
                 'loss' is a scalar if 'targets' is provided, otherwise None.
        """
        b, t = idx.size()
        # Convert token indices to embeddings
        token_emb = self.token_embedding_table(idx)
        # Project embeddings onto vocab space
        logits = self.lm_head(token_emb)

        loss = None
        if targets is not None:
            # Flatten logits and targets for cross entropy
            logits_view = logits.view(b * t, -1)
            targets_view = targets.view(b * t)
            loss = F.cross_entropy(logits_view, targets_view)

        return logits, loss

    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """
        Generates tokens one by one, appending each new token to 'idx'.
        Continues until 'max_new_tokens' have been generated.

        :param idx: Initial token indices of shape (batch_size, sequence_length).
        :param max_new_tokens: Number of new tokens to generate.
        :param temperature: Softmax temperature for sampling.
        :param top_k: Top-k cutoff for sampling. If None, no cutoff is applied.
        :return: A tensor of shape (batch_size, sequence_length + max_new_tokens).
        """
        for _ in range(max_new_tokens):
            if idx.size(1) == 0:
                raise ValueError("Input sequence is empty. Provide at least one token.")
            idx_cond = idx[:, -self.block_size:]
            logits, _ = self(idx_cond)

            # Scale logits by temperature
            logits = logits[:, -1, :] / temperature

            # If top_k is set, zero out everything except the top_k
            if top_k is not None:
                v, _ = torch.topk(logits, top_k)
                top_value = v[:, -1].unsqueeze(-1)
                logits[logits < top_value] = -float('Inf')

            # Convert logits to probabilities
            probs = F.softmax(logits, dim=-1)
            # Sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            # Append the new token
            idx = torch.cat((idx, idx_next), dim=1)

        return idx

    def crop_block_size(self, block_size):
        """
        Adjust the model's internal block size (for context length).
        This is useful if you resume training with a smaller block_size than before.
        """
        self.config.block_size = block_size
        self.block_size = block_size
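
Similarly, a minimal sketch of how the model above can be exercised (not part of this commit); the import path modules.gpt matches the file location, while the hyperparameter values are assumptions chosen only for illustration.

# Hypothetical usage sketch for modules.gpt (illustrative, not part of the commit).
import torch
from modules.gpt import GPT, GPTConfig

config = GPTConfig(vocab_size=65, block_size=64, n_layer=1, n_head=1, n_embd=32)
model = GPT(config)

# Training-style forward pass: logits are (batch, seq, vocab), loss is a scalar.
x = torch.randint(0, config.vocab_size, (2, 8))
y = torch.randint(0, config.vocab_size, (2, 8))
logits, loss = model(x, y)

# Autoregressive sampling from a one-token prompt.
prompt = torch.zeros((1, 1), dtype=torch.long)
out = model.generate(prompt, max_new_tokens=16, temperature=0.8, top_k=10)
print(out.shape)  # torch.Size([1, 17])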
