Dtr exp selective checkpointing #506

Draft · wants to merge 6 commits into base: main
4 changes: 3 additions & 1 deletion .gitignore
@@ -130,4 +130,6 @@ venv.bak/
dmypy.json

# Pyre type checker
.pyre/
.pyre/
output
exp_data*
49 changes: 49 additions & 0 deletions configs/dtr_gpt2_pretrain.py
@@ -0,0 +1,49 @@
from libai.config import LazyCall
from libai.evaluation import PPLEvaluator
from .common.models.gpt import pretrain_model as model
from .common.train import train
from .common.optim import optim
from .common.data.gpt_dataset import dataloader, tokenization

from .common.models.graph import graph

__dataset_root = "/share_nfs/sd_dataset/lph/datasets/libai_dataset"
vocab_file = f"{__dataset_root}/gpt2-vocab.json"
merge_files = f"{__dataset_root}/gpt2-merges.txt"
data_prefix = (
    f"{__dataset_root}/loss_compara_content_sentence"
)

tokenization.tokenizer.vocab_file = vocab_file
tokenization.tokenizer.merges_file = merge_files
dataloader.train.dataset[0].data_prefix = data_prefix
dataloader.train.dataset[0].indexed_dataset.data_prefix = data_prefix
dataloader.test[0].dataset.data_prefix = data_prefix
dataloader.test[0].dataset.indexed_dataset.data_prefix = data_prefix

# GPT-2 model config
model.cfg.embedding_dropout_prob = 0.1
model.cfg.attention_dropout_prob = 0.1
model.cfg.num_attention_heads = 16
model.cfg.hidden_size = 384
model.cfg.ffn_hidden_size = 1536
model.cfg.hidden_layers = 6
model.cfg.max_seq_length = 1024

train.input_placement_device = "cpu"

train.dist.pipeline_num_layers = model.cfg.hidden_layers

for ds in dataloader.train.dataset:
    ds.max_seq_length = model.cfg.max_seq_length

optim.lr = 1.5e-4

train.train_micro_batch_size = 4
# train.train_micro_batch_size = 42  # for an A800
train.amp.enabled = False

train.evaluation.evaluator = LazyCall(PPLEvaluator)()

train.output_dir = "./output/gpt2_output"
graph.enabled = False
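
For context, this file follows LiBai's lazy-config pattern: the module-level objects (`model`, `train`, `optim`, `dataloader`) are plain config nodes that the training entry point loads and instantiates. A minimal sketch of how such a file is typically consumed, assuming LiBai keeps the detectron2-style `LazyConfig`/`instantiate` interface (the names below are illustrative and not part of this PR):

```python
# Sketch only: assumes libai.config exposes LazyConfig and instantiate,
# mirroring detectron2's lazy-config API.
from libai.config import LazyConfig, instantiate

cfg = LazyConfig.load("configs/dtr_gpt2_pretrain.py")  # executes the config module above
cfg.train.train_micro_batch_size = 8                   # fields can be overridden before training
model = instantiate(cfg.model)                         # builds the GPT-2 pretrain model from its LazyCall spec
```

In practice the config is usually passed straight to LiBai's launcher (e.g. `tools/train.sh` with `tools/train_net.py`), which performs the same load-and-instantiate steps.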
48 changes: 48 additions & 0 deletions configs/dtr_gpt3_pretrain.py
@@ -0,0 +1,48 @@
from libai.config import LazyCall
from libai.evaluation import PPLEvaluator
from .common.models.gpt import pretrain_model as model
from .common.train import train
from .common.optim import optim
from .common.data.gpt_dataset import dataloader, tokenization

from .common.models.graph import graph

__dataset_root = "/share_nfs/sd_dataset/lph/datasets/libai_dataset"
vocab_file = f"{__dataset_root}/gpt2-vocab.json"
merge_files = f"{__dataset_root}/gpt2-merges.txt"
data_prefix = (
    f"{__dataset_root}/loss_compara_content_sentence"
)

tokenization.tokenizer.vocab_file = vocab_file
tokenization.tokenizer.merges_file = merge_files
dataloader.train.dataset[0].data_prefix = data_prefix
dataloader.train.dataset[0].indexed_dataset.data_prefix = data_prefix
dataloader.test[0].dataset.data_prefix = data_prefix
dataloader.test[0].dataset.indexed_dataset.data_prefix = data_prefix

# GPT-3-scale model config (GPT-2 architecture scaled up)
model.cfg.embedding_dropout_prob = 0.1
model.cfg.attention_dropout_prob = 0.1
model.cfg.num_attention_heads = 32
model.cfg.hidden_size = 4096
model.cfg.ffn_hidden_size = 4096*4
model.cfg.hidden_layers = 32
model.cfg.max_seq_length = 1024

train.input_placement_device = "cpu"

train.dist.pipeline_num_layers = model.cfg.hidden_layers

for ds in dataloader.train.dataset:
    ds.max_seq_length = model.cfg.max_seq_length

optim.lr = 1.5e-4

train.train_micro_batch_size = 2
train.amp.enabled = False

train.evaluation.evaluator = LazyCall(PPLEvaluator)()

train.output_dir = "./output/gpt3_output"
graph.enabled = False
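
Compared with the GPT-2 config, this one scales the same architecture up (hidden size 4096, 32 layers, 32 heads), which is presumably why the micro batch size drops to 2 and why activation recomputation (DTR / selective checkpointing) becomes relevant. A rough back-of-envelope estimate of the resulting model size, assuming the GPT-2 BPE vocabulary of 50257 tokens and ignoring biases, LayerNorms, and position embeddings:

```python
# Back-of-envelope parameter estimate for the config above (illustrative only).
hidden, ffn, layers, vocab = 4096, 4096 * 4, 32, 50257  # vocab size assumed from the GPT-2 BPE files

per_layer = 4 * hidden * hidden + 2 * hidden * ffn       # QKV + output projection, plus the two MLP matrices
total = layers * per_layer + vocab * hidden              # ~6.44B transformer + ~0.21B embedding params
print(f"~{total / 1e9:.2f}B parameters")                 # -> ~6.65B
```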
90 changes: 75 additions & 15 deletions libai/engine/default.py
@@ -44,6 +44,51 @@
# https://github.com/facebookresearch/detectron2/blob/main/detectron2/engine/defaults.py
# --------------------------------------------------------

def count_all_parameters(model, verbose=False):
    """
    Count total, trainable, and non-trainable parameters of a model.

    Args:
        model (nn.Module): The model whose parameters are counted.
        verbose (bool, optional): Log detailed information if True.

    Returns:
        Tuple of (total, trainable, non-trainable) parameter counts and the
        percentage of parameters that are trainable.
    """
    train_params, all_params = 0, 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        all_params += num_params
        if param.requires_grad:
            train_params += num_params
    nontrain_params = all_params - train_params
    pct_train_params = train_params / all_params * 100
    if verbose:
        logger = logging.getLogger(__name__)
        logger.info(f"Total params: {format_size(all_params)}")
        logger.info(f"Trainable params: {format_size(train_params)}")
        logger.info(f"Non-trainable params: {format_size(nontrain_params)}")
        logger.info(f"Trainable params %: {pct_train_params:.4f}")
    return all_params, train_params, nontrain_params, pct_train_params


def format_size(size):
    """
    Convert a raw count (here, a parameter count) to a human-readable string.

    Note: K and M are binary (1024-based) while B and T are decimal
    (10**9 and 10**12).

    Args:
        size (int): The count to format.

    Returns:
        String representing the count with an appropriate unit suffix.
    """
    k, m, b, t = 1024, 1024**2, 10**9, 10**12
    if size > t:
        return f"{round(size / t, 4)}T"
    elif size > b:
        return f"{round(size / b, 4)}B"
    elif size > m:
        return f"{round(size / m, 4)}M"
    elif size > k:
        return f"{round(size / k, 4)}K"
    else:
        return f"{size}"
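
A quick illustration of the two helpers above. The example assumes `count_all_parameters` and `format_size` are importable from `libai.engine.default` once this change lands, and uses OneFlow's torch-like `nn.Linear`:

```python
import oneflow.nn as nn
from libai.engine.default import count_all_parameters, format_size  # assumed import path after this diff

tiny = nn.Linear(4096, 4096)                 # 4096 * 4096 weights + 4096 biases = 16,781,312 params
all_p, train_p, nontrain_p, pct = count_all_parameters(tiny, verbose=False)

print(format_size(all_p))                    # "16.0039M"  (K and M are 1024-based)
print(format_size(2_000_000_000))            # "2.0B"      (B and T are decimal, 10**9 / 10**12)
print(pct)                                   # 100.0       (every parameter requires grad)
```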

def _highlight(code, filename):
    try:
@@ -200,6 +245,18 @@ def default_setup(cfg, args):

    _compile_dependencies()

class EmtypOptimizer:
    """No-op stand-in for the optimizer: every method intentionally does
    nothing, so the existing training loop runs without parameter updates."""

    def __init__(self, cfg):
        pass

    def clip_grad(self):
        pass

    def step(self):
        pass

    def zero_grad(self):
        pass
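
This stub replaces the real optimizer further down in the diff (`self.optimizer = EmtypOptimizer(cfg)`). Because it exposes the same method names the trainer calls (`clip_grad`, `step`, `zero_grad`), the training step runs unchanged while parameter updates are skipped, presumably so that only memory and recomputation behaviour is measured. A tiny sketch of that duck-typing, assuming the class is importable from `libai.engine.default` after this change:

```python
import oneflow as flow
from libai.engine.default import EmtypOptimizer  # assumed import path after this diff

x = flow.randn(2, 2, requires_grad=True)
loss = (x * x).sum()
loss.backward()                  # the backward pass (and its activation memory) still happens

opt = EmtypOptimizer(cfg=None)   # the stub ignores its cfg argument
opt.clip_grad()
opt.step()                       # no-op: x is never updated
opt.zero_grad()                  # no-op: x.grad is left untouched
```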

class DefaultTrainer(TrainerBase):
"""
@@ -314,8 +371,9 @@ def __init__(self, cfg):
"Building time: {:.3f} seconds".format(time.time() - start_time)
)

self.optimizer = self.build_optimizer(cfg, self.model)
self.lr_scheduler = self.build_lr_scheduler(cfg, self.optimizer)
# self.optimizer = self.build_optimizer(cfg, self.model)
self.optimizer = EmtypOptimizer(cfg)
# self.lr_scheduler = self.build_lr_scheduler(cfg, self.optimizer)

        if cfg.graph.enabled:
            self.graph_train = self.build_graph(
@@ -345,13 +403,14 @@ def __init__(self, cfg):
                lr_scheduler=self.lr_scheduler,
            )
        else:
            self.checkpointer = Checkpointer(
                # Assume you want to save checkpoints together with logs/statistics
                self.model,
                cfg.train.output_dir,
                optimizer=self.optimizer,
                lr_scheduler=self.lr_scheduler,
            )
            pass
            # self.checkpointer = Checkpointer(
            #     # Assume you want to save checkpoints together with logs/statistics
            #     self.model,
            #     cfg.train.output_dir,
            #     optimizer=self.optimizer,
            #     lr_scheduler=self.lr_scheduler,
            # )

        # Loading checkpoint before dataloader construction, because
        # dataloader needs to know the consumed iterations from
@@ -415,12 +474,12 @@ def build_hooks(self):

        ret = [
            hooks.IterationTimer(),
            hooks.LRScheduler(),  # for beauty lr scheduler printer in `nn.Graph` mode
            hooks.PeriodicCheckpointer(
                self.checkpointer,
                self.cfg.train.checkpointer.period,
                max_to_keep=self.cfg.train.checkpointer.max_to_keep,
            ),
            # hooks.LRScheduler(), # for beauty lr scheduler printer in `nn.Graph` mode
            # hooks.PeriodicCheckpointer(
            #     self.checkpointer,
            #     self.cfg.train.checkpointer.period,
            #     max_to_keep=self.cfg.train.checkpointer.max_to_keep,
            # ),
        ]

if self.cfg.train.evaluation.enabled:
@@ -563,6 +622,7 @@ def build_model(cls, cfg):
        model = build_model(cfg.model)
        logger = logging.getLogger(__name__)
        logger.info("Model:\n{}".format(model))
        count_all_parameters(model, verbose=True)
        model._apply(dist.convert_to_distributed_default_setting)
        return model
