
bf16 not supported error #4036

Closed
@jfy1016

Description


The script I ran is the official example script:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

from swift.llm import (
    get_model_tokenizer, load_dataset, get_template, EncodePreprocessor, get_model_arch,
    get_multimodal_target_regex, LazyLLMDataset
)
from swift.utils import get_logger, get_model_parameter_info, plot_images, seed_everything
from swift.tuners import Swift, LoraConfig
from swift.trainers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from functools import partial

logger = get_logger()
seed_everything(42)

# Hyperparameters for training

# model

model_id_or_path = '/home/jdn/.cache/modelscope/hub/models/deepseek-ai/deepseek-vl2-tiny'
system = None # Using the default system defined in the template.
output_dir = '/home/jdn/deepseek/output'

# dataset

dataset = '/home/jdn/train_CT_and_Xray_last_500.json'  # dataset_id or dataset_path
data_seed = 42
max_length = 2048
split_dataset_ratio = 0.01 # Split validation set
num_proc = 4 # The number of processes for data loading.

# lora

lora_rank = 8
lora_alpha = 32
freeze_llm = False
freeze_vit = True
freeze_aligner = True

# training_args

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-4,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_checkpointing=True,
    weight_decay=0.1,
    lr_scheduler_type='cosine',
    warmup_ratio=0.05,
    report_to=['tensorboard'],
    logging_first_step=True,
    save_strategy='steps',
    save_steps=50,
    eval_strategy='steps',
    eval_steps=50,
    gradient_accumulation_steps=16,
    # To observe the training results more quickly, this is set to 1 here.
    # Under normal circumstances, a larger number should be used.
    num_train_epochs=1,
    metric_for_best_model='loss',
    save_total_limit=5,
    logging_steps=5,
    dataloader_num_workers=4,
    data_seed=data_seed,
    remove_unused_columns=False,
)

output_dir = os.path.abspath(os.path.expanduser(output_dir))
logger.info(f'output_dir: {output_dir}')

# Obtain the model and template

model, processor = get_model_tokenizer(model_id_or_path)
# model.half()  # modification by jdn
logger.info(f'model_info: {model.model_info}')
template = get_template(model.model_meta.template, processor, default_system=system, max_length=max_length)
template.set_mode('train')
if template.use_model:
    template.model = model

# Get target_modules and add trainable LoRA modules to the model.

target_modules = get_multimodal_target_regex(model, freeze_llm=freeze_llm, freeze_vit=freeze_vit,
                                             freeze_aligner=freeze_aligner)
lora_config = LoraConfig(task_type='CAUSAL_LM', r=lora_rank, lora_alpha=lora_alpha,
                         target_modules=target_modules)
model = Swift.prepare_model(model, lora_config)
logger.info(f'lora_config: {lora_config}')

# Print model structure and trainable parameters.

logger.info(f'model: {model}')
model_parameter_info = get_model_parameter_info(model)
logger.info(f'model_parameter_info: {model_parameter_info}')

# Download and load the dataset, split it into a training set and a validation set,
# and encode the text data into tokens.

train_dataset, val_dataset = load_dataset(dataset, split_dataset_ratio=split_dataset_ratio, num_proc=num_proc,
                                          seed=data_seed)

logger.info(f'train_dataset: {train_dataset}')
logger.info(f'val_dataset: {val_dataset}')
logger.info(f'train_dataset[0]: {train_dataset[0]}')

train_dataset = LazyLLMDataset(train_dataset, template.encode, random_state=data_seed)
val_dataset = LazyLLMDataset(val_dataset, template.encode, random_state=data_seed)
data = train_dataset[0]
logger.info(f'encoded_train_dataset[0]: {data}')

template.print_inputs(data)

# Get the trainer and start the training.

model.enable_input_require_grads() # Compatible with gradient checkpointing
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=template.data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    template=template,
)
trainer.train()

last_model_checkpoint = trainer.state.last_model_checkpoint
logger.info(f'last_model_checkpoint: {last_model_checkpoint}')

# Visualize the training loss.
# You can also use the TensorBoard visualization interface during training by entering
# tensorboard --logdir '{output_dir}/runs' at the command line.

images_dir = os.path.join(output_dir, 'images')
logger.info(f'images_dir: {images_dir}')
plot_images(images_dir, training_args.logging_dir, ['train/loss'], 0.9) # save images

# Read and display the image.
# The light yellow line represents the actual loss value,
# while the yellow line represents the loss value smoothed with a smoothing factor of 0.9.

from IPython.display import display
from PIL import Image
image = Image.open(os.path.join(images_dir, 'train_loss.png'))
display(image)

The error is:

[error screenshot]

How can I modify the code to switch from bf16 to float16?
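For reference, a minimal sketch of one possible change, assuming the error comes from a GPU without bf16 support (typically pre-Ampere cards) and that this ms-swift version's get_model_tokenizer accepts a torch_dtype argument; the fp16/bf16 flags are standard transformers Seq2SeqTrainingArguments options:

import torch

# Check whether the current GPU supports bf16 at all (False on pre-Ampere cards).
print(torch.cuda.is_bf16_supported())

# Load the model weights in float16 instead of the auto-detected dtype.
model, processor = get_model_tokenizer(model_id_or_path, torch_dtype=torch.float16)

# Train with fp16 mixed precision and explicitly disable bf16.
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    fp16=True,
    bf16=False,
    # ... keep the remaining arguments from the script above ...
)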
