
bf16 not supported error #4036


Open
jfy1016 opened this issue Apr 29, 2025 · 1 comment

Comments


jfy1016 commented Apr 29, 2025

The script I'm running is the official example script:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

from swift.llm import (
get_model_tokenizer, load_dataset, get_template, EncodePreprocessor, get_model_arch,
get_multimodal_target_regex, LazyLLMDataset
)
from swift.utils import get_logger, get_model_parameter_info, plot_images, seed_everything
from swift.tuners import Swift, LoraConfig
from swift.trainers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from functools import partial

logger = get_logger()
seed_everything(42)

# Hyperparameters for training

# model

model_id_or_path = '/home/jdn/.cache/modelscope/hub/models/deepseek-ai/deepseek-vl2-tiny'
system = None # Using the default system defined in the template.
output_dir = '/home/jdn/deepseek/output'

# dataset

dataset = '/home/jdn/train_CT_and_Xray_last_500.json' # dataset_id or dataset_path. Sampling 20000 data points
data_seed = 42
max_length = 2048
split_dataset_ratio = 0.01 # Split validation set
num_proc = 4 # The number of processes for data loading.

# lora

lora_rank = 8
lora_alpha = 32
freeze_llm = False
freeze_vit = True
freeze_aligner = True

# training_args

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-4,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_checkpointing=True,
    weight_decay=0.1,
    lr_scheduler_type='cosine',
    warmup_ratio=0.05,
    report_to=['tensorboard'],
    logging_first_step=True,
    save_strategy='steps',
    save_steps=50,
    eval_strategy='steps',
    eval_steps=50,
    gradient_accumulation_steps=16,
    # To observe the training results more quickly, this is set to 1 here.
    # Under normal circumstances, a larger number should be used.
    num_train_epochs=1,
    metric_for_best_model='loss',
    save_total_limit=5,
    logging_steps=5,
    dataloader_num_workers=4,
    data_seed=data_seed,
    remove_unused_columns=False,
)

output_dir = os.path.abspath(os.path.expanduser(output_dir))
logger.info(f'output_dir: {output_dir}')

# Obtain the model and template

model, processor = get_model_tokenizer(model_id_or_path)
# model.half()  # modified by jdn
logger.info(f'model_info: {model.model_info}')
template = get_template(model.model_meta.template, processor, default_system=system, max_length=max_length)
template.set_mode('train')
if template.use_model:
    template.model = model

# Get target_modules and add trainable LoRA modules to the model.

target_modules = get_multimodal_target_regex(model, freeze_llm=freeze_llm, freeze_vit=freeze_vit,
                                             freeze_aligner=freeze_aligner)
lora_config = LoraConfig(task_type='CAUSAL_LM', r=lora_rank, lora_alpha=lora_alpha,
                         target_modules=target_modules)
model = Swift.prepare_model(model, lora_config)
logger.info(f'lora_config: {lora_config}')

# Print model structure and trainable parameters.

logger.info(f'model: {model}')
model_parameter_info = get_model_parameter_info(model)
logger.info(f'model_parameter_info: {model_parameter_info}')

# Download and load the dataset, split it into a training set and a validation set,
# and encode the text data into tokens.
train_dataset, val_dataset = load_dataset(dataset, split_dataset_ratio=split_dataset_ratio, num_proc=num_proc,
                                          seed=data_seed)

logger.info(f'train_dataset: {train_dataset}')
logger.info(f'val_dataset: {val_dataset}')
logger.info(f'train_dataset[0]: {train_dataset[0]}')

train_dataset = LazyLLMDataset(train_dataset, template.encode, random_state=data_seed)
val_dataset = LazyLLMDataset(val_dataset, template.encode, random_state=data_seed)
data = train_dataset[0]
logger.info(f'encoded_train_dataset[0]: {data}')

template.print_inputs(data)

# Get the trainer and start the training.

model.enable_input_require_grads() # Compatible with gradient checkpointing
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=template.data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    template=template,
)
trainer.train()

last_model_checkpoint = trainer.state.last_model_checkpoint
logger.info(f'last_model_checkpoint: {last_model_checkpoint}')

# Visualize the training loss.
# You can also use the TensorBoard visualization interface during training by entering
# `tensorboard --logdir '{output_dir}/runs'` at the command line.

images_dir = os.path.join(output_dir, 'images')
logger.info(f'images_dir: {images_dir}')
plot_images(images_dir, training_args.logging_dir, ['train/loss'], 0.9) # save images

# Read and display the image.
# The light yellow line represents the actual loss value,
# while the yellow line represents the loss value smoothed with a smoothing factor of 0.9.

from IPython.display import display
from PIL import Image
image = Image.open(os.path.join(images_dir, 'train_loss.png'))
display(image)

The error I get is:

[screenshot of the error showing that bf16 is not supported]

How can I modify the code to switch from bf16 to float16?
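For reference, a minimal sketch of one way to try forcing float16 instead of bf16 in the script above. This assumes that `get_model_tokenizer` forwards a `torch_dtype` argument to model loading and that `Seq2SeqTrainingArguments` accepts the standard transformers `fp16`/`bf16` flags; neither is verified against this exact ms-swift version.

import torch

# Assumption: get_model_tokenizer passes torch_dtype through to the underlying
# from_pretrained call, so the weights are loaded directly in float16 rather than bf16.
model, processor = get_model_tokenizer(model_id_or_path, torch_dtype=torch.float16)

# Assumption: Seq2SeqTrainingArguments inherits the transformers mixed-precision
# flags, so fp16 can be selected explicitly and bf16 disabled.
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    fp16=True,    # use float16 mixed precision
    bf16=False,   # do not let bf16 be picked automatically
    # ... keep the remaining arguments from the script above ...
)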

Jintao-Huang (Collaborator)

This is an issue of the device not supporting bf16; we suggest buying a newer device.
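As a quick diagnostic, the following generic PyTorch snippet (not part of the ms-swift example) reports whether the currently visible GPU supports bf16; bf16 generally requires an Ampere-or-newer GPU, i.e. compute capability 8.0 or higher.

import torch

# bf16 is only available on GPUs with compute capability >= 8.0 (Ampere or newer).
major, minor = torch.cuda.get_device_capability()
print(f'compute capability: {major}.{minor}')
print(f'bf16 supported: {torch.cuda.is_bf16_supported()}')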
