Flux: Finetuning a Finetune with vs without the Original Text Encoders that were Trained? #1765
kmacmcfarlane started this conversation in General
Replies: 0 comments
I've been trying to finetune the new pixelwave-dev-3 checkpoint on my dataset, and I haven't been getting great results. I suspect it's because of the CLIP/T5 training that I imagine happened during the PixelWave training, though I could be off-base there.
On the Hugging Face repo, the T5 encoder is split into two parts (diffusers style?), and I wondered whether that is compatible with kohya training or whether it needs to be converted into a single file: https://huggingface.co/mikeyandfriends/PixelWave_FLUX.1-dev_03/tree/main/text_encoder_2
For reference, these are my training configs, in case there's some other obvious problem that could explain the poor results. For instance, faces come out as a blurry mass and the output is generally messy. The sample images are particularly crazy, and I thought maybe the `scale` parameter was set wrong (I've been dabbling with de-distilled models and might have changed it). I can't find examples of how it should be set for Flux in a TOML file, or any documented explanation of how the parameter behaves with Flux. Does it automatically represent Flux guidance on Flux models and CFG on non-Flux models?

- config.toml
- dataset.toml
- sample_promp.toml
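To make the `scale` question concrete, here is roughly the kind of entry I have in my sample prompt file. The key names are my guess at a TOML rendering of sd-scripts' text prompt syntax (`prompt --w 1024 --h 1024 --s 20 --l 3.5 --d 42`, where `--l` is the scale); whether `scale` here acts as Flux's distilled guidance value or as CFG is exactly what I'm unsure about.

```toml
# Assumed sample-prompt entry; key names are my guess, not verified docs.
[[prompt]]
text = "portrait photo of a woman, sharp focus"
width = 1024
height = 1024
sample_steps = 20
scale = 3.5   # Flux guidance? CFG? unclear to me for flux models
seed = 42
```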