---
# Training configuration for a latent flow-matching / diffusion transformer
# (SegDiT) on echocardiography latents produced by a 4x-downsampling VAE
# ("avae-4f4"). Values under `globals` are shared into each dataset entry via
# ${...} interpolation (OmegaConf-style).

globals:
  target_fps: original        # keep each clip's native frame rate
  target_nframes: 64          # frames sampled per training clip
  outputs:
    - image
    - view
  resolution: 112             # pixel resolution of decoded frames
  latent_res: 28              # latent spatial size (112 / 4)
  latent_channels: 4

# Denoiser network: SegDiT transformer instantiated from `target` with `args`.
denoiser:
  target: echosyn.common.models.SegDiTTransformer2DModel
  args:
    num_attention_heads: 16
    attention_head_dim: 64
    in_channels: 5            # 4 latent channels + 1 conditioning channel (presumably segmentation — confirm against model code)
    out_channels: 4
    num_layers: 24
    dropout: 0.0
    norm_num_groups: 32
    attention_bias: true
    sample_size: 14
    patch_size: 4
    activation_fn: gelu-approximate
    num_embeds_ada_norm: 1000
    upcast_attention: false
    norm_type: ada_norm_zero
    norm_elementwise_affine: false
    norm_eps: 1.0e-05

optimizer:
  target: torch.optim.AdamW
  args:
    lr: 5.0e-05
    betas:
      - 0.9
      - 0.999
    weight_decay: 0.01
    eps: 1.0e-08

scheduler:
  target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
  args:
    warmup_steps: 5000
    ref_steps: ${max_train_steps}   # decay horizon tied to total training steps
    eta_min: 1.0e-06
    decay_rate: 2

vae:
  target: diffusers.AutoencoderKL
  pretrained: vae/avae-4f4

# Four latent datasets; the `lvh` set has no segmentations (no_seg sentinel).
datasets:
  - name: LatentSeg
    active: true
    params:
      root: avae-4f4/dynamic
      outputs: ${globals.outputs}
      target_fps: ${globals.target_fps}
      view_label: A4C
      target_nframes: ${globals.target_nframes}
      latent_channels: ${globals.latent_channels}
      segmentation_root: segmentations/dynamic
      target_resolution: ${globals.latent_res}
  - name: LatentSeg
    active: true
    params:
      root: avae-4f4/ped_a4c
      outputs: ${globals.outputs}
      target_fps: ${globals.target_fps}
      view_label: A4C
      target_nframes: ${globals.target_nframes}
      latent_channels: ${globals.latent_channels}
      segmentation_root: segmentations/ped_a4c
      target_resolution: ${globals.latent_res}
  - name: LatentSeg
    active: true
    params:
      root: avae-4f4/ped_psax
      outputs: ${globals.outputs}
      target_fps: ${globals.target_fps}
      view_label: PSAX
      target_nframes: ${globals.target_nframes}
      latent_channels: ${globals.latent_channels}
      segmentation_root: segmentations/ped_psax
      target_resolution: ${globals.latent_res}
  - name: LatentSeg
    active: true
    params:
      root: avae-4f4/lvh
      outputs: ${globals.outputs}
      target_fps: ${globals.target_fps}
      view_label: PLAX
      target_nframes: ${globals.target_nframes}
      latent_channels: ${globals.latent_channels}
      segmentation_root: no_seg
      target_resolution: ${globals.latent_res}

dataloader:
  target: torch.utils.data.DataLoader
  args:
    shuffle: true
    batch_size: 128
    num_workers: 16
    pin_memory: true
    drop_last: true
    persistent_workers: true

# Training loop hyperparameters.
max_train_steps: 1000000
gradient_accumulation_steps: 1
mixed_precision: fp16
use_ema: true
noise_offset: 0.1
max_grad_norm: 0.1
max_grad_value: -1            # -1 presumably disables value clipping — confirm against trainer
pad_latents: false
sample_latents: true

# Experiment tracking / output locations.
output_dir: experiments/${wandb_args.name}
logging_dir: logs
report_to: wandb
wandb_args:
  project: EchoFlow
  name: FMiT-L4-4f4
  group: FMiT

# Checkpointing: keep a rolling window plus these milestone steps.
checkpointing_steps: 10000
checkpoints_to_keep:
  - 50000
  - 100000
  - 200000
  - 500000
  - 1000000
resume_from_checkpoint: latest

validation:
  samples: 4
  steps: 5000                 # run validation every 5000 training steps
  method: euler
  timesteps: 25
  seed: 42

num_train_epochs: 45455