# @package __global__

# config/solver/musicgen/default.yaml

# Hydra defaults list: the configs composed to build the final config.
# Order matters: entries are merged in sequence, and `_self_` marks the
# position at which the values defined in this file are merged.
defaults:
  - /solver/default
  - /conditioner: none
  - _self_
  # Listed after `_self_`, i.e. merged after this file's own values.
  - /solver/musicgen/evaluation: none
  # `override` changes the option selected for the `dset` group
  # (NOTE(review): presumably first selected by an earlier default — confirm).
  - override /dset: audio/default

# Mixed-precision switches.
# NOTE(review): presumably read by the solver to enable autocast with the
# given dtype — confirm against the solver code.
autocast: true
autocast_dtype: float16

solver: musicgen
# `???` is OmegaConf's "mandatory missing" marker: the three values below
# have no default here and must be supplied by another config or by a
# command-line override.
sample_rate: ???
channels: ???
compression_model_checkpoint: ???
# Sets the number of codebooks on the underlying compression model. This may
# differ from the actual n_q value given to the transformer when the model
# output is post-processed, for instance for stereo channels. If not
# provided (null), the compression model's default value is used.
compression_model_n_q: null

# Token handling options.
# NOTE(review): presumably controls whether padded positions are filled with
# the special token — confirm in the solver.
tokens:
  padding_with_special_token: false

# Stereo codebook interleaving; switched off by default (`use: false`).
# NOTE(review): `per_timestep` presumably selects the interleaving
# granularity — confirm in the solver.
interleave_stereo_codebooks:
  use: false
  per_timestep: false

# Cache settings. No cache path is configured by default.
# NOTE(review): `write_shard` / `write_num_shards` presumably identify which
# shard of the cache this run writes (shard 0 of 1 by default) — confirm in
# the solver.
cache:
  path: null  # explicit null rather than a bare empty value (same loaded value)
  write: false
  write_shard: 0
  write_num_shards: 1


# Dataset and data-loading settings. The `train`, `valid` and `generate`
# sub-sections set the number of samples for each split.
dataset:
  batch_size: 128
  num_workers: 10
  segment_duration: 30
  min_segment_ratio: 0.8  # lower values such as 0.5 result in generations with a lot of silence.
  return_info: true
  train:
    num_samples: 1000000  # need an arbitrarily large number here for AudioDataset
  valid:
    num_samples: 10000
  generate:
    num_samples: 50

# Per-metric configuration. In every entry, `model` names the sibling key
# that holds the settings of that model (e.g. `model: tf` -> `tf:`).
# NOTE(review): the `//reference/...` values look like a project-specific
# path scheme — confirm how they are resolved.
metrics:
  fad:
    use_gt: false
    model: tf
    tf:
      bin: null  # path to local frechet_audio_distance code
      model_path: //reference/fad/vggish_model.ckpt
  kld:
    use_gt: false
    model: passt
    passt:
      pretrained_length: 20
  text_consistency:
    use_gt: false
    model: clap
    clap:
      model_path: //reference/clap/music_audioset_epoch_15_esc_90.14.pt
      model_arch: 'HTSAT-base'
      enable_fusion: false
  chroma_cosine:
    use_gt: false
    model: chroma_base
    chroma_base:
      sample_rate: ${sample_rate}  # interpolated from the top-level `sample_rate`
      n_chroma: 12
      radix2_exp: 14
      argmax: true

# Sample generation settings: how often to generate, how the audio is
# written (`audio`), and how the language model is sampled (`lm`).
# NOTE(review): `every` is presumably a period in epochs — confirm.
generate:
  every: 25
  num_workers: 5
  path: samples
  audio:
    format: wav
    strategy: loudness
    sample_rate: ${sample_rate}  # interpolated from the top-level `sample_rate`
    loudness_headroom_db: 14
  lm:
    prompted_samples: true
    unprompted_samples: true
    no_text_conditioning: false
    gen_gt_samples: false
    prompt_duration: null   # if not set, will use dataset.generate.segment_duration / 4
    gen_duration: null      # if not set, will use dataset.generate.segment_duration
    remove_prompts: false
    # generation params
    use_sampling: false
    temp: 1.0
    top_k: 0
    top_p: 0.0

# Evaluation settings. Every metric toggle below is off by default.
# NOTE(review): `every` is presumably a period in epochs — confirm.
evaluate:
  every: 25
  num_workers: 5
  metrics:
    base: false
    fad: false
    kld: false
    text_consistency: false
    chroma_cosine: false

# Checkpointing settings.
# NOTE(review): `save_every` / `keep_last` are presumably counted in epochs
# and in number of checkpoints respectively — confirm in the solver.
checkpoint:
  save_last: true
  save_every: 50
  keep_last: 10
  keep_every_states: null

# Optimization settings: training length, learning rate, optimizer choice
# and the hyper-parameters under `adam`.
# Scientific-notation floats are written with a mantissa dot (`1.0e-4`) so
# that YAML 1.1 loaders, which need the dot to recognise a float, do not
# read them as strings.
optim:
  epochs: 200
  updates_per_epoch: 2000
  lr: 1.0e-4
  optimizer: adamw
  max_norm: 1.0
  eager_sync: true
  adam:
    betas: [0.9, 0.95]
    weight_decay: 0.1
    eps: 1.0e-8

# Learning-rate schedule; no scheduler is selected by default (null).
schedule:
  lr_scheduler: null