-
Notifications
You must be signed in to change notification settings - Fork 2.3k
/
Copy pathdefault.yaml
131 lines (117 loc) · 2.71 KB
/
default.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# @package __global__
# Hydra defaults list for this musicgen solver configuration.
# `_self_` marks where this file's own values sit relative to the other entries.
defaults:
  - /solver/default
  - /conditioner: none
  - _self_
  - /solver/musicgen/evaluation: none
  - override /dset: audio/default
# Autocast toggle and the dtype used when it is enabled.
autocast: true
autocast_dtype: float16
# Solver selected by this configuration.
solver: musicgen
# `???` entries have no default in this file (OmegaConf mandatory-value marker);
# they must be supplied by another config or by an override.
sample_rate: ???
channels: ???
compression_model_checkpoint: ???
# Sets the number of codebooks on the underlying compression model. This may
# differ from the actual n_q value given to the transformer when the model
# output is post-processed, for instance for stereo channels. If not provided,
# the default value of the compression model is used.
compression_model_n_q: null
# Token handling options.
tokens:
  padding_with_special_token: false
# Stereo codebook interleaving; disabled by default (`use: false`).
interleave_stereo_codebooks:
  use: false
  per_timestep: false
# Cache settings. No cache path is configured by default.
cache:
  path: null  # explicit null instead of a bare empty value
  write: false
  write_shard: 0
  write_num_shards: 1
# Dataset settings shared by all stages, followed by per-stage sample counts
# (`train`, `valid`, `generate`).
dataset:
  batch_size: 128
  num_workers: 10
  segment_duration: 30
  min_segment_ratio: 0.8  # lower values such as 0.5 result in generations with a lot of silence.
  return_info: true
  train:
    num_samples: 1000000  # need a randomly large number here for AudioDataset
  valid:
    num_samples: 10000
  generate:
    num_samples: 50
# Per-metric settings. Each metric names a `model` and keeps that model's
# options in a sub-mapping with the same name.
metrics:
  fad:
    use_gt: false
    model: tf
    tf:
      bin: null  # path to local frechet_audio_distance code
      model_path: //reference/fad/vggish_model.ckpt
  kld:
    use_gt: false
    model: passt
    passt:
      pretrained_length: 20
  text_consistency:
    use_gt: false
    model: clap
    clap:
      model_path: //reference/clap/music_audioset_epoch_15_esc_90.14.pt
      model_arch: 'HTSAT-base'
      enable_fusion: false
  chroma_cosine:
    use_gt: false
    model: chroma_base
    chroma_base:
      sample_rate: ${sample_rate}  # interpolates the top-level sample_rate
      n_chroma: 12
      radix2_exp: 14
      argmax: true
# Sample generation settings: scheduling, output audio options (`audio`) and
# language-model generation options (`lm`).
generate:
  every: 25
  num_workers: 5
  path: samples
  audio:
    format: wav
    strategy: loudness
    sample_rate: ${sample_rate}  # interpolates the top-level sample_rate
    loudness_headroom_db: 14
  lm:
    prompted_samples: true
    unprompted_samples: true
    no_text_conditioning: false
    gen_gt_samples: false
    prompt_duration: null  # if not set, will use dataset.generate.segment_duration / 4
    gen_duration: null  # if not set, will use dataset.generate.segment_duration
    remove_prompts: false
    # generation params
    use_sampling: false
    temp: 1.0
    top_k: 0
    top_p: 0.0
# Evaluation settings. Every metric switch under `metrics` is off by default.
evaluate:
  every: 25
  num_workers: 5
  metrics:
    base: false
    fad: false
    kld: false
    text_consistency: false
    chroma_cosine: false
# Checkpoint saving and retention settings.
checkpoint:
  save_last: true
  save_every: 50
  keep_last: 10
  keep_every_states: null
# Optimization settings; `adam` holds the options for the adamw optimizer
# selected below.
optim:
  epochs: 200
  updates_per_epoch: 2000
  # Written with an explicit mantissa dot so that YAML 1.1 loaders (which
  # require a `.` in floats) read a float rather than the string "1e-4".
  lr: 1.0e-4
  optimizer: adamw
  max_norm: 1.0
  eager_sync: true
  adam:
    betas: [0.9, 0.95]
    weight_decay: 0.1
    eps: 1.0e-8  # same float-spelling precaution as `lr`
# Learning-rate schedule; no scheduler is selected by default.
schedule:
  lr_scheduler: null