# opennmt_py_transformer_config.yml
# Specific to the GPU server
save_data: INSERT-OUTPUT-DIR
# Prevent overwriting existing files in the folder
overwrite: False
# Corpus opts:
data:
    corpus_1:
        path_src: INSERT-DATA-DIR/src-train.txt
        path_tgt: INSERT-DATA-DIR/tgt-train.txt
    valid:
        path_src: INSERT-DATA-DIR/src-val.txt
        path_tgt: INSERT-DATA-DIR/tgt-val.txt
# Shared vocab path since input & output are the same language
src_vocab: INSERT-DATA-DIR/vocab.txt
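# The vocab file above is normally generated from this same config before
# training, e.g. with OpenNMT-py's build_vocab entry point (a sketch; the
# exact flags may differ between OpenNMT-py versions):
#   onmt_build_vocab -config opennmt_py_transformer_config.yml -n_sample -1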
save_model: little_data_transformer
save_checkpoint_steps: 2000
early_stopping: 3
valid_steps: 2000
# train_steps: 20000 # Relying on early stopping
# -------------
# MODEL
# Mostly standard Transformer configs from https://github.com/OpenNMT/OpenNMT-py/blob/master/config/config-transformer-base-1GPU.yml
# but slightly adjusted for a smaller dataset.
# -------------
# Custom
src_seq_length: 200 # For camelcase (115 file tokens), max=362, avg=138, std=14
tgt_seq_length: 150 # For camelcase, max=487, avg=43, std=38
share_vocab: true
# share_embeddings: true # May have significantly degraded results
# Batching
queue_size: 10000 # Default: 40
bucket_size: 32768 # Default: 2048
world_size: 1
gpu_ranks: [0]
batch_type: "tokens" # In contrast to "sents" (sentences)
batch_size: 4096 # Default: 64
valid_batch_size: 8 # Default: 32
max_generator_batches: 2 # Default: 32
accum_count: [4]
accum_steps: [0] # Default
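# With batch_type "tokens", each optimizer update accumulates roughly
# batch_size * accum_count = 4096 * 4 = 16384 tokens of gradient signal
# on the single GPU (world_size: 1) before weights are updated.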
# Optimization
model_dtype: "fp32" # Default
optim: "adam"
learning_rate: 2
warmup_steps: 2000 # Default: 4000
decay_method: "noam" # Default: None
adam_beta2: 0.998
max_grad_norm: 0
label_smoothing: 0.1
param_init: 0
param_init_glorot: true
normalization: "tokens"
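# With decay_method "noam", learning_rate acts as a scale factor on the
# warmup/decay schedule from "Attention Is All You Need" (sketch of the
# usual formula, taking d_model = rnn_size = 512):
#   lr(step) ~ learning_rate * d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)
# i.e. the rate peaks around warmup_steps (2000 here) and decays afterwards.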
# Model
encoder_type: transformer
decoder_type: transformer
position_encoding: true
enc_layers: 6
dec_layers: 6
heads: 8
rnn_size: 512
word_vec_size: 512
transformer_ff: 2048 # Default
dropout_steps: [0] # Default
dropout: [0.1]
attention_dropout: [0.1] # Default
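# Training and inference would typically be launched with the standard
# OpenNMT-py CLI (a sketch; the checkpoint step and test-set paths are
# placeholders, not part of this config):
#   onmt_train -config opennmt_py_transformer_config.yml
#   onmt_translate -model little_data_transformer_step_XXXX.pt \
#       -src INSERT-DATA-DIR/src-test.txt -output pred.txt -gpu 0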