Skip to content

Commit 6e68e01

Browse files
authored
[examples] update paraformer results on aishell (#2324)
* [examples] update paraformer results on aishell
* [examples] update paraformer results on aishell
1 parent 908f5f8 commit 6e68e01

File tree

4 files changed

+55
-43
lines changed

4 files changed

+55
-43
lines changed

examples/aishell/paraformer/README.md

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5,36 +5,41 @@ output_dir=exp/paraformer/large
55
mkdir -p ${output_dir}
66
. ./path.sh && python wenet/paraformer/convert_paraformer_to_wenet_config_and_ckpt.py \
77
--output_dir ${output_dir}
8+
# Initialize ctc and embed (the latter is used in the sampler)
9+
python local/modify_ckpt.py \
10+
--add_list "{\"ctc.ctc_lo.weight\": \"decoder.embed.0.weight\", \"embed.0.weight\": \"decoder.embed.0.weight\"}" \
11+
--input_ckpt exp/paraformer/large/wenet_paraformer.pt \
12+
--output_ckpt exp/paraformer/large/wenet_paraformer.init-ctc.init-embed.pt
813
```
914

1015
# Performance Record
1116

1217
## Paraformer (original) Result
1318

14-
| decoding mode | CER |
15-
|---------------------------|-------|
16-
| paraformer greedy search | 1.95 |
19+
| decoding mode | full | 16 |
20+
|---------------------------|-------|-----|
21+
| paraformer greedy search | 1.95 | N/A |
1722

1823
## Paraformer (full-parameter tuning) Result
1924

20-
* Training info: batch size 28, ctc_weight: 0.3, acc_grad 4, 8 * v100 gpu, 40 epochs
25+
* Training info: torch_ddp fp32, batch size 28, ctc_weight: 0.3, acc_grad 1, 8 * 3090 gpu, 60 epochs (about 8h)
2126
* Decoding info: ctc_weight 0.3, average_num 5
2227
* Git hash: TBD
2328

24-
| decoding mode | CER |
25-
|---------------------------|-------|
26-
| ctc greedy search | 4.00 |
27-
| ctc prefix beam search | 4.00 |
28-
| paraformer greedy search | 2.16 |
29+
| decoding mode | full | 16 |
30+
|---------------------------|-------|-----|
31+
| ctc greedy search | 3.45 % N=104765 C=101244 S=3406 D=115 I=91 | N/A |
32+
| ctc prefix beam search | 3.44 % N=104765 C=101247 S=3407 D=111 I=83 | N/A |
33+
| paraformer greedy search | 2.19 % N=104765 C=102643 S=1959 D=163 I=172 | N/A |
2934

3035
## Paraformer-dynamic training (full-parameter tuning) Result
3136

32-
* Training info: batch size 28, ctc_weight: 0.3, acc_grad 4, 8 * v100 gpu, 43 epochs
37+
* Training info: torch_ddp fp32, batch size 28, ctc_weight: 0.3, acc_grad 1, 8 * 3090 gpu, 60 epochs (about 8h)
3338
* Decoding info: ctc_weight 0.3, average_num 5
3439
* Git hash: TBD
3540

3641
| decoding mode | full | 16 |
3742
|---------------------------|--------|------|
38-
| ctc greedy search | 3.93 | 4.94 |
39-
| ctc prefix beam search | 3.93 | 4.94 |
40-
| paraformer greedy search | 2.08 | 2.41 |
43+
| ctc greedy search | 3.46 % N=104765 C=101235 S=3409 D=121 I=98 | 4.18 % N=104765 C=100495 S=4149 D=121 I=107 |
44+
| ctc prefix beam search | 3.45 % N=104765 C=101239 S=3413 D=113 I=91 | 4.17 % N=104765 C=100500 S=4150 D=115 I=103 |
45+
| paraformer greedy search | 2.15 % N=104765 C=102640 S=1977 D=148 I=132 | 2.40 % N=104765 C=102409 S=2220 D=136 I=161 |

examples/aishell/paraformer/conf/train_paraformer.yaml

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,32 +2,32 @@ input_dim: 560
22

33
encoder: sanm_encoder
44
encoder_conf:
5-
attention_dropout_rate: 0.1
5+
attention_dropout_rate: 0.0
66
attention_heads: 4
7-
dropout_rate: 0.1
7+
dropout_rate: 0.0
88
input_layer: paraformer_dummy
99
kernel_size: 11
1010
linear_units: 2048
1111
normalize_before: true
1212
num_blocks: 50
1313
output_size: 512
1414
pos_enc_layer_type: abs_pos_paraformer
15-
positional_dropout_rate: 0.1
15+
positional_dropout_rate: 0.0
1616
sanm_shfit: 0
1717
gradient_checkpointing: true
1818

1919
decoder: sanm_decoder
2020
decoder_conf:
2121
att_layer_num: 16
2222
attention_heads: 4
23-
dropout_rate: 0.1
23+
dropout_rate: 0.0
2424
kernel_size: 11
2525
linear_units: 2048
2626
num_blocks: 16
27-
positional_dropout_rate: 0.1
27+
positional_dropout_rate: 0.0
2828
sanm_shfit: 0
29-
self_attention_dropout_rate: 0.1
30-
src_attention_dropout_rate: 0.1
29+
self_attention_dropout_rate: 0.0
30+
src_attention_dropout_rate: 0.0
3131
gradient_checkpointing: true
3232

3333
tokenizer: paraformer
@@ -102,12 +102,12 @@ dataset_conf:
102102

103103
grad_clip: 5
104104
accum_grad: 1
105-
max_epoch: 45
105+
max_epoch: 60
106106
log_interval: 100
107107

108108
optim: adam
109109
optim_conf:
110110
lr: 0.0005
111111
scheduler: warmuplr
112112
scheduler_conf:
113-
warmup_steps: 25000
113+
warmup_steps: 12000

examples/aishell/paraformer/conf/train_paraformer_dynamic.yaml

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,17 @@ input_dim: 560
22

33
encoder: sanm_encoder
44
encoder_conf:
5-
attention_dropout_rate: 0.1
5+
attention_dropout_rate: 0.0
66
attention_heads: 4
7-
dropout_rate: 0.1
7+
dropout_rate: 0.0
88
input_layer: paraformer_dummy
99
kernel_size: 11
1010
linear_units: 2048
1111
normalize_before: true
1212
num_blocks: 50
1313
output_size: 512
1414
pos_enc_layer_type: abs_pos_paraformer
15-
positional_dropout_rate: 0.1
15+
positional_dropout_rate: 0.0
1616
sanm_shfit: 0
1717
use_dynamic_chunk: true
1818
gradient_checkpointing: true
@@ -21,14 +21,14 @@ decoder: sanm_decoder
2121
decoder_conf:
2222
att_layer_num: 16
2323
attention_heads: 4
24-
dropout_rate: 0.1
24+
dropout_rate: 0.0
2525
kernel_size: 11
2626
linear_units: 2048
2727
num_blocks: 16
28-
positional_dropout_rate: 0.1
28+
positional_dropout_rate: 0.0
2929
sanm_shfit: 0
30-
self_attention_dropout_rate: 0.1
31-
src_attention_dropout_rate: 0.1
30+
self_attention_dropout_rate: 0.0
31+
src_attention_dropout_rate: 0.0
3232
gradient_checkpointing: true
3333

3434
tokenizer: paraformer
@@ -103,12 +103,12 @@ dataset_conf:
103103

104104
grad_clip: 5
105105
accum_grad: 1
106-
max_epoch: 45
106+
max_epoch: 60
107107
log_interval: 100
108108

109109
optim: adam
110110
optim_conf:
111111
lr: 0.0005
112112
scheduler: warmuplr
113113
scheduler_conf:
114-
warmup_steps: 25000
114+
warmup_steps: 12000

examples/aishell/paraformer/run.sh

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,13 @@ job_id=2024
2828
# data_type can be `raw` or `shard`. Typically, `raw` is used for small datasets;
2929
# `shard` is used for large datasets (over 1k hours), and `shard` is also
3030
# faster at reading data and training.
31-
data_type=raw
31+
data_type=shard
3232

3333
train_set=train
3434

35-
train_config=conf/train_paraformer.yaml
36-
checkpoint=exp/paraformer/large/wenet_paraformer.pt
37-
dir=exp/finetune_paraformer
35+
train_config=conf/train_paraformer_dynamic.yaml
36+
checkpoint=exp/paraformer/large/wenet_paraformer.init-ctc.init-embed.pt
37+
dir=exp/finetune_paraformer_dynamic
3838
tensorboard_dir=tensorboard
3939
num_workers=8
4040
prefetch=500
@@ -44,6 +44,12 @@ average_checkpoint=true
4444
decode_checkpoint=$dir/final.pt
4545
average_num=5
4646
decode_modes="ctc_greedy_search ctc_prefix_beam_search paraformer_greedy_search"
47+
decode_device=0
48+
decoding_chunk_size=-1
49+
decode_batch=16
50+
ctc_weight=0.3
51+
reverse_weight=0.5
52+
max_epoch=100
4753

4854
train_engine=torch_ddp
4955

@@ -124,36 +130,37 @@ fi
124130
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
125131
# Test the model. Please specify the model you want to test with --checkpoint
126132
if [ ${average_checkpoint} == true ]; then
127-
decode_checkpoint=$dir/avg_${average_num}.pt
133+
decode_checkpoint=$dir/avg_${average_num}_maxepoch_${max_epoch}.pt
128134
echo "do model average and final checkpoint is $decode_checkpoint"
129135
python wenet/bin/average_model.py \
130136
--dst_model $decode_checkpoint \
131137
--src_path $dir \
132138
--num ${average_num} \
139+
--max_epoch ${max_epoch} \
133140
--val_best
134141
fi
135142
# Please specify decoding_chunk_size for unified streaming and
136143
# non-streaming models. The default value is -1, which means full chunk
137144
# for non-streaming inference.
138-
decoding_chunk_size=
139-
ctc_weight=0.3
140-
reverse_weight=0.5
141-
python wenet/bin/recognize.py --gpu 0 \
145+
base=$(basename $decode_checkpoint)
146+
result_dir=$dir/${base}_chunk${decoding_chunk_size}_ctc${ctc_weight}_reverse${reverse_weight}
147+
mkdir -p ${result_dir}
148+
python wenet/bin/recognize.py --gpu ${decode_device} \
142149
--modes $decode_modes \
143150
--config $dir/train.yaml \
144151
--data_type $data_type \
145152
--test_data data/test/data.list \
146153
--checkpoint $decode_checkpoint \
147154
--beam_size 10 \
148-
--batch_size 16 \
155+
--batch_size ${decode_batch} \
149156
--penalty 0.0 \
150157
--ctc_weight $ctc_weight \
151158
--reverse_weight $reverse_weight \
152-
--result_dir $dir \
159+
--result_dir $result_dir \
153160
${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
154161
for mode in ${decode_modes}; do
155162
python tools/compute-wer.py --char=1 --v=1 \
156-
data/test/text $dir/$mode/text > $dir/$mode/wer
163+
data/test/text $result_dir/$mode/text > $result_dir/$mode/wer
157164
done
158165
fi
159166

0 commit comments

Comments
 (0)