
Commit b56dfa2

Merge pull request #1140 from FunAudioLLM/dev/lyuxiang.lx
Dev/lyuxiang.lx
2 parents 08312f4 + f0b8e89 commit b56dfa2

33 files changed: +1904 −301 lines

Diff for: README.md

+1 −1
@@ -128,7 +128,7 @@ import torchaudio
 
 **CosyVoice2 Usage**
 ```python
-cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=False, load_trt=False, fp16=False)
+cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=False, load_trt=False, fp16=False, use_flow_cache=False)
 
 # NOTE if you want to reproduce the results on https://funaudiollm.github.io/cosyvoice2, please add text_frontend=False during inference
 # zero_shot usage

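The README change only adds the new `use_flow_cache` argument to the constructor call. For context, a minimal sketch of the updated zero-shot usage is shown below; it assumes the CosyVoice2-0.5B checkpoint is already downloaded to `pretrained_models/CosyVoice2-0.5B`, that `third_party/Matcha-TTS` is importable, and the prompt wav path and texts are placeholders in the style of the surrounding README example rather than content of this diff.

```python
import sys
sys.path.append('third_party/Matcha-TTS')

import torchaudio
from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.utils.file_utils import load_wav

# use_flow_cache=False is the non-streaming default shown in the updated README;
# the export scripts in this commit set it to True to trace the cached (chunk-wise) flow path.
cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B',
                       load_jit=False, load_trt=False, fp16=False, use_flow_cache=False)

# zero-shot usage in the style of the README example (paths and texts are placeholders)
prompt_speech_16k = load_wav('./asset/zero_shot_prompt.wav', 16000)
for i, j in enumerate(cosyvoice.inference_zero_shot(
        'This is the sentence to synthesize.', 'This is the prompt transcript.',
        prompt_speech_16k, stream=False)):
    torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
```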
Diff for: cosyvoice/bin/average_model.py

+5 −4
@@ -75,10 +75,11 @@ def main():
         print('Processing {}'.format(path))
         states = torch.load(path, map_location=torch.device('cpu'))
         for k in states.keys():
-            if k not in avg.keys():
-                avg[k] = states[k].clone()
-            else:
-                avg[k] += states[k]
+            if k not in ['step', 'epoch']:
+                if k not in avg.keys():
+                    avg[k] = states[k].clone()
+                else:
+                    avg[k] += states[k]
     # average
     for k in avg.keys():
         if avg[k] is not None:

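The only functional change here is that non-tensor bookkeeping entries (`step`, `epoch`) are skipped when summing checkpoints, so they are no longer averaged into the state dict. A standalone sketch of the resulting logic is below; the final division step is assumed to follow the usual `torch.true_divide(avg[k], num)` pattern of such averaging scripts and is not shown in this hunk.

```python
import torch

def average_checkpoints(paths):
    """Average several .pt checkpoints, skipping 'step'/'epoch' as in the patched script."""
    avg = {}
    num = len(paths)
    for path in paths:
        states = torch.load(path, map_location=torch.device('cpu'))
        for k in states.keys():
            if k not in ['step', 'epoch']:      # the new guard from this diff
                if k not in avg.keys():
                    avg[k] = states[k].clone()
                else:
                    avg[k] += states[k]
    # average (division step assumed, mirroring typical average_model scripts)
    for k in avg.keys():
        if avg[k] is not None:
            avg[k] = torch.true_divide(avg[k], num)
    return avg
```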
Diff for: cosyvoice/bin/export_jit.py

+20 −7
@@ -24,6 +24,7 @@
 sys.path.append('{}/../..'.format(ROOT_DIR))
 sys.path.append('{}/../../third_party/Matcha-TTS'.format(ROOT_DIR))
 from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
+from cosyvoice.utils.file_utils import logging
 
 
 def get_args():
@@ -60,7 +61,8 @@ def main():
         model = CosyVoice(args.model_dir)
     except Exception:
         try:
-            model = CosyVoice2(args.model_dir)
+            # NOTE set use_flow_cache=True when export jit for cache inference
+            model = CosyVoice2(args.model_dir, use_flow_cache=True)
         except Exception:
             raise TypeError('no valid model_type!')
 

@@ -71,20 +73,31 @@ def main():
         script.save('{}/llm.text_encoder.fp32.zip'.format(args.model_dir))
         script = get_optimized_script(llm_text_encoder.half())
         script.save('{}/llm.text_encoder.fp16.zip'.format(args.model_dir))
+        logging.info('successfully export llm_text_encoder')
 
         # 2. export llm llm
         llm_llm = model.model.llm.llm
         script = get_optimized_script(llm_llm, ['forward_chunk'])
         script.save('{}/llm.llm.fp32.zip'.format(args.model_dir))
         script = get_optimized_script(llm_llm.half(), ['forward_chunk'])
         script.save('{}/llm.llm.fp16.zip'.format(args.model_dir))
+        logging.info('successfully export llm_llm')
 
-    # 3. export flow encoder
-    flow_encoder = model.model.flow.encoder
-    script = get_optimized_script(flow_encoder)
-    script.save('{}/flow.encoder.fp32.zip'.format(args.model_dir))
-    script = get_optimized_script(flow_encoder.half())
-    script.save('{}/flow.encoder.fp16.zip'.format(args.model_dir))
+        # 3. export flow encoder
+        flow_encoder = model.model.flow.encoder
+        script = get_optimized_script(flow_encoder)
+        script.save('{}/flow.encoder.fp32.zip'.format(args.model_dir))
+        script = get_optimized_script(flow_encoder.half())
+        script.save('{}/flow.encoder.fp16.zip'.format(args.model_dir))
+        logging.info('successfully export flow_encoder')
+    else:
+        # 3. export flow encoder
+        flow_encoder = model.model.flow.encoder
+        script = get_optimized_script(flow_encoder, ['forward_chunk'])
+        script.save('{}/flow.encoder.fp32.zip'.format(args.model_dir))
+        script = get_optimized_script(flow_encoder.half(), ['forward_chunk'])
+        script.save('{}/flow.encoder.fp16.zip'.format(args.model_dir))
+        logging.info('successfully export flow_encoder')
 
 
 if __name__ == '__main__':

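Two things change in this script: CosyVoice2 is now loaded with `use_flow_cache=True`, and its flow encoder is exported through `forward_chunk` rather than the plain `forward`, so the saved TorchScript module matches the cached streaming path. The repo's `get_optimized_script` helper is defined elsewhere in export_jit.py and is not part of this diff; the sketch below uses a hypothetical `script_for_inference` helper built from standard `torch.jit` calls only to illustrate why `'forward_chunk'` has to be preserved through freezing.

```python
import torch

def script_for_inference(module, preserved_methods=None):
    # Hypothetical stand-in for export_jit.py's get_optimized_script():
    # script the module, then freeze it while keeping extra entry points alive.
    scripted = torch.jit.script(module.eval())
    if preserved_methods:
        # without preserved_attrs, freezing would drop methods other than forward
        scripted = torch.jit.freeze(scripted, preserved_attrs=preserved_methods)
    else:
        scripted = torch.jit.freeze(scripted)
    return scripted

# Usage mirroring the diff; `model` and `model_dir` are assumed to come from the
# surrounding script rather than defined here.
# flow_encoder = model.model.flow.encoder
# script = script_for_inference(flow_encoder, ['forward_chunk'])
# script.save('{}/flow.encoder.fp32.zip'.format(model_dir))
```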
Diff for: cosyvoice/bin/export_onnx.py

+125 −47
@@ -28,6 +28,7 @@
 sys.path.append('{}/../..'.format(ROOT_DIR))
 sys.path.append('{}/../../third_party/Matcha-TTS'.format(ROOT_DIR))
 from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
+from cosyvoice.utils.file_utils import logging
 
 
 def get_dummy_input(batch_size, seq_len, out_channels, device):
@@ -51,6 +52,7 @@ def get_args():
     return args
 
 
+@torch.no_grad()
 def main():
     args = get_args()
     logging.basicConfig(level=logging.DEBUG,
@@ -60,56 +62,132 @@ def main():
         model = CosyVoice(args.model_dir)
     except Exception:
         try:
-            model = CosyVoice2(args.model_dir)
+            # NOTE set use_flow_cache=True when export jit for cache inference
+            model = CosyVoice2(args.model_dir, use_flow_cache=True)
         except Exception:
             raise TypeError('no valid model_type!')
 
-    # 1. export flow decoder estimator
-    estimator = model.model.flow.decoder.estimator
-
-    device = model.model.device
-    batch_size, seq_len = 2, 256
-    out_channels = model.model.flow.decoder.estimator.out_channels
-    x, mask, mu, t, spks, cond = get_dummy_input(batch_size, seq_len, out_channels, device)
-    torch.onnx.export(
-        estimator,
-        (x, mask, mu, t, spks, cond),
-        '{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
-        export_params=True,
-        opset_version=18,
-        do_constant_folding=True,
-        input_names=['x', 'mask', 'mu', 't', 'spks', 'cond'],
-        output_names=['estimator_out'],
-        dynamic_axes={
-            'x': {2: 'seq_len'},
-            'mask': {2: 'seq_len'},
-            'mu': {2: 'seq_len'},
-            'cond': {2: 'seq_len'},
-            'estimator_out': {2: 'seq_len'},
-        }
-    )
-
-    # 2. test computation consistency
-    option = onnxruntime.SessionOptions()
-    option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
-    option.intra_op_num_threads = 1
-    providers = ['CUDAExecutionProvider' if torch.cuda.is_available() else 'CPUExecutionProvider']
-    estimator_onnx = onnxruntime.InferenceSession('{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
-                                                  sess_options=option, providers=providers)
-
-    for _ in tqdm(range(10)):
-        x, mask, mu, t, spks, cond = get_dummy_input(batch_size, random.randint(16, 512), out_channels, device)
-        output_pytorch = estimator(x, mask, mu, t, spks, cond)
-        ort_inputs = {
-            'x': x.cpu().numpy(),
-            'mask': mask.cpu().numpy(),
-            'mu': mu.cpu().numpy(),
-            't': t.cpu().numpy(),
-            'spks': spks.cpu().numpy(),
-            'cond': cond.cpu().numpy()
-        }
-        output_onnx = estimator_onnx.run(None, ort_inputs)[0]
-        torch.testing.assert_allclose(output_pytorch, torch.from_numpy(output_onnx).to(device), rtol=1e-2, atol=1e-4)
+    if not isinstance(model, CosyVoice2):
+        # 1. export flow decoder estimator
+        estimator = model.model.flow.decoder.estimator
+        estimator.eval()
+
+        device = model.model.device
+        batch_size, seq_len = 2, 256
+        out_channels = model.model.flow.decoder.estimator.out_channels
+        x, mask, mu, t, spks, cond = get_dummy_input(batch_size, seq_len, out_channels, device)
+        torch.onnx.export(
+            estimator,
+            (x, mask, mu, t, spks, cond),
+            '{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
+            export_params=True,
+            opset_version=18,
+            do_constant_folding=True,
+            input_names=['x', 'mask', 'mu', 't', 'spks', 'cond'],
+            output_names=['estimator_out'],
+            dynamic_axes={
+                'x': {2: 'seq_len'},
+                'mask': {2: 'seq_len'},
+                'mu': {2: 'seq_len'},
+                'cond': {2: 'seq_len'},
+                'estimator_out': {2: 'seq_len'},
+            }
+        )
+
+        # 2. test computation consistency
+        option = onnxruntime.SessionOptions()
+        option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+        option.intra_op_num_threads = 1
+        providers = ['CUDAExecutionProvider' if torch.cuda.is_available() else 'CPUExecutionProvider']
+        estimator_onnx = onnxruntime.InferenceSession('{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
+                                                      sess_options=option, providers=providers)
+
+        for _ in tqdm(range(10)):
+            x, mask, mu, t, spks, cond = get_dummy_input(batch_size, random.randint(16, 512), out_channels, device)
+            output_pytorch = estimator(x, mask, mu, t, spks, cond)
+            ort_inputs = {
+                'x': x.cpu().numpy(),
+                'mask': mask.cpu().numpy(),
+                'mu': mu.cpu().numpy(),
+                't': t.cpu().numpy(),
+                'spks': spks.cpu().numpy(),
+                'cond': cond.cpu().numpy()
+            }
+            output_onnx = estimator_onnx.run(None, ort_inputs)[0]
+            torch.testing.assert_allclose(output_pytorch, torch.from_numpy(output_onnx).to(device), rtol=1e-2, atol=1e-4)
+        logging.info('successfully export estimator')
+    else:
+        # 1. export flow decoder estimator
+        estimator = model.model.flow.decoder.estimator
+        estimator.forward = estimator.forward_chunk
+        estimator.eval()
+
+        device = model.model.device
+        batch_size, seq_len = 2, 256
+        out_channels = model.model.flow.decoder.estimator.out_channels
+        x, mask, mu, t, spks, cond = get_dummy_input(batch_size, seq_len, out_channels, device)
+        cache = model.model.init_flow_cache()['decoder_cache']
+        cache.pop('offset')
+        cache = {k: v[0] for k, v in cache.items()}
+        torch.onnx.export(
+            estimator,
+            (x, mask, mu, t, spks, cond,
+             cache['down_blocks_conv_cache'],
+             cache['down_blocks_kv_cache'],
+             cache['mid_blocks_conv_cache'],
+             cache['mid_blocks_kv_cache'],
+             cache['up_blocks_conv_cache'],
+             cache['up_blocks_kv_cache'],
+             cache['final_blocks_conv_cache']),
+            '{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
+            export_params=True,
+            opset_version=18,
+            do_constant_folding=True,
+            input_names=['x', 'mask', 'mu', 't', 'spks', 'cond', 'down_blocks_conv_cache', 'down_blocks_kv_cache', 'mid_blocks_conv_cache', 'mid_blocks_kv_cache',
+                         'up_blocks_conv_cache', 'up_blocks_kv_cache', 'final_blocks_conv_cache'],
+            output_names=['estimator_out', 'down_blocks_conv_cache_out', 'down_blocks_kv_cache_out', 'mid_blocks_conv_cache_out', 'mid_blocks_kv_cache_out',
+                          'up_blocks_conv_cache_out', 'up_blocks_kv_cache_out', 'final_blocks_conv_cache_out'],
+            dynamic_axes={
+                'x': {2: 'seq_len'},
+                'mask': {2: 'seq_len'},
+                'mu': {2: 'seq_len'},
+                'cond': {2: 'seq_len'},
+                'down_blocks_kv_cache': {3: 'cache_in_len'},
+                'mid_blocks_kv_cache': {3: 'cache_in_len'},
+                'up_blocks_kv_cache': {3: 'cache_in_len'},
+                'estimator_out': {2: 'seq_len'},
+                'down_blocks_kv_cache_out': {3: 'cache_out_len'},
+                'mid_blocks_kv_cache_out': {3: 'cache_out_len'},
+                'up_blocks_kv_cache_out': {3: 'cache_out_len'},
+            }
+        )
+
+        # 2. test computation consistency
+        option = onnxruntime.SessionOptions()
+        option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+        option.intra_op_num_threads = 1
+        providers = ['CUDAExecutionProvider' if torch.cuda.is_available() else 'CPUExecutionProvider']
+        estimator_onnx = onnxruntime.InferenceSession('{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
+                                                      sess_options=option, providers=providers)
+
+        for _ in tqdm(range(10)):
+            x, mask, mu, t, spks, cond = get_dummy_input(batch_size, random.randint(16, 512), out_channels, device)
+            cache = model.model.init_flow_cache()['decoder_cache']
+            cache.pop('offset')
+            cache = {k: v[0] for k, v in cache.items()}
+            output_pytorch = estimator(x, mask, mu, t, spks, cond, **{k: v.clone() for k, v in cache.items()})
+            ort_inputs = {
+                'x': x.cpu().numpy(),
+                'mask': mask.cpu().numpy(),
+                'mu': mu.cpu().numpy(),
+                't': t.cpu().numpy(),
+                'spks': spks.cpu().numpy(),
+                'cond': cond.cpu().numpy(),
+            }
+            output_onnx = estimator_onnx.run(None, {**ort_inputs, **{k: v.clone().cpu().numpy() for k, v in cache.items()}})
+            for i, j in zip(output_pytorch, output_onnx):
+                torch.testing.assert_allclose(i, torch.from_numpy(j).to(device), rtol=1e-2, atol=1e-4)
+        logging.info('successfully export estimator')
 
 
 if __name__ == "__main__":

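For CosyVoice2 the estimator is now exported through `forward_chunk`, with one conv cache and one KV cache per block group appearing as extra ONNX inputs and matching `*_out` outputs, and the KV caches carrying dynamic `cache_in_len`/`cache_out_len` axes. Below is a hedged sketch of how a client could drive this cached graph chunk by chunk: the input/output names come from the export call above, while the initial cache values would in practice come from `model.model.init_flow_cache()` rather than being built here.

```python
import onnxruntime

CACHE_NAMES = ['down_blocks_conv_cache', 'down_blocks_kv_cache',
               'mid_blocks_conv_cache', 'mid_blocks_kv_cache',
               'up_blocks_conv_cache', 'up_blocks_kv_cache',
               'final_blocks_conv_cache']


def load_cached_estimator(onnx_path):
    # Plain CPU session; the export above writes flow.decoder.estimator.fp32.onnx
    # into the model directory.
    return onnxruntime.InferenceSession(onnx_path, providers=['CPUExecutionProvider'])


def run_estimator_chunk(session, feats, cache):
    """Run one chunk and return (estimator_out, cache for the next chunk).

    feats maps 'x', 'mask', 'mu', 't', 'spks', 'cond' to numpy arrays;
    cache maps each CACHE_NAMES entry to its current numpy value (initially
    taken from model.model.init_flow_cache() in the real pipeline).
    """
    outputs = session.run(None, {**feats, **cache})
    estimator_out = outputs[0]
    # outputs[1:] follow the *_out output_names order used in torch.onnx.export above,
    # so they can be fed straight back in as the next chunk's cache inputs.
    next_cache = dict(zip(CACHE_NAMES, outputs[1:]))
    return estimator_out, next_cache
```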
Diff for: cosyvoice/bin/export_trt.sh

−10 (this file was deleted)

Diff for: cosyvoice/bin/inference.py

+15 −5
@@ -23,7 +23,7 @@
 import torchaudio
 from hyperpyyaml import load_hyperpyyaml
 from tqdm import tqdm
-from cosyvoice.cli.model import CosyVoiceModel
+from cosyvoice.cli.model import CosyVoiceModel, CosyVoice2Model
 from cosyvoice.dataset.dataset import Dataset
 
 
@@ -33,6 +33,7 @@ def get_args():
     parser.add_argument('--prompt_data', required=True, help='prompt data file')
     parser.add_argument('--prompt_utt2data', required=True, help='prompt data file')
     parser.add_argument('--tts_text', required=True, help='tts input file')
+    parser.add_argument('--qwen_pretrain_path', required=False, help='qwen pretrain path')
     parser.add_argument('--llm_model', required=True, help='llm model file')
     parser.add_argument('--flow_model', required=True, help='flow model file')
     parser.add_argument('--hifigan_model', required=True, help='hifigan model file')
@@ -59,16 +60,25 @@ def main():
     # Init cosyvoice models from configs
     use_cuda = args.gpu >= 0 and torch.cuda.is_available()
     device = torch.device('cuda' if use_cuda else 'cpu')
-    with open(args.config, 'r') as f:
-        configs = load_hyperpyyaml(f)
+    try:
+        with open(args.config, 'r') as f:
+            configs = load_hyperpyyaml(f, overrides={'qwen_pretrain_path': args.qwen_pretrain_path})
+        model = CosyVoice2Model(configs['llm'], configs['flow'], configs['hift'], fp16=False)
+    except Exception:
+        try:
+            with open(args.config, 'r') as f:
+                configs = load_hyperpyyaml(f)
+            model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'], fp16=False)
+        except Exception:
+            raise TypeError('no valid model_type!')
 
-    model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'])
     model.load(args.llm_model, args.flow_model, args.hifigan_model)
 
     test_dataset = Dataset(args.prompt_data, data_pipeline=configs['data_pipeline'], mode='inference', shuffle=False, partition=False,
                            tts_file=args.tts_text, prompt_utt2data=args.prompt_utt2data)
     test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0)
 
+    sample_rate = configs['sample_rate']
     del configs
     os.makedirs(args.result_dir, exist_ok=True)
     fn = os.path.join(args.result_dir, 'wav.scp')
@@ -104,7 +114,7 @@ def main():
             tts_speeches = torch.concat(tts_speeches, dim=1)
             tts_key = '{}_{}'.format(utts[0], tts_index[0])
             tts_fn = os.path.join(args.result_dir, '{}.wav'.format(tts_key))
-            torchaudio.save(tts_fn, tts_speeches, sample_rate=22050)
+            torchaudio.save(tts_fn, tts_speeches, sample_rate=sample_rate, backend='soundfile')
             f.write('{} {}\n'.format(tts_key, tts_fn))
             f.flush()
     f.close()

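The batch-inference script now resolves the model type by construction: it first tries to load the config with a `qwen_pretrain_path` override and build a `CosyVoice2Model`, falling back to the legacy `CosyVoiceModel` when that fails, and it saves audio at the config's `sample_rate` instead of a hard-coded 22050 Hz. A compact sketch of that selection pattern, pulled out into a hypothetical helper for readability:

```python
from hyperpyyaml import load_hyperpyyaml
from cosyvoice.cli.model import CosyVoiceModel, CosyVoice2Model

def build_model_from_config(config_path, qwen_pretrain_path=None):
    # Hypothetical helper wrapping the try/fallback from the patched main():
    # CosyVoice2 configs take a qwen_pretrain_path override, older configs do not.
    try:
        with open(config_path, 'r') as f:
            configs = load_hyperpyyaml(f, overrides={'qwen_pretrain_path': qwen_pretrain_path})
        model = CosyVoice2Model(configs['llm'], configs['flow'], configs['hift'], fp16=False)
    except Exception:
        with open(config_path, 'r') as f:
            configs = load_hyperpyyaml(f)
        model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'], fp16=False)
    return model, configs

# downstream, wavs are written at the config's own rate, e.g.
# torchaudio.save(tts_fn, tts_speeches, sample_rate=configs['sample_rate'], backend='soundfile')
```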
Diff for: cosyvoice/bin/train.py

+7 −2
@@ -46,6 +46,7 @@ def get_args():
     parser.add_argument('--config', required=True, help='config file')
     parser.add_argument('--train_data', required=True, help='train data file')
     parser.add_argument('--cv_data', required=True, help='cv data file')
+    parser.add_argument('--qwen_pretrain_path', required=False, help='qwen pretrain path')
     parser.add_argument('--checkpoint', help='checkpoint model')
     parser.add_argument('--model_dir', required=True, help='save model dir')
     parser.add_argument('--tensorboard_dir',
@@ -97,8 +98,12 @@ def main():
     override_dict = {k: None for k in ['llm', 'flow', 'hift', 'hifigan'] if k != args.model}
     if gan is True:
         override_dict.pop('hift')
-    with open(args.config, 'r') as f:
-        configs = load_hyperpyyaml(f, overrides=override_dict)
+    try:
+        with open(args.config, 'r') as f:
+            configs = load_hyperpyyaml(f, overrides={**override_dict, 'qwen_pretrain_path': args.qwen_pretrain_path})
+    except Exception:
+        with open(args.config, 'r') as f:
+            configs = load_hyperpyyaml(f, overrides=override_dict)
     if gan is True:
         configs['train_conf'] = configs['train_conf_gan']
     configs['train_conf'].update(vars(args))

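Training gains the same `--qwen_pretrain_path` plumbing: the flag is merged into the hyperpyyaml overrides, and a fallback reload without it keeps older CosyVoice configs working. A toy illustration of the override mechanism is below; the YAML fragment and checkpoint path are invented purely for demonstration, and the note about failing overrides reflects hyperpyyaml's default behaviour of requiring overrides to match existing keys.

```python
from io import StringIO
from hyperpyyaml import load_hyperpyyaml

# Invented CosyVoice2-style fragment: the config declares a qwen_pretrain_path
# key that the new CLI flag fills in through hyperpyyaml overrides.
toy_yaml = "qwen_pretrain_path: ''\n"

configs = load_hyperpyyaml(StringIO(toy_yaml),
                           overrides={'qwen_pretrain_path': 'pretrained_models/My-Qwen-Checkpoint'})
print(configs['qwen_pretrain_path'])  # -> pretrained_models/My-Qwen-Checkpoint

# An older config that never defines this key makes the same override fail,
# which is the case the new try/except in train.py falls back from.
```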