
Commit b56dfa2

Merge pull request #1140 from FunAudioLLM/dev/lyuxiang.lx
Dev/lyuxiang.lx
2 parents 08312f4 + f0b8e89 commit b56dfa2

33 files changed: +1904 −301 lines

Diff for: README.md

+1 −1
@@ -128,7 +128,7 @@ import torchaudio
 
 **CosyVoice2 Usage**
 ```python
-cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=False, load_trt=False, fp16=False)
+cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=False, load_trt=False, fp16=False, use_flow_cache=False)
 
 # NOTE if you want to reproduce the results on https://funaudiollm.github.io/cosyvoice2, please add text_frontend=False during inference
 # zero_shot usage

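The README change only adds the new `use_flow_cache` argument to the constructor call. For context, a minimal sketch of the updated zero-shot usage is shown below; it assumes the CosyVoice2-0.5B checkpoint is already downloaded to `pretrained_models/CosyVoice2-0.5B`, that `third_party/Matcha-TTS` is importable, and the prompt wav path and texts are placeholders in the style of the surrounding README example rather than content of this diff.

```python
import sys
sys.path.append('third_party/Matcha-TTS')

import torchaudio
from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.utils.file_utils import load_wav

# use_flow_cache=False is the non-streaming default shown in the updated README;
# the export scripts in this commit set it to True to trace the cached (chunk-wise) flow path.
cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B',
                       load_jit=False, load_trt=False, fp16=False, use_flow_cache=False)

# zero-shot usage in the style of the README example (paths and texts are placeholders)
prompt_speech_16k = load_wav('./asset/zero_shot_prompt.wav', 16000)
for i, j in enumerate(cosyvoice.inference_zero_shot(
        'This is the sentence to synthesize.', 'This is the prompt transcript.',
        prompt_speech_16k, stream=False)):
    torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
```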
Diff for: cosyvoice/bin/average_model.py

+5 −4
@@ -75,10 +75,11 @@ def main():
         print('Processing {}'.format(path))
         states = torch.load(path, map_location=torch.device('cpu'))
         for k in states.keys():
-            if k not in avg.keys():
-                avg[k] = states[k].clone()
-            else:
-                avg[k] += states[k]
+            if k not in ['step', 'epoch']:
+                if k not in avg.keys():
+                    avg[k] = states[k].clone()
+                else:
+                    avg[k] += states[k]
     # average
     for k in avg.keys():
         if avg[k] is not None:

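The only functional change here is that non-tensor bookkeeping entries (`step`, `epoch`) are skipped when summing checkpoints, so they are no longer averaged into the state dict. A standalone sketch of the resulting logic is below; the final division step is assumed to follow the usual `torch.true_divide(avg[k], num)` pattern of such averaging scripts and is not shown in this hunk.

```python
import torch

def average_checkpoints(paths):
    """Average several .pt checkpoints, skipping 'step'/'epoch' as in the patched script."""
    avg = {}
    num = len(paths)
    for path in paths:
        states = torch.load(path, map_location=torch.device('cpu'))
        for k in states.keys():
            if k not in ['step', 'epoch']:      # the new guard from this diff
                if k not in avg.keys():
                    avg[k] = states[k].clone()
                else:
                    avg[k] += states[k]
    # average (division step assumed, mirroring typical average_model scripts)
    for k in avg.keys():
        if avg[k] is not None:
            avg[k] = torch.true_divide(avg[k], num)
    return avg
```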
Diff for: cosyvoice/bin/export_jit.py

+20 −7
@@ -24,6 +24,7 @@
 sys.path.append('{}/../..'.format(ROOT_DIR))
 sys.path.append('{}/../../third_party/Matcha-TTS'.format(ROOT_DIR))
 from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
+from cosyvoice.utils.file_utils import logging
 
 
 def get_args():
@@ -60,7 +61,8 @@ def main():
         model = CosyVoice(args.model_dir)
     except Exception:
         try:
-            model = CosyVoice2(args.model_dir)
+            # NOTE set use_flow_cache=True when export jit for cache inference
+            model = CosyVoice2(args.model_dir, use_flow_cache=True)
         except Exception:
             raise TypeError('no valid model_type!')
 

@@ -71,20 +73,31 @@ def main():
         script.save('{}/llm.text_encoder.fp32.zip'.format(args.model_dir))
         script = get_optimized_script(llm_text_encoder.half())
         script.save('{}/llm.text_encoder.fp16.zip'.format(args.model_dir))
+        logging.info('successfully export llm_text_encoder')
 
         # 2. export llm llm
         llm_llm = model.model.llm.llm
         script = get_optimized_script(llm_llm, ['forward_chunk'])
         script.save('{}/llm.llm.fp32.zip'.format(args.model_dir))
         script = get_optimized_script(llm_llm.half(), ['forward_chunk'])
         script.save('{}/llm.llm.fp16.zip'.format(args.model_dir))
+        logging.info('successfully export llm_llm')
 
-    # 3. export flow encoder
-    flow_encoder = model.model.flow.encoder
-    script = get_optimized_script(flow_encoder)
-    script.save('{}/flow.encoder.fp32.zip'.format(args.model_dir))
-    script = get_optimized_script(flow_encoder.half())
-    script.save('{}/flow.encoder.fp16.zip'.format(args.model_dir))
+        # 3. export flow encoder
+        flow_encoder = model.model.flow.encoder
+        script = get_optimized_script(flow_encoder)
+        script.save('{}/flow.encoder.fp32.zip'.format(args.model_dir))
+        script = get_optimized_script(flow_encoder.half())
+        script.save('{}/flow.encoder.fp16.zip'.format(args.model_dir))
+        logging.info('successfully export flow_encoder')
+    else:
+        # 3. export flow encoder
+        flow_encoder = model.model.flow.encoder
+        script = get_optimized_script(flow_encoder, ['forward_chunk'])
+        script.save('{}/flow.encoder.fp32.zip'.format(args.model_dir))
+        script = get_optimized_script(flow_encoder.half(), ['forward_chunk'])
+        script.save('{}/flow.encoder.fp16.zip'.format(args.model_dir))
+        logging.info('successfully export flow_encoder')
 
 
 if __name__ == '__main__':

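Two things change in this script: CosyVoice2 is now loaded with `use_flow_cache=True`, and its flow encoder is exported through `forward_chunk` rather than the plain `forward`, so the saved TorchScript module matches the cached streaming path. The repo's `get_optimized_script` helper is defined elsewhere in export_jit.py and is not part of this diff; the sketch below uses a hypothetical `script_for_inference` helper built from standard `torch.jit` calls only to illustrate why `'forward_chunk'` has to be preserved through freezing.

```python
import torch

def script_for_inference(module, preserved_methods=None):
    # Hypothetical stand-in for export_jit.py's get_optimized_script():
    # script the module, then freeze it while keeping extra entry points alive.
    scripted = torch.jit.script(module.eval())
    if preserved_methods:
        # without preserved_attrs, freezing would drop methods other than forward
        scripted = torch.jit.freeze(scripted, preserved_attrs=preserved_methods)
    else:
        scripted = torch.jit.freeze(scripted)
    return scripted

# Usage mirroring the diff; `model` and `model_dir` are assumed to come from the
# surrounding script rather than defined here.
# flow_encoder = model.model.flow.encoder
# script = script_for_inference(flow_encoder, ['forward_chunk'])
# script.save('{}/flow.encoder.fp32.zip'.format(model_dir))
```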
Diff for: cosyvoice/bin/export_onnx.py

+125 −47
@@ -28,6 +28,7 @@
 sys.path.append('{}/../..'.format(ROOT_DIR))
 sys.path.append('{}/../../third_party/Matcha-TTS'.format(ROOT_DIR))
 from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
+from cosyvoice.utils.file_utils import logging
 
 
 def get_dummy_input(batch_size, seq_len, out_channels, device):
@@ -51,6 +52,7 @@ def get_args():
     return args
 
 
+@torch.no_grad()
 def main():
     args = get_args()
     logging.basicConfig(level=logging.DEBUG,
@@ -60,56 +62,132 @@ def main():
         model = CosyVoice(args.model_dir)
     except Exception:
         try:
-            model = CosyVoice2(args.model_dir)
+            # NOTE set use_flow_cache=True when export jit for cache inference
+            model = CosyVoice2(args.model_dir, use_flow_cache=True)
         except Exception:
             raise TypeError('no valid model_type!')
 
-    # 1. export flow decoder estimator
-    estimator = model.model.flow.decoder.estimator
-
-    device = model.model.device
-    batch_size, seq_len = 2, 256
-    out_channels = model.model.flow.decoder.estimator.out_channels
-    x, mask, mu, t, spks, cond = get_dummy_input(batch_size, seq_len, out_channels, device)
-    torch.onnx.export(
-        estimator,
-        (x, mask, mu, t, spks, cond),
-        '{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
-        export_params=True,
-        opset_version=18,
-        do_constant_folding=True,
-        input_names=['x', 'mask', 'mu', 't', 'spks', 'cond'],
-        output_names=['estimator_out'],
-        dynamic_axes={
-            'x': {2: 'seq_len'},
-            'mask': {2: 'seq_len'},
-            'mu': {2: 'seq_len'},
-            'cond': {2: 'seq_len'},
-            'estimator_out': {2: 'seq_len'},
-        }
-    )
-
-    # 2. test computation consistency
-    option = onnxruntime.SessionOptions()
-    option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
-    option.intra_op_num_threads = 1
-    providers = ['CUDAExecutionProvider' if torch.cuda.is_available() else 'CPUExecutionProvider']
-    estimator_onnx = onnxruntime.InferenceSession('{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
-                                                  sess_options=option, providers=providers)
-
-    for _ in tqdm(range(10)):
-        x, mask, mu, t, spks, cond = get_dummy_input(batch_size, random.randint(16, 512), out_channels, device)
-        output_pytorch = estimator(x, mask, mu, t, spks, cond)
-        ort_inputs = {
-            'x': x.cpu().numpy(),
-            'mask': mask.cpu().numpy(),
-            'mu': mu.cpu().numpy(),
-            't': t.cpu().numpy(),
-            'spks': spks.cpu().numpy(),
-            'cond': cond.cpu().numpy()
-        }
-        output_onnx = estimator_onnx.run(None, ort_inputs)[0]
-        torch.testing.assert_allclose(output_pytorch, torch.from_numpy(output_onnx).to(device), rtol=1e-2, atol=1e-4)
+    if not isinstance(model, CosyVoice2):
+        # 1. export flow decoder estimator
+        estimator = model.model.flow.decoder.estimator
+        estimator.eval()
+
+        device = model.model.device
+        batch_size, seq_len = 2, 256
+        out_channels = model.model.flow.decoder.estimator.out_channels
+        x, mask, mu, t, spks, cond = get_dummy_input(batch_size, seq_len, out_channels, device)
+        torch.onnx.export(
+            estimator,
+            (x, mask, mu, t, spks, cond),
+            '{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
+            export_params=True,
+            opset_version=18,
+            do_constant_folding=True,
+            input_names=['x', 'mask', 'mu', 't', 'spks', 'cond'],
+            output_names=['estimator_out'],
+            dynamic_axes={
+                'x': {2: 'seq_len'},
+                'mask': {2: 'seq_len'},
+                'mu': {2: 'seq_len'},
+                'cond': {2: 'seq_len'},
+                'estimator_out': {2: 'seq_len'},
+            }
+        )
+
+        # 2. test computation consistency
+        option = onnxruntime.SessionOptions()
+        option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+        option.intra_op_num_threads = 1
+        providers = ['CUDAExecutionProvider' if torch.cuda.is_available() else 'CPUExecutionProvider']
+        estimator_onnx = onnxruntime.InferenceSession('{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
+                                                      sess_options=option, providers=providers)
+
+        for _ in tqdm(range(10)):
+            x, mask, mu, t, spks, cond = get_dummy_input(batch_size, random.randint(16, 512), out_channels, device)
+            output_pytorch = estimator(x, mask, mu, t, spks, cond)
+            ort_inputs = {
+                'x': x.cpu().numpy(),
+                'mask': mask.cpu().numpy(),
+                'mu': mu.cpu().numpy(),
+                't': t.cpu().numpy(),
+                'spks': spks.cpu().numpy(),
+                'cond': cond.cpu().numpy()
+            }
+            output_onnx = estimator_onnx.run(None, ort_inputs)[0]
+            torch.testing.assert_allclose(output_pytorch, torch.from_numpy(output_onnx).to(device), rtol=1e-2, atol=1e-4)
+        logging.info('successfully export estimator')
+    else:
+        # 1. export flow decoder estimator
+        estimator = model.model.flow.decoder.estimator
+        estimator.forward = estimator.forward_chunk
+        estimator.eval()
+
+        device = model.model.device
+        batch_size, seq_len = 2, 256
+        out_channels = model.model.flow.decoder.estimator.out_channels
+        x, mask, mu, t, spks, cond = get_dummy_input(batch_size, seq_len, out_channels, device)
+        cache = model.model.init_flow_cache()['decoder_cache']
+        cache.pop('offset')
+        cache = {k: v[0] for k, v in cache.items()}
+        torch.onnx.export(
+            estimator,
+            (x, mask, mu, t, spks, cond,
+             cache['down_blocks_conv_cache'],
+             cache['down_blocks_kv_cache'],
+             cache['mid_blocks_conv_cache'],
+             cache['mid_blocks_kv_cache'],
+             cache['up_blocks_conv_cache'],
+             cache['up_blocks_kv_cache'],
+             cache['final_blocks_conv_cache']),
+            '{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
+            export_params=True,
+            opset_version=18,
+            do_constant_folding=True,
+            input_names=['x', 'mask', 'mu', 't', 'spks', 'cond', 'down_blocks_conv_cache', 'down_blocks_kv_cache', 'mid_blocks_conv_cache', 'mid_blocks_kv_cache',
+                         'up_blocks_conv_cache', 'up_blocks_kv_cache', 'final_blocks_conv_cache'],
+            output_names=['estimator_out', 'down_blocks_conv_cache_out', 'down_blocks_kv_cache_out', 'mid_blocks_conv_cache_out', 'mid_blocks_kv_cache_out',
+                          'up_blocks_conv_cache_out', 'up_blocks_kv_cache_out', 'final_blocks_conv_cache_out'],
+            dynamic_axes={
+                'x': {2: 'seq_len'},
+                'mask': {2: 'seq_len'},
+                'mu': {2: 'seq_len'},
+                'cond': {2: 'seq_len'},
+                'down_blocks_kv_cache': {3: 'cache_in_len'},
+                'mid_blocks_kv_cache': {3: 'cache_in_len'},
+                'up_blocks_kv_cache': {3: 'cache_in_len'},
+                'estimator_out': {2: 'seq_len'},
+                'down_blocks_kv_cache_out': {3: 'cache_out_len'},
+                'mid_blocks_kv_cache_out': {3: 'cache_out_len'},
+                'up_blocks_kv_cache_out': {3: 'cache_out_len'},
+            }
+        )
+
+        # 2. test computation consistency
+        option = onnxruntime.SessionOptions()
+        option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+        option.intra_op_num_threads = 1
+        providers = ['CUDAExecutionProvider' if torch.cuda.is_available() else 'CPUExecutionProvider']
+        estimator_onnx = onnxruntime.InferenceSession('{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
+                                                      sess_options=option, providers=providers)
+
+        for _ in tqdm(range(10)):
+            x, mask, mu, t, spks, cond = get_dummy_input(batch_size, random.randint(16, 512), out_channels, device)
+            cache = model.model.init_flow_cache()['decoder_cache']
+            cache.pop('offset')
+            cache = {k: v[0] for k, v in cache.items()}
+            output_pytorch = estimator(x, mask, mu, t, spks, cond, **{k: v.clone() for k, v in cache.items()})
+            ort_inputs = {
+                'x': x.cpu().numpy(),
+                'mask': mask.cpu().numpy(),
+                'mu': mu.cpu().numpy(),
+                't': t.cpu().numpy(),
+                'spks': spks.cpu().numpy(),
+                'cond': cond.cpu().numpy(),
+            }
+            output_onnx = estimator_onnx.run(None, {**ort_inputs, **{k: v.clone().cpu().numpy() for k, v in cache.items()}})
+            for i, j in zip(output_pytorch, output_onnx):
+                torch.testing.assert_allclose(i, torch.from_numpy(j).to(device), rtol=1e-2, atol=1e-4)
+        logging.info('successfully export estimator')
 
 
 if __name__ == "__main__":

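For CosyVoice2 the estimator is now exported through `forward_chunk`, with one conv cache and one KV cache per block group appearing as extra ONNX inputs and matching `*_out` outputs, and the KV caches carrying dynamic `cache_in_len`/`cache_out_len` axes. Below is a hedged sketch of how a client could drive this cached graph chunk by chunk: the input/output names come from the export call above, while the initial cache values would in practice come from `model.model.init_flow_cache()` rather than being built here.

```python
import onnxruntime

CACHE_NAMES = ['down_blocks_conv_cache', 'down_blocks_kv_cache',
               'mid_blocks_conv_cache', 'mid_blocks_kv_cache',
               'up_blocks_conv_cache', 'up_blocks_kv_cache',
               'final_blocks_conv_cache']


def load_cached_estimator(onnx_path):
    # Plain CPU session; the export above writes flow.decoder.estimator.fp32.onnx
    # into the model directory.
    return onnxruntime.InferenceSession(onnx_path, providers=['CPUExecutionProvider'])


def run_estimator_chunk(session, feats, cache):
    """Run one chunk and return (estimator_out, cache for the next chunk).

    feats maps 'x', 'mask', 'mu', 't', 'spks', 'cond' to numpy arrays;
    cache maps each CACHE_NAMES entry to its current numpy value (initially
    taken from model.model.init_flow_cache() in the real pipeline).
    """
    outputs = session.run(None, {**feats, **cache})
    estimator_out = outputs[0]
    # outputs[1:] follow the *_out output_names order used in torch.onnx.export above,
    # so they can be fed straight back in as the next chunk's cache inputs.
    next_cache = dict(zip(CACHE_NAMES, outputs[1:]))
    return estimator_out, next_cache
```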
Diff for: cosyvoice/bin/export_trt.sh

−10 (this file was deleted)

Diff for: cosyvoice/bin/inference.py

+15 −5
@@ -23,7 +23,7 @@
 import torchaudio
 from hyperpyyaml import load_hyperpyyaml
 from tqdm import tqdm
-from cosyvoice.cli.model import CosyVoiceModel
+from cosyvoice.cli.model import CosyVoiceModel, CosyVoice2Model
 from cosyvoice.dataset.dataset import Dataset
 
 
@@ -33,6 +33,7 @@ def get_args():
     parser.add_argument('--prompt_data', required=True, help='prompt data file')
     parser.add_argument('--prompt_utt2data', required=True, help='prompt data file')
     parser.add_argument('--tts_text', required=True, help='tts input file')
+    parser.add_argument('--qwen_pretrain_path', required=False, help='qwen pretrain path')
     parser.add_argument('--llm_model', required=True, help='llm model file')
     parser.add_argument('--flow_model', required=True, help='flow model file')
     parser.add_argument('--hifigan_model', required=True, help='hifigan model file')
@@ -59,16 +60,25 @@ def main():
     # Init cosyvoice models from configs
     use_cuda = args.gpu >= 0 and torch.cuda.is_available()
     device = torch.device('cuda' if use_cuda else 'cpu')
-    with open(args.config, 'r') as f:
-        configs = load_hyperpyyaml(f)
+    try:
+        with open(args.config, 'r') as f:
+            configs = load_hyperpyyaml(f, overrides={'qwen_pretrain_path': args.qwen_pretrain_path})
+        model = CosyVoice2Model(configs['llm'], configs['flow'], configs['hift'], fp16=False)
+    except Exception:
+        try:
+            with open(args.config, 'r') as f:
+                configs = load_hyperpyyaml(f)
+            model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'], fp16=False)
+        except Exception:
+            raise TypeError('no valid model_type!')
 
-    model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'])
     model.load(args.llm_model, args.flow_model, args.hifigan_model)
 
     test_dataset = Dataset(args.prompt_data, data_pipeline=configs['data_pipeline'], mode='inference', shuffle=False, partition=False,
                            tts_file=args.tts_text, prompt_utt2data=args.prompt_utt2data)
     test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0)
 
+    sample_rate = configs['sample_rate']
     del configs
     os.makedirs(args.result_dir, exist_ok=True)
     fn = os.path.join(args.result_dir, 'wav.scp')
@@ -104,7 +114,7 @@ def main():
             tts_speeches = torch.concat(tts_speeches, dim=1)
             tts_key = '{}_{}'.format(utts[0], tts_index[0])
             tts_fn = os.path.join(args.result_dir, '{}.wav'.format(tts_key))
-            torchaudio.save(tts_fn, tts_speeches, sample_rate=22050)
+            torchaudio.save(tts_fn, tts_speeches, sample_rate=sample_rate, backend='soundfile')
             f.write('{} {}\n'.format(tts_key, tts_fn))
             f.flush()
     f.close()

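The batch-inference script now resolves the model type by construction: it first tries to load the config with a `qwen_pretrain_path` override and build a `CosyVoice2Model`, falling back to the legacy `CosyVoiceModel` when that fails, and it saves audio at the config's `sample_rate` instead of a hard-coded 22050 Hz. A compact sketch of that selection pattern, pulled out into a hypothetical helper for readability:

```python
from hyperpyyaml import load_hyperpyyaml
from cosyvoice.cli.model import CosyVoiceModel, CosyVoice2Model

def build_model_from_config(config_path, qwen_pretrain_path=None):
    # Hypothetical helper wrapping the try/fallback from the patched main():
    # CosyVoice2 configs take a qwen_pretrain_path override, older configs do not.
    try:
        with open(config_path, 'r') as f:
            configs = load_hyperpyyaml(f, overrides={'qwen_pretrain_path': qwen_pretrain_path})
        model = CosyVoice2Model(configs['llm'], configs['flow'], configs['hift'], fp16=False)
    except Exception:
        with open(config_path, 'r') as f:
            configs = load_hyperpyyaml(f)
        model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'], fp16=False)
    return model, configs

# downstream, wavs are written at the config's own rate, e.g.
# torchaudio.save(tts_fn, tts_speeches, sample_rate=configs['sample_rate'], backend='soundfile')
```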
Diff for: cosyvoice/bin/train.py

+7 −2
@@ -46,6 +46,7 @@ def get_args():
     parser.add_argument('--config', required=True, help='config file')
     parser.add_argument('--train_data', required=True, help='train data file')
     parser.add_argument('--cv_data', required=True, help='cv data file')
+    parser.add_argument('--qwen_pretrain_path', required=False, help='qwen pretrain path')
     parser.add_argument('--checkpoint', help='checkpoint model')
     parser.add_argument('--model_dir', required=True, help='save model dir')
     parser.add_argument('--tensorboard_dir',
@@ -97,8 +98,12 @@ def main():
     override_dict = {k: None for k in ['llm', 'flow', 'hift', 'hifigan'] if k != args.model}
     if gan is True:
         override_dict.pop('hift')
-    with open(args.config, 'r') as f:
-        configs = load_hyperpyyaml(f, overrides=override_dict)
+    try:
+        with open(args.config, 'r') as f:
+            configs = load_hyperpyyaml(f, overrides={**override_dict, 'qwen_pretrain_path': args.qwen_pretrain_path})
+    except Exception:
+        with open(args.config, 'r') as f:
+            configs = load_hyperpyyaml(f, overrides=override_dict)
     if gan is True:
         configs['train_conf'] = configs['train_conf_gan']
     configs['train_conf'].update(vars(args))

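Training gains the same `--qwen_pretrain_path` plumbing: the flag is merged into the hyperpyyaml overrides, and a fallback reload without it keeps older CosyVoice configs working. A toy illustration of the override mechanism is below; the YAML fragment and checkpoint path are invented purely for demonstration, and the note about failing overrides reflects hyperpyyaml's default behaviour of requiring overrides to match existing keys.

```python
from io import StringIO
from hyperpyyaml import load_hyperpyyaml

# Invented CosyVoice2-style fragment: the config declares a qwen_pretrain_path
# key that the new CLI flag fills in through hyperpyyaml overrides.
toy_yaml = "qwen_pretrain_path: ''\n"

configs = load_hyperpyyaml(StringIO(toy_yaml),
                           overrides={'qwen_pretrain_path': 'pretrained_models/My-Qwen-Checkpoint'})
print(configs['qwen_pretrain_path'])  # -> pretrained_models/My-Qwen-Checkpoint

# An older config that never defines this key makes the same override fail,
# which is the case the new try/except in train.py falls back from.
```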