@@ -202,6 +202,7 @@ def embedding_loader():
202
202
segmentation = SegmentationModel(segmentation_loader)
203
203
embedding = EmbeddingModel(embedding_loader)
204
204
config = SpeakerDiarizationConfig(
205
+ # Set the segmentation model used in the paper
205
206
segmentation=segmentation,
206
207
embedding=embedding,
207
208
)
@@ -332,20 +333,57 @@ diart.client microphone --host <server-address> --port 7007
332
333
333
334
See `-h` for more options.
334
335
336
+ ### From the Dockerfile
337
+
338
+ You can also run the server in a Docker container. First, build the image:
339
+ ```shell
340
+ docker build -t diart -f Dockerfile .
341
+ ```
342
+
343
+ Run the server with default configuration:
344
+ ```shell
345
+ docker run -p 7007:7007 --gpus all -e HF_TOKEN=<token> diart
346
+ ```
347
+
348
+ Run with custom configuration:
349
+ ```shell
350
+ docker run -p 7007:7007 --restart unless-stopped --gpus all \
351
+ -e HF_TOKEN=<token> \
352
+ -e HOST=0.0.0.0 \
353
+ -e PORT=7007 \
354
+ -e SEGMENTATION=pyannote/segmentation-3.0 \
355
+ -e EMBEDDING=speechbrain/spkrec-resnet-voxceleb \
356
+ -e TAU_ACTIVE=0.45 \
357
+ -e RHO_UPDATE=0.25 \
358
+ -e DELTA_NEW=0.6 \
359
+ -e LATENCY=5 \
360
+ -e MAX_SPEAKERS=3 \
361
+ diart
362
+ ```
363
+
364
+ The server can be configured using these environment variables, at runtime:
365
+ - `HOST`: Server host (default: 0.0.0.0)
366
+ - `PORT`: Server port (default: 7007)
367
+ - `SEGMENTATION`: Segmentation model (default: pyannote/segmentation)
368
+ - `EMBEDDING`: Embedding model (default: pyannote/embedding)
369
+ - `TAU_ACTIVE`: Activity threshold (default: 0.5)
370
+ - `RHO_UPDATE`: Update threshold (default: 0.3)
371
+ - `DELTA_NEW`: New speaker threshold (default: 1.0)
372
+ - `LATENCY`: Processing latency in seconds (default: 0.5)
373
+ - `MAX_SPEAKERS`: Maximum number of speakers (default: 20)
374
+
335
375
### From python
336
376
337
- For customized solutions, a server can also be created in python using the `WebSocketAudioSource`:
377
+ For customized solutions, a server can also be created in python using `WebSocketStreamingServer`:
338
378
339
379
```python
340
- from diart import SpeakerDiarization
341
- from diart.sources import WebSocketAudioSource
342
- from diart.inference import StreamingInference
380
+ from diart import SpeakerDiarization, SpeakerDiarizationConfig
381
+ from diart.websockets import WebSocketStreamingServer
343
382
344
- pipeline = SpeakerDiarization()
345
- source = WebSocketAudioSource(pipeline.config.sample_rate, "localhost", 7007)
346
- inference = StreamingInference(pipeline, source)
347
- inference.attach_hooks(lambda ann_wav: source.send(ann_wav[0].to_rttm()))
348
- prediction = inference()
383
+ pipeline_class = SpeakerDiarization
384
+ pipeline_config = SpeakerDiarizationConfig(step=0.5, sample_rate=16000)
385
+ server = WebSocketStreamingServer(pipeline_class, pipeline_config, host="localhost", port=7007)
386
+ server.run()
349
387
```
350
388
351
389
## 🔬 Powered by research
0 commit comments