Commit d5c9c71

Authored by Ettore Di Giacinto (mudler)
feat(chatterbox): add new backend (#5524)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
1 parent dd7fa6b commit d5c9c71

15 files changed: +330 -3 lines changed
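The commit wires a new "chatterbox" text-to-speech backend (built on the chatterbox-tts Python package) into LocalAI as an external gRPC backend. As orientation before the per-file diffs, the sketch below shows one way the backend could be exercised end to end through LocalAI's HTTP TTS endpoint once a CUDA-enabled extras image is running. This is not part of the commit: the /tts route and the backend/model/input fields reflect LocalAI's TTS API as I understand it, and the host, port, and output filename are placeholders, so treat it as an illustrative smoke test only.

# Hypothetical smoke test: ask a running LocalAI instance to synthesize speech
# with the new chatterbox backend and save the returned audio locally.
# Endpoint shape and field names are assumptions; adjust to your deployment.
import json
import urllib.request

payload = {
    "backend": "chatterbox",                       # route the request to the new backend
    "model": "chatterbox",                         # placeholder model name
    "input": "Hello from the chatterbox backend."  # text to synthesize
}

req = urllib.request.Request(
    "http://localhost:8080/tts",                   # assumed LocalAI address
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)

with urllib.request.urlopen(req) as resp, open("chatterbox-test.wav", "wb") as f:
    f.write(resp.read())                           # response body is the generated audio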

.github/workflows/test-extra.yml

Lines changed: 22 additions & 0 deletions
@@ -14,6 +14,28 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
+  # Requires CUDA
+  # tests-chatterbox-tts:
+  #   runs-on: ubuntu-latest
+  #   steps:
+  #   - name: Clone
+  #     uses: actions/checkout@v4
+  #     with:
+  #       submodules: true
+  #   - name: Dependencies
+  #     run: |
+  #       sudo apt-get update
+  #       sudo apt-get install build-essential ffmpeg
+  #       # Install UV
+  #       curl -LsSf https://astral.sh/uv/install.sh | sh
+  #       sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+  #       sudo apt-get install -y libopencv-dev
+  #       pip install --user --no-cache-dir grpcio-tools==1.64.1
+
+  #   - name: Test chatterbox-tts
+  #     run: |
+  #       make --jobs=5 --output-sync=target -C backend/python/chatterbox
+  #       make --jobs=5 --output-sync=target -C backend/python/chatterbox test
   tests-transformers:
     runs-on: ubuntu-latest
     steps:

Dockerfile

Lines changed: 4 additions & 1 deletion
@@ -15,7 +15,7 @@ ARG TARGETARCH
 ARG TARGETVARIANT
 
 ENV DEBIAN_FRONTEND=noninteractive
-ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,faster-whisper:/build/backend/python/faster-whisper/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh"
+ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,faster-whisper:/build/backend/python/faster-whisper/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh,chatterbox:/build/backend/python/chatterbox/run.sh"
 
 RUN apt-get update && \
     apt-get install -y --no-install-recommends \

@@ -434,6 +434,9 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "coqui" || -z "${EXTRA_BACKENDS}" ) && "$IMAG
     ; fi && \
     if [[ ( "${EXTRA_BACKENDS}" =~ "diffusers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
         make -C backend/python/diffusers \
+    ; fi && \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "chatterbox" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" && "${BUILD_TYPE}" = "cublas" ]]; then \
+        make -C backend/python/chatterbox \
     ; fi
 
 RUN if [[ ( "${EXTRA_BACKENDS}" =~ "kokoro" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \

Makefile

Lines changed: 13 additions & 2 deletions
@@ -549,10 +549,10 @@ protogen-go-clean:
 	$(RM) bin/*
 
 .PHONY: protogen-python
-protogen-python: bark-protogen coqui-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen kokoro-protogen vllm-protogen faster-whisper-protogen
+protogen-python: bark-protogen coqui-protogen chatterbox-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen kokoro-protogen vllm-protogen faster-whisper-protogen
 
 .PHONY: protogen-python-clean
-protogen-python-clean: bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean kokoro-protogen-clean vllm-protogen-clean faster-whisper-protogen-clean
+protogen-python-clean: bark-protogen-clean coqui-protogen-clean chatterbox-protogen-clean diffusers-protogen-clean exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean kokoro-protogen-clean vllm-protogen-clean faster-whisper-protogen-clean
 
 .PHONY: bark-protogen
 bark-protogen:

@@ -574,10 +574,18 @@ coqui-protogen-clean:
 diffusers-protogen:
 	$(MAKE) -C backend/python/diffusers protogen
 
+.PHONY: chatterbox-protogen
+chatterbox-protogen:
+	$(MAKE) -C backend/python/chatterbox protogen
+
 .PHONY: diffusers-protogen-clean
 diffusers-protogen-clean:
 	$(MAKE) -C backend/python/diffusers protogen-clean
 
+.PHONY: chatterbox-protogen-clean
+chatterbox-protogen-clean:
+	$(MAKE) -C backend/python/chatterbox protogen-clean
+
 .PHONY: faster-whisper-protogen
 faster-whisper-protogen:
 	$(MAKE) -C backend/python/faster-whisper protogen

@@ -632,6 +640,7 @@ prepare-extra-conda-environments: protogen-python
 	$(MAKE) -C backend/python/bark
 	$(MAKE) -C backend/python/coqui
 	$(MAKE) -C backend/python/diffusers
+	$(MAKE) -C backend/python/chatterbox
 	$(MAKE) -C backend/python/faster-whisper
 	$(MAKE) -C backend/python/vllm
 	$(MAKE) -C backend/python/rerankers

@@ -642,11 +651,13 @@ prepare-extra-conda-environments: protogen-python
 prepare-test-extra: protogen-python
 	$(MAKE) -C backend/python/transformers
 	$(MAKE) -C backend/python/diffusers
+	$(MAKE) -C backend/python/chatterbox
 	$(MAKE) -C backend/python/vllm
 
 test-extra: prepare-test-extra
 	$(MAKE) -C backend/python/transformers test
 	$(MAKE) -C backend/python/diffusers test
+	$(MAKE) -C backend/python/chatterbox test
 	$(MAKE) -C backend/python/vllm test
 
 backend-assets:

backend/python/chatterbox/Makefile

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+.PHONY: coqui
+coqui: protogen
+	bash install.sh
+
+.PHONY: run
+run: protogen
+	@echo "Running coqui..."
+	bash run.sh
+	@echo "coqui run."
+
+.PHONY: test
+test: protogen
+	@echo "Testing coqui..."
+	bash test.sh
+	@echo "coqui tested."
+
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+backend_pb2_grpc.py backend_pb2.py:
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
+
+.PHONY: clean
+clean: protogen-clean
+	rm -rf venv __pycache__
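The target names in this Makefile still say "coqui", a leftover from the coqui backend it mirrors, but the default target, run, test, and protogen rules behave like those of the other Python backends. The protogen rule shells out to grpc_tools.protoc; for reference, the same stub generation could be driven from Python as sketched below, where the include path and output directory simply mirror the Makefile and are otherwise assumptions.

# Rough Python equivalent of the Makefile's protogen rule.
# Generates backend_pb2.py and backend_pb2_grpc.py from backend.proto.
from grpc_tools import protoc

exit_code = protoc.main([
    "protoc",
    "-I../..",              # repository root that contains backend.proto
    "--python_out=.",       # plain protobuf messages -> backend_pb2.py
    "--grpc_python_out=.",  # gRPC service stubs      -> backend_pb2_grpc.py
    "backend.proto",
])
raise SystemExit(exit_code)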

backend/python/chatterbox/backend.py

Lines changed: 117 additions & 0 deletions
@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+"""
+This is an extra gRPC server of LocalAI for Bark TTS
+"""
+from concurrent import futures
+import time
+import argparse
+import signal
+import sys
+import os
+import backend_pb2
+import backend_pb2_grpc
+
+import torch
+import torchaudio as ta
+from chatterbox.tts import ChatterboxTTS
+
+import grpc
+
+
+_ONE_DAY_IN_SECONDS = 60 * 60 * 24
+
+# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
+MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
+COQUI_LANGUAGE = os.environ.get('COQUI_LANGUAGE', None)
+
+# Implement the BackendServicer class with the service methods
+class BackendServicer(backend_pb2_grpc.BackendServicer):
+    """
+    BackendServicer is the class that implements the gRPC service
+    """
+    def Health(self, request, context):
+        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
+    def LoadModel(self, request, context):
+
+        # Get device
+        # device = "cuda" if request.CUDA else "cpu"
+        if torch.cuda.is_available():
+            print("CUDA is available", file=sys.stderr)
+            device = "cuda"
+        else:
+            print("CUDA is not available", file=sys.stderr)
+            device = "cpu"
+
+        if not torch.cuda.is_available() and request.CUDA:
+            return backend_pb2.Result(success=False, message="CUDA is not available")
+
+        self.AudioPath = None
+
+        if os.path.isabs(request.AudioPath):
+            self.AudioPath = request.AudioPath
+        elif request.AudioPath and request.ModelFile != "" and not os.path.isabs(request.AudioPath):
+            # get base path of modelFile
+            modelFileBase = os.path.dirname(request.ModelFile)
+            # modify LoraAdapter to be relative to modelFileBase
+            self.AudioPath = os.path.join(modelFileBase, request.AudioPath)
+
+        try:
+            print("Preparing models, please wait", file=sys.stderr)
+            self.model = ChatterboxTTS.from_pretrained(device=device)
+        except Exception as err:
+            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
+        # Implement your logic here for the LoadModel service
+        # Replace this with your desired response
+        return backend_pb2.Result(message="Model loaded successfully", success=True)
+
+    def TTS(self, request, context):
+        try:
+            # Generate audio using ChatterboxTTS
+            if self.AudioPath is not None:
+                wav = self.model.generate(request.text, audio_prompt_path=self.AudioPath)
+            else:
+                wav = self.model.generate(request.text)
+
+            # Save the generated audio
+            ta.save(request.dst, wav, self.model.sr)
+
+        except Exception as err:
+            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
+        return backend_pb2.Result(success=True)
+
+def serve(address):
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+                         options=[
+                             ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
+                             ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
+                             ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
+                         ])
+    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
+    server.add_insecure_port(address)
+    server.start()
+    print("Server started. Listening on: " + address, file=sys.stderr)
+
+    # Define the signal handler function
+    def signal_handler(sig, frame):
+        print("Received termination signal. Shutting down...")
+        server.stop(0)
+        sys.exit(0)
+
+    # Set the signal handlers for SIGINT and SIGTERM
+    signal.signal(signal.SIGINT, signal_handler)
+    signal.signal(signal.SIGTERM, signal_handler)
+
+    try:
+        while True:
+            time.sleep(_ONE_DAY_IN_SECONDS)
+    except KeyboardInterrupt:
+        server.stop(0)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run the gRPC server.")
+    parser.add_argument(
+        "--addr", default="localhost:50051", help="The address to bind the server to."
+    )
+    args = parser.parse_args()
+
+    serve(args.addr)
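The servicer above follows the template shared by LocalAI's other Python backends (the "Bark TTS" docstring and the COQUI_LANGUAGE variable are carried over from the Bark and Coqui servers it was adapted from). For local debugging, a minimal client along the following lines could drive it directly over gRPC. It assumes the ModelOptions and TTSRequest messages from LocalAI's backend.proto, with the AudioPath, text, and dst fields the servicer reads, so message and field names should be checked against the generated stubs before use.

# Hypothetical standalone client for the chatterbox gRPC backend started via
# "python backend.py --addr localhost:50051". Message names are assumptions
# based on LocalAI's backend.proto; verify against backend_pb2.py.
import grpc
import backend_pb2
import backend_pb2_grpc

def main():
    channel = grpc.insecure_channel("localhost:50051")
    stub = backend_pb2_grpc.BackendStub(channel)

    # Load the model; AudioPath may optionally point to a reference voice prompt.
    load = stub.LoadModel(backend_pb2.ModelOptions())
    print("LoadModel:", load.success, load.message)

    # Synthesize speech; the backend writes the waveform to the dst path.
    tts = stub.TTS(backend_pb2.TTSRequest(text="Testing chatterbox.", dst="/tmp/chatterbox.wav"))
    print("TTS:", tts.success, tts.message)

if __name__ == "__main__":
    main()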

backend/python/chatterbox/install.sh

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+#!/bin/bash
+set -e
+
+source $(dirname $0)/../common/libbackend.sh
+
+# This is here because the Intel pip index is broken and returns 200 status codes for every package name, it just doesn't return any package links.
+# This makes uv think that the package exists in the Intel pip index, and by default it stops looking at other pip indexes once it finds a match.
+# We need uv to continue falling through to the pypi default index to find optimum[openvino] in the pypi index
+# the --upgrade actually allows us to *downgrade* torch to the version provided in the Intel pip index
+if [ "x${BUILD_PROFILE}" == "xintel" ]; then
+	EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
+fi
+
+installRequirements
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+accelerate
+torch==2.6.0
+torchaudio==2.6.0
+transformers==4.46.3
+chatterbox-tts
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+--extra-index-url https://download.pytorch.org/whl/cu118
+torch==2.6.0+cu118
+torchaudio==2.6.0+cu118
+transformers==4.46.3
+chatterbox-tts
+accelerate
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+torch==2.6.0
+torchaudio==2.6.0
+transformers==4.46.3
+chatterbox-tts
+accelerate
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+--extra-index-url https://download.pytorch.org/whl/rocm6.0
+torch==2.6.0+rocm6.0
+torchaudio==2.6.0+rocm6.0
+transformers==4.46.3
+chatterbox-tts
+accelerate
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+intel-extension-for-pytorch==2.3.110+xpu
+torch==2.3.1+cxx11.abi
+torchaudio==2.3.1+cxx11.abi
+transformers==4.46.3
+chatterbox-tts
+accelerate
+oneccl_bind_pt==2.3.100+xpu
+optimum[openvino]
+setuptools
+transformers==4.48.3
+accelerate
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+grpcio==1.72.0
+protobuf
+certifi
+packaging
+setuptools

backend/python/chatterbox/run.sh

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+#!/bin/bash
+source $(dirname $0)/../common/libbackend.sh
+
+startBackend $@

Comments (0)