Skip to content

Commit

Permalink
adds insane mode for transcribe anything
Browse files Browse the repository at this point in the history
  • Loading branch information
zackees committed Jan 12, 2024
1 parent d024607 commit a3dadd0
Show file tree
Hide file tree
Showing 8 changed files with 159 additions and 65 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -130,4 +130,5 @@ dmypy.json
activate.sh
tests/test_data
tests/localfile/text_video
tests/localfile/text_video_insane
tests/localfile/text_video_insane
tests/localfile/text_video_api_insane
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@ static-ffmpeg>=2.5
yt-dlp>=2023.3.4
appdirs==1.4.4
disklru>=1.0.7
isolated-environment>=1.0.1
isolated-environment>=1.0.1
FileLock
41 changes: 41 additions & 0 deletions tests/test_insane_whisper_cmd_arg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"""
Tests transcribe_anything
"""

# pylint: disable=bad-option-value,useless-option-value,no-self-use,protected-access,R0801
# flake8: noqa E501

import os
import unittest
import shutil

from transcribe_anything.api import transcribe

from transcribe_anything.insanely_fast_whisper import has_nvidia_smi


HERE = os.path.abspath(os.path.dirname(__file__))
LOCALFILE_DIR = os.path.join(HERE, "localfile")
TESTS_DATA_DIR = os.path.join(LOCALFILE_DIR, "text_video_api_insane", "en")


class InsaneWhisperModeTester(unittest.TestCase):
    """Exercises transcribe() end-to-end with the "insane" GPU device."""

    @unittest.skipUnless(has_nvidia_smi(), "No GPU detected")
    def test_local_file(self) -> None:
        """Check that the command works on a local file."""
        # Start from a clean output directory so stale results can't pass.
        shutil.rmtree(TESTS_DATA_DIR, ignore_errors=True)
        source_video = os.path.join(LOCALFILE_DIR, "video.mp4")
        transcribe(
            url_or_file=source_video,
            language="en",
            model="tiny",
            device="insane",
            output_dir=TESTS_DATA_DIR,
        )



if __name__ == "__main__":
unittest.main()
13 changes: 8 additions & 5 deletions tests/test_local_file_cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,14 @@ class TranscribeAnythingTester(unittest.TestCase):
def test_local_file(self) -> None:
"""Check that the command works on a local file."""
shutil.rmtree(TESTS_DATA_DIR, ignore_errors=True)
subprocess.check_output(
["transcribe_anything", "video.mp4", "--language", "en", "--model", "tiny"],
cwd=LOCALFILE_DIR,
)

try:
subprocess.check_output(
["transcribe_anything", "video.mp4", "--language", "en", "--model", "tiny"],
cwd=LOCALFILE_DIR,
)
except subprocess.CalledProcessError as e: # pylint: disable=R0801
print(e.output)
raise e

if __name__ == "__main__":
unittest.main()
15 changes: 9 additions & 6 deletions tests/test_whisper.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,15 @@ class WhisperTester(unittest.TestCase):
def test_whisper_cmd(self) -> None:
"""Check that the command is installed by the setup process."""
env = get_environment().environment()
subprocess.check_output(
"whisper --help",
shell=True,
env=env,
)

try:
subprocess.check_output(
"whisper --help",
shell=True,
env=env,
)
except subprocess.CalledProcessError as e:
print(e.output)
raise e

if __name__ == "__main__":
unittest.main()
69 changes: 55 additions & 14 deletions transcribe_anything/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import tempfile
import shutil
from pathlib import Path
from enum import Enum

from appdirs import user_config_dir # type: ignore
# from disklru import DiskLRUCache # type: ignore # pylint: disable=unused-import
Expand All @@ -30,6 +31,7 @@
)
from transcribe_anything.logger import log_error
from transcribe_anything.whisper import run_whisper, get_computing_device
from transcribe_anything.insanely_fast_whisper import run_insanely_fast_whisper

os.environ["PYTHONIOENCODING"] = "utf-8"

Expand All @@ -44,6 +46,30 @@
| stat.S_IWGRP
)

class Device(Enum):
    """Compute devices the transcriber can target."""

    CPU = "cpu"
    CUDA = "cuda"
    INSANE = "insane"

    def __str__(self) -> str:
        return self.value

    def __repr__(self) -> str:
        return str(self)

    @staticmethod
    def from_str(device: str) -> "Device":
        """Returns the device from a string."""
        # Scan members by value rather than spelling out an if-chain.
        for member in Device:
            if member.value == device:
                return member
        raise ValueError(f"Unknown device {device}")


ffmpeg_add_paths()


Expand Down Expand Up @@ -145,30 +171,45 @@ def transcribe(
#cached_data = cache.get_json(key)
# print(f"Todo: cached data: {cached_data}")
device = device or get_computing_device()
if device == "cuda":
device_enum = Device.from_str(device)
if device_enum == Device.CUDA:
print("#####################################")
print("######### GPU ACCELERATED! ##########")
print("#####################################")
elif device == "cpu":
elif device_enum == Device.INSANE:
print("#####################################")
print("####### INSANE GPU MODE! ############")
print("#####################################")
elif device_enum == Device.CPU:
print("WARNING: NOT using GPU acceleration, using 10x slower CPU instead.")
else:
raise ValueError(f"Unknown device {device}")
print(f"Using device {device}")
model_str = f" --model {model}" if model else ""
task_str = f" --task {task}" if task else ""
language_str = f" --language {language}" if language else ""
model_str = f"{model}" if model else ""
task_str = f"{task}" if task else "transcribe"
language_str = f"{language}" if language else ""

print(f"Running whisper on {tmp_wav} (will install models on first run)")
with tempfile.TemporaryDirectory() as tmpdir:
run_whisper(
Path(tmp_wav),
device,
model_str,
Path(tmpdir),
task_str,
language_str,
other_args or [],
)
if device_enum == Device.INSANE:
run_insanely_fast_whisper(
Path(tmp_wav),
model_str,
Path(tmpdir),
task_str,
language_str,
other_args or [],
)
else:
run_whisper(
Path(tmp_wav),
str(device),
model_str,
Path(tmpdir),
task_str,
language_str,
other_args or [],
)
files = [os.path.join(tmpdir, name) for name in os.listdir(tmpdir)]
srt_file: Optional[str] = None
for file in files:
Expand Down
32 changes: 17 additions & 15 deletions transcribe_anything/insanely_fast_whisper.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,15 @@
from pathlib import Path
import subprocess
from typing import Optional, Any
from filelock import FileLock

from isolated_environment import IsolatedEnvironment # type: ignore
from transcribe_anything.cuda_available import CudaInfo

HERE = Path(__file__).parent
ENV: Optional[IsolatedEnvironment] = None
CUDA_INFO: Optional[CudaInfo] = None

ENV_LOCK = FileLock(HERE / "insane_whisper_env.lock")

# Set the versions
TENSOR_VERSION = "2.1.2"
Expand All @@ -41,20 +42,21 @@ def has_nvidia_smi() -> bool:
def get_environment() -> IsolatedEnvironment:
    """Returns the isolated venv for insanely-fast-whisper, building it on first use.

    The scraped diff concatenated the old (unlocked) and new (locked) bodies;
    this is the locked version, plus a fast path: once ENV is set in this
    process the lock is skipped entirely. ENV_LOCK serializes venv creation
    across processes so concurrent runs cannot corrupt a half-installed venv.
    """
    global ENV  # pylint: disable=global-statement
    if ENV is not None:
        # Fast path: already initialized in this process; no lock needed.
        return ENV
    with ENV_LOCK:
        # Re-check under the lock in case another thread initialized it.
        if ENV is not None:
            return ENV
        venv_dir = HERE / "venv" / "insanely_fast_whisper"
        env = IsolatedEnvironment(venv_dir)
        if not venv_dir.exists():
            env.install_environment()
            # GPU boxes get the CUDA torch wheel from the extra index.
            if has_nvidia_smi():
                env.pip_install(f"torch=={TENSOR_VERSION}", extra_index=EXTRA_INDEX_URL)
            else:
                env.pip_install(f"torch=={TENSOR_VERSION}")
            env.pip_install("openai-whisper")
            env.pip_install("insanely-fast-whisper")
        ENV = env
        return env


def get_cuda_info() -> CudaInfo:
Expand Down
48 changes: 25 additions & 23 deletions transcribe_anything/whisper.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,14 @@
from pathlib import Path
import subprocess
from typing import Optional
from filelock import FileLock

from isolated_environment import IsolatedEnvironment # type: ignore

HERE = Path(__file__).parent
ENV: Optional[IsolatedEnvironment] = None
CUDA_AVAILABLE: Optional[bool] = None
ENV_LOCK = FileLock(HERE / "whisper_env.lock")

# Set the versions
TENSOR_VERSION = "2.1.2"
Expand Down Expand Up @@ -48,18 +50,20 @@ def get_environment() -> IsolatedEnvironment:
return env



def get_computing_device() -> str:
    """Get the computing device: "cuda" if the probe script succeeds, else "cpu".

    The scraped diff concatenated the old and new bodies; this is the locked
    version. The `global` statement is hoisted to function top (the diff had
    it inside the `with` block, which is legal but unidiomatic), and the lock
    is only taken when the cached probe result is missing.
    """
    global CUDA_AVAILABLE  # pylint: disable=global-statement
    if CUDA_AVAILABLE is None:
        with ENV_LOCK:
            # Re-check under the lock in case another thread probed already.
            if CUDA_AVAILABLE is None:
                iso_env = get_environment()
                env = iso_env.environment()
                py_file = HERE / "cuda_available.py"
                # Probe script exits 0 iff torch reports a usable CUDA device.
                rtn = subprocess.run(
                    ["python", py_file], check=False, env=env
                ).returncode
                CUDA_AVAILABLE = rtn == 0
    return "cuda" if CUDA_AVAILABLE else "cpu"

def run_whisper( # pylint: disable=too-many-arguments
input_wav: Path,
Expand All @@ -71,25 +75,23 @@ def run_whisper( # pylint: disable=too-many-arguments
other_args: Optional[list[str]]
) -> None:
"""Runs whisper."""

iso_env = get_environment()
cmd_list = []
if sys.platform == "win32":
# Set the text mode to UTF-8 on Windows.
cmd_list.extend(["chcp", "65001", "&&"])
cmd_list.append("whisper")
cmd_list.append(f'"{input_wav}"')
cmd_list.append("--device")
cmd_list.append(device)
cmd_list.append("--model")
cmd_list.append(model)
cmd_list.append(f'--output_dir "{output_dir}"')
cmd_list.append("--task")
cmd_list.append(task)
if language:
cmd_list.append(f'--language "{language}"')

cmd_list.extend(
[
"whisper",
f'"{input_wav}"',
"--device",
device,
model,
f'--output_dir "{output_dir}"',
task,
language,
]
)
if other_args:
cmd_list.extend(other_args)
# Remove the empty strings.
Expand Down

0 comments on commit a3dadd0

Please sign in to comment.