Analyze subtitle files with PyAV

WyattBlue · WyattBlue · commit 758b2159f187 · 2024-06-22T03:21:26.000-04:00
diff --git a/auto_editor/analyze.py b/auto_editor/analyze.py
@@ -3,6 +3,7 @@
 import os
 import re
 from dataclasses import dataclass
+from fractions import Fraction
 from typing import TYPE_CHECKING
 
 import numpy as np
@@ -25,6 +26,7 @@
     pAttr,
     pAttrs,
 )
+from auto_editor.utils.subtitle_tools import convert_ass_to_text
 from auto_editor.wavfile import read
 
 if TYPE_CHECKING:
@@ -307,31 +309,65 @@ def subtitle(
         except re.error as e:
             self.log.error(e)
 
-        sub_file = self.ensure.subtitle(self.src, stream)
-        parser = SubtitleParser(self.tb)
+        import av
 
-        with open(sub_file, encoding="utf-8") as file:
-            parser.parse(file.read(), "webvtt")
+        try:
+            container = av.open(self.src.path, "r")
+            subtitle_stream = container.streams.subtitles[stream]
+            assert isinstance(subtitle_stream.time_base, Fraction)
+        except Exception as e:
+            self.log.error(e)
 
-        # stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string
-        def cleanhtml(raw_html: str) -> str:
-            cleanr = re.compile("<.*?>")
-            return re.sub(cleanr, "", raw_html)
+        # Get the length of the subtitle stream.
+        sub_length = 0
+        for packet in container.demux(subtitle_stream):
+            for subset in packet.decode():
+                if packet.pts is None or packet.duration is None:
+                    continue
+                # See definition of `AVSubtitle`
+                # in: https://ffmpeg.org/doxygen/trunk/avcodec_8h_source.html
+                start = float(packet.pts * subtitle_stream.time_base)
+                dur = float(packet.duration * subtitle_stream.time_base)
 
-        if not parser.contents:
-            self.log.error("subtitle has no valid entries")
+                end = round((start + dur) * self.tb)
+                sub_length = max(sub_length, end)
 
-        result = np.zeros((parser.contents[-1].end), dtype=np.bool_)
+        result = np.zeros((sub_length), dtype=np.bool_)
+        del sub_length
 
         count = 0
-        for content in parser.contents:
-            if max_count is not None and count >= max_count:
+        early_exit = False
+        container.seek(0)
+        for packet in container.demux(subtitle_stream):
+            if early_exit:
                 break
 
-            line = cleanhtml(content.after.strip())
-            if line and re.search(pattern, line):
-                result[content.start : content.end] = 1
-                count += 1
+            for subset in packet.decode():
+                if packet.pts is None or packet.duration is None:
+                    continue
+                if max_count is not None and count >= max_count:
+                    early_exit = True
+                    break
+
+                start = float(packet.pts * subtitle_stream.time_base)
+                dur = float(packet.duration * subtitle_stream.time_base)
+
+                san_start = round(start * self.tb)
+                san_end = round((start + dur) * self.tb)
+
+                for sub in subset:
+                    if sub.type == b"ass":
+                        line = convert_ass_to_text(sub.ass.decode(errors="ignore"))
+                    elif sub.type == b"text":
+                        line = sub.text.decode(errors="ignore")
+                    else:
+                        continue
+
+                    if line and re.search(pattern, line):
+                        result[san_start:san_end] = 1
+                        count += 1
+
+        container.close()
 
         return result
 
diff --git a/auto_editor/output.py b/auto_editor/output.py
@@ -32,6 +32,7 @@ def audio(self, src: FileInfo, stream: int) -> str:
 
         if first_time:
             self.log.conwrite("Extracting audio")
+            self.log.debug(f"Making external audio for stream: {stream}")
 
             cmd = ["-i", f"{src.path}", "-map", f"0:a:{stream}"]
             cmd += ["-ac", "2", "-ar", f"{self._sr}", "-rf64", "always", out_path]
@@ -52,6 +53,7 @@ def subtitle(self, src: FileInfo, stream: int) -> str:
 
         if first_time:
             self.log.conwrite("Extracting subtitle")
+            self.log.debug(f"Making external subtitle: {out_path}")
             self._ffmpeg.run(["-i", f"{src.path}", "-map", f"0:s:{stream}", out_path])
 
         return out_path
diff --git a/auto_editor/utils/subtitle_tools.py b/auto_editor/utils/subtitle_tools.py
@@ -0,0 +1,29 @@
+def convert_ass_to_text(ass_text: str) -> str:
+    r"""Return the visible text of one ASS event line.
+
+    The 8 leading comma-separated fields are skipped, `{...}` override
+    blocks are dropped, and `\N` outside a block becomes a newline.
+    Returns an empty string when the line has fewer than 8 commas.
+    """
+    fields = ass_text.split(",", 8)
+    if len(fields) < 9:
+        return ""
+    body = fields[8]
+
+    pieces = []
+    in_override = False
+    i = 0
+    while i < len(body):
+        char = body[i]
+        if in_override:
+            # Nothing inside an override block is visible text, `\N` included.
+            in_override = char != "}"
+        elif char == "{":
+            in_override = True
+        elif char == "\\" and body[i + 1 : i + 2] == "N":
+            pieces.append("\n")
+            i += 1  # also consume the "N"
+        else:
+            pieces.append(char)
+        i += 1
+    return "".join(pieces)