Skip to content

Commit 621bfac

Browse files
committed
Analyze subtitle files with PyAV
1 parent 3a46c26 commit 621bfac

File tree

3 files changed

+85
-19
lines changed

3 files changed

+85
-19
lines changed

auto_editor/analyze.py

+54-19
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import os
44
import re
55
from dataclasses import dataclass
6+
from fractions import Fraction
67
from typing import TYPE_CHECKING
78

89
import numpy as np
@@ -19,12 +20,12 @@
1920
orc,
2021
)
2122
from auto_editor.lib.data_structs import Sym
22-
from auto_editor.render.subtitle import SubtitleParser
2323
from auto_editor.utils.cmdkw import (
2424
Required,
2525
pAttr,
2626
pAttrs,
2727
)
28+
from auto_editor.utils.subtitle_tools import convert_ass_to_text
2829
from auto_editor.wavfile import read
2930

3031
if TYPE_CHECKING:
@@ -307,31 +308,65 @@ def subtitle(
307308
except re.error as e:
308309
self.log.error(e)
309310

310-
sub_file = self.ensure.subtitle(self.src, stream)
311-
parser = SubtitleParser(self.tb)
311+
import av
312+
from av.subtitles.subtitle import AssSubtitle, TextSubtitle
312313

313-
with open(sub_file, encoding="utf-8") as file:
314-
parser.parse(file.read(), "webvtt")
314+
try:
315+
container = av.open(self.src.path, "r")
316+
subtitle_stream = container.streams.subtitles[stream]
317+
assert isinstance(subtitle_stream.time_base, Fraction)
318+
except Exception as e:
319+
self.log.error(e)
315320

316-
# stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string
317-
def cleanhtml(raw_html: str) -> str:
318-
cleanr = re.compile("<.*?>")
319-
return re.sub(cleanr, "", raw_html)
321+
# Get the length of the subtitle stream.
322+
sub_length = 0
323+
for packet in container.demux(subtitle_stream):
324+
if packet.pts is None or packet.duration is None:
325+
continue
326+
for subset in packet.decode():
327+
# See definition of `AVSubtitle`
328+
# in: https://ffmpeg.org/doxygen/trunk/avcodec_8h_source.html
329+
start = float(packet.pts * subtitle_stream.time_base)
330+
dur = float(packet.duration * subtitle_stream.time_base)
320331

321-
if not parser.contents:
322-
self.log.error("subtitle has no valid entries")
332+
end = round((start + dur) * self.tb)
333+
sub_length = max(sub_length, end)
323334

324-
result = np.zeros((parser.contents[-1].end), dtype=np.bool_)
335+
result = np.zeros((sub_length), dtype=np.bool_)
336+
del sub_length
325337

326338
count = 0
327-
for content in parser.contents:
328-
if max_count is not None and count >= max_count:
339+
early_exit = False
340+
container.seek(0)
341+
for packet in container.demux(subtitle_stream):
342+
if packet.pts is None or packet.duration is None:
343+
continue
344+
if early_exit:
329345
break
330-
331-
line = cleanhtml(content.after.strip())
332-
if line and re.search(pattern, line):
333-
result[content.start : content.end] = 1
334-
count += 1
346+
for subset in packet.decode():
347+
if max_count is not None and count >= max_count:
348+
early_exit = True
349+
break
350+
351+
start = float(packet.pts * subtitle_stream.time_base)
352+
dur = float(packet.duration * subtitle_stream.time_base)
353+
354+
san_start = round(start * self.tb)
355+
san_end = round((start + dur) * self.tb)
356+
357+
for sub in subset:
358+
if isinstance(sub, AssSubtitle):
359+
line = convert_ass_to_text(sub.ass.decode(errors="ignore"))
360+
elif isinstance(sub, TextSubtitle):
361+
line = sub.text.decode(errors="ignore")
362+
else:
363+
continue
364+
365+
if line and re.search(pattern, line):
366+
result[san_start:san_end] = 1
367+
count += 1
368+
369+
container.close()
335370

336371
return result
337372

auto_editor/output.py

+2
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ def audio(self, src: FileInfo, stream: int) -> str:
3232

3333
if first_time:
3434
self.log.conwrite("Extracting audio")
35+
self.log.debug(f"Making external audio for stream: {stream}")
3536

3637
cmd = ["-i", f"{src.path}", "-map", f"0:a:{stream}"]
3738
cmd += ["-ac", "2", "-ar", f"{self._sr}", "-rf64", "always", out_path]
@@ -52,6 +53,7 @@ def subtitle(self, src: FileInfo, stream: int) -> str:
5253

5354
if first_time:
5455
self.log.conwrite("Extracting subtitle")
56+
self.log.debug(f"Making external subtitle: {out_path}")
5557
self._ffmpeg.run(["-i", f"{src.path}", "-map", f"0:s:{stream}", out_path])
5658

5759
return out_path

auto_editor/utils/subtitle_tools.py

+29
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
def convert_ass_to_text(ass_text: str) -> str:
    """Return the plain display text of a single ASS dialogue event.

    The first eight comma-terminated fields of ``ass_text`` are skipped and
    everything after the eighth comma is treated as the text field, which may
    itself contain commas.  Within the text field:

    * ``{...}`` override blocks are removed entirely;
    * the hard line break ``\\N`` becomes a real newline, but only outside
      override blocks, since nothing inside an override block is display
      text;
    * every other character is copied through unchanged.

    An unterminated ``{`` swallows the rest of the line.  If ``ass_text``
    contains fewer than eight commas there is no text field and the empty
    string is returned.
    """
    fields = ass_text.split(",", 8)
    if len(fields) < 9:
        # Fewer than eight commas: there is no text field to convert.
        return ""
    text = fields[8]

    pieces = []
    in_override = False
    i = 0
    length = len(text)
    while i < length:
        char = text[i]
        if in_override:
            # Inside `{...}`: drop everything up to and including the `}`.
            if char == "}":
                in_override = False
        elif char == "{":
            in_override = True
        elif char == "\\" and text[i + 1 : i + 2] == "N":
            # `\N` is a two-character escape; consume both characters.
            pieces.append("\n")
            i += 2
            continue
        else:
            pieces.append(char)
        i += 1

    return "".join(pieces)

0 commit comments

Comments
 (0)