Skip to content

Commit 758b215

Browse files
committed
Analyze subtitle files with PyAV
1 parent 3a46c26 commit 758b215

File tree

3 files changed

+84
-17
lines changed

3 files changed

+84
-17
lines changed

auto_editor/analyze.py

+53-17
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import os
44
import re
55
from dataclasses import dataclass
6+
from fractions import Fraction
67
from typing import TYPE_CHECKING
78

89
import numpy as np
@@ -25,6 +26,7 @@
2526
pAttr,
2627
pAttrs,
2728
)
29+
from auto_editor.utils.subtitle_tools import convert_ass_to_text
2830
from auto_editor.wavfile import read
2931

3032
if TYPE_CHECKING:
@@ -307,31 +309,65 @@ def subtitle(
307309
except re.error as e:
308310
self.log.error(e)
309311

310-
sub_file = self.ensure.subtitle(self.src, stream)
311-
parser = SubtitleParser(self.tb)
312+
import av
312313

313-
with open(sub_file, encoding="utf-8") as file:
314-
parser.parse(file.read(), "webvtt")
314+
try:
315+
container = av.open(self.src.path, "r")
316+
subtitle_stream = container.streams.subtitles[stream]
317+
assert isinstance(subtitle_stream.time_base, Fraction)
318+
except Exception as e:
319+
self.log.error(e)
315320

316-
# stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string
317-
def cleanhtml(raw_html: str) -> str:
318-
cleanr = re.compile("<.*?>")
319-
return re.sub(cleanr, "", raw_html)
321+
# Get the length of the subtitle stream.
322+
sub_length = 0
323+
for packet in container.demux(subtitle_stream):
324+
for subset in packet.decode():
325+
if packet.pts is None or packet.duration is None:
326+
continue
327+
# See definition of `AVSubtitle`
328+
# in: https://ffmpeg.org/doxygen/trunk/avcodec_8h_source.html
329+
start = float(packet.pts * subtitle_stream.time_base)
330+
dur = float(packet.duration * subtitle_stream.time_base)
320331

321-
if not parser.contents:
322-
self.log.error("subtitle has no valid entries")
332+
end = round((start + dur) * self.tb)
333+
sub_length = max(sub_length, end)
323334

324-
result = np.zeros((parser.contents[-1].end), dtype=np.bool_)
335+
result = np.zeros((sub_length), dtype=np.bool_)
336+
del sub_length
325337

326338
count = 0
327-
for content in parser.contents:
328-
if max_count is not None and count >= max_count:
339+
early_exit = False
340+
container.seek(0)
341+
for packet in container.demux(subtitle_stream):
342+
if early_exit:
329343
break
330344

331-
line = cleanhtml(content.after.strip())
332-
if line and re.search(pattern, line):
333-
result[content.start : content.end] = 1
334-
count += 1
345+
for subset in packet.decode():
346+
if packet.pts is None or packet.duration is None:
347+
continue
348+
if max_count is not None and count >= max_count:
349+
early_exit = True
350+
break
351+
352+
start = float(packet.pts * subtitle_stream.time_base)
353+
dur = float(packet.duration * subtitle_stream.time_base)
354+
355+
san_start = round(start * self.tb)
356+
san_end = round((start + dur) * self.tb)
357+
358+
for sub in subset:
359+
if sub.type == b"ass":
360+
line = convert_ass_to_text(sub.ass.decode(errors="ignore"))
361+
elif sub.type == b"text":
362+
line = sub.text.decode(errors="ignore")
363+
else:
364+
continue
365+
366+
if line and re.search(pattern, line):
367+
result[san_start:san_end] = 1
368+
count += 1
369+
370+
container.close()
335371

336372
return result
337373

auto_editor/output.py

+2
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ def audio(self, src: FileInfo, stream: int) -> str:
3232

3333
if first_time:
3434
self.log.conwrite("Extracting audio")
35+
self.log.debug(f"Making external audio for stream: {stream}")
3536

3637
cmd = ["-i", f"{src.path}", "-map", f"0:a:{stream}"]
3738
cmd += ["-ac", "2", "-ar", f"{self._sr}", "-rf64", "always", out_path]
@@ -52,6 +53,7 @@ def subtitle(self, src: FileInfo, stream: int) -> str:
5253

5354
if first_time:
5455
self.log.conwrite("Extracting subtitle")
56+
self.log.debug(f"Making external subtitle: {out_path}")
5557
self._ffmpeg.run(["-i", f"{src.path}", "-map", f"0:s:{stream}", out_path])
5658

5759
return out_path

auto_editor/utils/subtitle_tools.py

+29
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
def convert_ass_to_text(ass_text: str) -> str:
    r"""Strip the leading fields and override blocks from an ASS event string.

    Everything up to and including the eighth comma is discarded. If the
    input holds fewer than eight commas, the whole string is consumed and
    an empty string is returned.

    In the remaining text:

    * each ``\N`` sequence is replaced by a newline character (this is
      checked first, so it also applies inside a ``{...}`` block);
    * everything from ``{`` through the matching ``}`` is dropped; an
      unterminated ``{`` drops the rest of the string;
    * all other characters are copied through unchanged.

    :param ass_text: The raw ASS event string.
    :return: The text with override blocks removed and ``\N`` expanded.
    """
    length = len(ass_text)
    comma_count = i = 0

    # Skip the comma-separated fields that precede the text.
    while comma_count < 8 and i < length:
        if ass_text[i] == ",":
            comma_count += 1
        i += 1

    # Collect output pieces and join once; repeated ``str +=`` in a
    # per-character loop can degrade to quadratic time.
    pieces: list[str] = []
    in_override = False
    while i < length:
        char = ass_text[i]

        # Slicing never raises, so a trailing backslash is handled safely.
        if char == "\\" and ass_text[i + 1 : i + 2] == "N":
            pieces.append("\n")
            i += 2
            continue

        if in_override:
            if char == "}":
                in_override = False
        elif char == "{":
            in_override = True
        else:
            pieces.append(char)
        i += 1

    return "".join(pieces)

0 commit comments

Comments
 (0)