Research automatically generating VTT file given text and audio #6

Open

gbroques opened this issue Oct 22, 2024 · 0 comments

gbroques (Owner) commented Oct 22, 2024

A Python library called aeneas can do this, and Rafa (Ράφας), a member of the Learning Greek Discord server, has experience with it.

aeneas installation instructions can be found here:
https://github.com/daxida/lingq/blob/924fc982351a4a6492c483e95af8662a957364d6/etc/forced_alignment/forced_alignment.py#L55-L81

Rafa contributed the following script, which is hardcoded for the dolphin text and expects to be run from the repository root.

It splits the text into sentences, which makes it easy to derive the cues from the original text; this assumes the original text follows a particular structure (a one-line title paragraph followed by paragraphs separated by blank lines).
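
For illustration only (not part of Rafa's script), here is a minimal sketch of how that sentence splitting maps onto cue indices, using a made-up text consisting of a one-line title and two short paragraphs:

import re

# Hypothetical sample in the structure the script expects:
# a title paragraph, then body paragraphs separated by blank lines.
sample = "Το δελφίνι\n\nΠρώτη πρόταση. Δεύτερη πρόταση!\n\nΤρίτη πρόταση."

paragraphs = sample.strip().split("\n\n")
sentences = [re.split(r"(?<=[.!?])\s+", p.strip()) for p in paragraphs]
# sentences == [
#     ["Το δελφίνι"],                          # title -> cue 1 (skipped in markup)
#     ["Πρώτη πρόταση.", "Δεύτερη πρόταση!"],  # paragraph 1 -> cues 2-3
#     ["Τρίτη πρόταση."],                      # paragraph 2 -> cue 4
# ]

The full script: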

import os
import re
import logging
import json
from dataclasses import dataclass

from aeneas.executetask import ExecuteTask
from aeneas.task import Task


@dataclass
class TimedText:
    begin: str
    end: str
    text: str


RESET = "\033[0m"
RED = "\033[31m"
GREEN = "\033[32m"
BLUE = "\033[34m"


class ColoredFormatter(logging.Formatter):
    def format(self, record):
        match record.levelname:
            case "DEBUG":
                record.levelname = f"{BLUE}[{record.levelname}]{RESET}"
            case "INFO":
                record.levelname = f"{GREEN}[{record.levelname}]{RESET}"
            case "ERROR":
                record.levelname = f"{RED}[{record.levelname}]{RESET}"
        return super().format(record)


logging.basicConfig(level=logging.DEBUG)
for handler in logging.getLogger().handlers:
    handler.setFormatter(ColoredFormatter("%(levelname)s %(message)s"))


def align_audio_to_text(
    audio_file_path: str,
    text_file_path: str,
    opath: str,
    language: str = "ell",
) -> None:
    logging.info(
        f"Starting alignment for audio: {audio_file_path} and text: {text_file_path}"
    )
    # logging.info(open(text_file_path).read())

    # Update the task configuration string to use the specified language
    config = [
        f"task_language={language}",
        "is_text_type=plain",
        "os_task_file_format=vtt",
    ]
    config_string = "|".join(config)
    task = Task(config_string=config_string)
    task.audio_file_path_absolute = audio_file_path
    task.text_file_path_absolute = text_file_path
    task.sync_map_file_path_absolute = opath

    # Execute alignment task
    logging.debug(f"Executing alignment task with configuration:\n  {config_string}")
    ExecuteTask(task).execute()
    task.output_sync_map_file()
    logging.debug(f"Alignment task completed. Sync map file saved to: {opath}")


PARAGRAPH_SEPARATOR = "\n\n"


def split_text(text):
    """Returns a list of lists of strings. Paragraphs and then sentences."""
    sentence_separator = r"(?<=[.!?])\s+"

    paragraphs = text.strip().split(PARAGRAPH_SEPARATOR)
    paragraph_sentences = [
        re.split(sentence_separator, par.strip()) for par in paragraphs
    ]

    return paragraph_sentences


def join_text(paragraph_sentences):
    """Joins a list of lists of strings back into a single string."""
    paragraphs = [" ".join(sentences) for sentences in paragraph_sentences]
    return PARAGRAPH_SEPARATOR.join(paragraphs)


def update_config_cues(splitted_text, config_path):
    """Mutates config."""
    with open(config_path, "r", encoding="utf-8") as f:
        config = json.load(f)

    n_paragraphs = len(splitted_text)
    n_total_sentences = sum(len(p) for p in splitted_text)
    logging.debug(f"{n_paragraphs=} {n_total_sentences=}")
    markup = []
    start = 2
    for paragraph in splitted_text[1:]:  # Skip title
        end = start + len(paragraph) - 1
        entry = {
            "type": "p",
            "children": [{"type": "cueRange", "start": start, "end": end}],
        }
        markup.append(entry)
        start = end + 1

    config["markup"] = markup

    with open(config_path, "w", encoding="utf-8") as f:
        json.dump(config, f, ensure_ascii=False, indent=2)

    logging.info(f"Updated config cues in {config_path}")


def format_text(text, config_path):
    # logging.info(text)
    splitted_text = split_text(text)

    # Probably should go elsewhere
    update_config_cues(splitted_text, config_path)

    paragraphs = ["\n".join(sentences) for sentences in splitted_text]
    fmt_text = "\n".join(paragraphs)
    # logging.info(fmt_text)
    return fmt_text


def main():
    # Careful: the jpg for this lesson has a different encoding!
    folder = "Το δελφίνι"
    if not os.path.exists(folder):
        print(f"Could not find the folder {folder}. Exiting")
        return

    text_path = os.path.join(folder, "text.txt")
    audio_path = os.path.join(folder, "audio.mp3")
    config_path = os.path.join(folder, "config.json")
    with open(text_path, "r", encoding="utf-8") as f:
        text = f.read()

    # Format text to change the timestamps
    text_formatted_path = os.path.join(folder, "text_fmt.txt")
    with open(text_formatted_path, "w", encoding="utf-8") as f:
        new_text = format_text(text, config_path)
        f.write(new_text)

    opath = os.path.join(folder, "transcript.vtt")
    align_audio_to_text(audio_path, text_formatted_path, opath)

    os.remove(text_formatted_path)
    logging.debug("Removed temporary text file")


if __name__ == "__main__":
    main()
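
For reference, with os_task_file_format=vtt the generated transcript.vtt contains one cue per line of the formatted text, i.e. one per sentence (the title being the first cue), which is what the start/end indices in the config markup refer to. A rough sketch of the output, with purely illustrative timestamps and cue identifiers:

WEBVTT

1
00:00:00.000 --> 00:00:02.480
Το δελφίνι

2
00:00:02.480 --> 00:00:05.120
Πρώτη πρόταση.

3
00:00:05.120 --> 00:00:07.800
Δεύτερη πρόταση!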