Research automatically generating VTT file given text and audio #6

Open

gbroques opened this issue Oct 22, 2024 · 0 comments

gbroques (Owner) commented Oct 22, 2024

A Python library called aeneas can do this, and Rafa (Ράφας), a member of the Learning Greek Discord server, has experience with it.

aeneas installation instructions can be found here:
https://github.com/daxida/lingq/blob/924fc982351a4a6492c483e95af8662a957364d6/etc/forced_alignment/forced_alignment.py#L55-L81

Rafa contributed the following script, which is hardcoded for the dolphin text and expects to be run from the repository root.

It splits the text into sentences, which makes it easy to derive the cues from the original text; this assumes the original text follows a particular structure (a one-line title paragraph followed by paragraphs separated by blank lines).
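
For illustration only (not part of Rafa's script), here is a minimal sketch of how that sentence splitting maps onto cue indices, using a made-up text consisting of a one-line title and two short paragraphs:

import re

# Hypothetical sample in the structure the script expects:
# a title paragraph, then body paragraphs separated by blank lines.
sample = "Το δελφίνι\n\nΠρώτη πρόταση. Δεύτερη πρόταση!\n\nΤρίτη πρόταση."

paragraphs = sample.strip().split("\n\n")
sentences = [re.split(r"(?<=[.!?])\s+", p.strip()) for p in paragraphs]
# sentences == [
#     ["Το δελφίνι"],                          # title -> cue 1 (skipped in markup)
#     ["Πρώτη πρόταση.", "Δεύτερη πρόταση!"],  # paragraph 1 -> cues 2-3
#     ["Τρίτη πρόταση."],                      # paragraph 2 -> cue 4
# ]

The full script: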

import os
import re
import logging
import json
from dataclasses import dataclass

from aeneas.executetask import ExecuteTask
from aeneas.task import Task


@dataclass
class TimedText:
    begin: str
    end: str
    text: str


RESET = "\033[0m"
RED = "\033[31m"
GREEN = "\033[32m"
BLUE = "\033[34m"


class ColoredFormatter(logging.Formatter):
    def format(self, record):
        match record.levelname:
            case "DEBUG":
                record.levelname = f"{BLUE}[{record.levelname}]{RESET}"
            case "INFO":
                record.levelname = f"{GREEN}[{record.levelname}]{RESET}"
            case "ERROR":
                record.levelname = f"{RED}[{record.levelname}]{RESET}"
        return super().format(record)


logging.basicConfig(level=logging.DEBUG)
for handler in logging.getLogger().handlers:
    handler.setFormatter(ColoredFormatter("%(levelname)s %(message)s"))


def align_audio_to_text(
    audio_file_path: str,
    text_file_path: str,
    opath: str,
    language: str = "ell",
) -> None:
    logging.info(
        f"Starting alignment for audio: {audio_file_path} and text: {text_file_path}"
    )
    # logging.info(open(text_file_path).read())

    # Update the task configuration string to use the specified language
    config = [
        f"task_language={language}",
        "is_text_type=plain",
        "os_task_file_format=vtt",
    ]
    config_string = "|".join(config)
    task = Task(config_string=config_string)
    task.audio_file_path_absolute = audio_file_path
    task.text_file_path_absolute = text_file_path
    task.sync_map_file_path_absolute = opath

    # Execute alignment task
    logging.debug(f"Executing alignment task with configuration:\n  {config_string}")
    ExecuteTask(task).execute()
    task.output_sync_map_file()
    logging.debug(f"Alignment task completed. Sync map file saved to: {opath}")


PARAGRAPH_SEPARATOR = "\n\n"


def split_text(text):
    """Returns a list of lists of strings. Paragraphs and then sentences."""
    sentence_separator = r"(?<=[.!?])\s+"

    paragraphs = text.strip().split(PARAGRAPH_SEPARATOR)
    paragraph_sentences = [
        re.split(sentence_separator, par.strip()) for par in paragraphs
    ]

    return paragraph_sentences


def join_text(paragraph_sentences):
    """Joins a list of lists of strings back into a single string."""
    paragraphs = [" ".join(sentences) for sentences in paragraph_sentences]
    return PARAGRAPH_SEPARATOR.join(paragraphs)


def update_config_cues(splitted_text, config_path):
    """Mutates config."""
    with open(config_path, "r", encoding="utf-8") as f:
        config = json.load(f)

    n_paragraphs = len(splitted_text)
    n_total_sentences = sum(len(p) for p in splitted_text)
    logging.debug(f"{n_paragraphs=} {n_total_sentences=}")
    markup = []
    start = 2
    for paragraph in splitted_text[1:]:  # Skip title
        end = start + len(paragraph) - 1
        entry = {
            "type": "p",
            "children": [{"type": "cueRange", "start": start, "end": end}],
        }
        markup.append(entry)
        start = end + 1

    config["markup"] = markup

    with open(config_path, "w", encoding="utf-8") as f:
        json.dump(config, f, ensure_ascii=False, indent=2)

    logging.info(f"Updated config cues in {config_path}")


def format_text(text, config_path):
    # logging.info(text)
    splitted_text = split_text(text)

    # Probably should go elsewhere
    update_config_cues(splitted_text, config_path)

    paragraphs = ["\n".join(sentences) for sentences in splitted_text]
    fmt_text = "\n".join(paragraphs)
    # logging.info(fmt_text)
    return fmt_text


def main():
    # Careful: the jpg for this lesson has a different encoding!
    folder = "Το δελφίνι"
    if not os.path.exists(folder):
        print(f"Could not find the folder {folder}. Exiting")
        return

    text_path = os.path.join(folder, "text.txt")
    audio_path = os.path.join(folder, "audio.mp3")
    config_path = os.path.join(folder, "config.json")
    with open(text_path, "r", encoding="utf-8") as f:
        text = f.read()

    # Format text to change the timestamps
    text_formatted_path = os.path.join(folder, "text_fmt.txt")
    with open(text_formatted_path, "w", encoding="utf-8") as f:
        new_text = format_text(text, config_path)
        f.write(new_text)

    opath = os.path.join(folder, "transcript.vtt")
    align_audio_to_text(audio_path, text_formatted_path, opath)

    os.remove(text_formatted_path)
    logging.debug("Removed temporary text file")


if __name__ == "__main__":
    main()
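
For reference, with os_task_file_format=vtt the generated transcript.vtt contains one cue per line of the formatted text, i.e. one per sentence (the title being the first cue), which is what the start/end indices in the config markup refer to. A rough sketch of the output, with purely illustrative timestamps and cue identifiers:

WEBVTT

1
00:00:00.000 --> 00:00:02.480
Το δελφίνι

2
00:00:02.480 --> 00:00:05.120
Πρώτη πρόταση.

3
00:00:05.120 --> 00:00:07.800
Δεύτερη πρόταση!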