You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Rafa contributed the following script, which is hardcoded for the dolphin text and expects to be run from the repository root.
It splits on sentences so there is an easy way to get the cues by looking at the original text which assumes some particular structure of the original text.
importosimportreimportloggingimportjsonfromdataclassesimportdataclassfromaeneas.executetaskimportExecuteTaskfromaeneas.taskimportTask@dataclassclassTimedText:
begin: strend: strtext: strRESET="\033[0m"RED="\033[31m"GREEN="\033[32m"BLUE="\033[34m"classColoredFormatter(logging.Formatter):
defformat(self, record):
matchrecord.levelname:
case"DEBUG":
record.levelname=f"{BLUE}[{record.levelname}]{RESET}"case"INFO":
record.levelname=f"{GREEN}[{record.levelname}]{RESET}"case"ERROR":
record.levelname=f"{RED}[{record.levelname}]{RESET}"returnsuper().format(record)
# Enable DEBUG-level logging and install the colored formatter on every
# handler that basicConfig created on the root logger.
logging.basicConfig(level=logging.DEBUG)
root_logger = logging.getLogger()
for root_handler in root_logger.handlers:
    root_handler.setFormatter(ColoredFormatter("%(levelname)s %(message)s"))
def align_audio_to_text(
    audio_file_path: str,
    text_file_path: str,
    opath: str,
    language: str = "ell",
) -> None:
    """Force-align an audio file to a plain-text transcript using aeneas.

    Writes a WebVTT sync map to ``opath``.

    Args:
        audio_file_path: Path to the audio file (e.g. an mp3).
        text_file_path: Path to the plain-text transcript (one cue per line).
        opath: Destination path for the generated .vtt sync map.
        language: ISO 639-3 language code (default "ell", Modern Greek).
    """
    logging.info(
        f"Starting alignment for audio: {audio_file_path} and text: {text_file_path}"
    )
    # Build the aeneas task configuration for the requested language.
    config = [
        f"task_language={language}",
        "is_text_type=plain",
        "os_task_file_format=vtt",
    ]
    config_string = "|".join(config)
    task = Task(config_string=config_string)
    task.audio_file_path_absolute = audio_file_path
    task.text_file_path_absolute = text_file_path
    task.sync_map_file_path_absolute = opath
    # Run the forced alignment and persist the resulting sync map.
    logging.debug(f"Executing alignment task with configuration:\n{config_string}")
    ExecuteTask(task).execute()
    task.output_sync_map_file()
    logging.debug(f"Alignment task completed. Sync map file saved to: {opath}")
# Paragraphs in the source text are separated by one blank line.
PARAGRAPH_SEPARATOR = "\n\n"


def split_text(text):
    """Split *text* into a list of paragraphs, each a list of sentences.

    Paragraphs are delimited by blank lines; a sentence ends with '.',
    '!' or '?' followed by whitespace (the punctuation is kept).
    """
    sentence_separator = r"(?<=[.!?])\s+"
    paragraphs = text.strip().split(PARAGRAPH_SEPARATOR)
    paragraph_sentences = [
        re.split(sentence_separator, par.strip()) for par in paragraphs
    ]
    return paragraph_sentences


def join_text(paragraph_sentences):
    """Inverse of split_text: rejoin sentences with spaces and paragraphs with blank lines."""
    paragraphs = [" ".join(sentences) for sentences in paragraph_sentences]
    return PARAGRAPH_SEPARATOR.join(paragraphs)
def update_config_cues(splitted_text, config_path):
    """Rewrite the "markup" key of the JSON config at *config_path* in place.

    Each paragraph after the first becomes a ``{"type": "p"}`` entry whose
    cueRange spans that paragraph's sentences. Numbering starts at 2 —
    cue 1 presumably corresponds to the skipped title paragraph.

    Args:
        splitted_text: List of paragraphs, each a list of sentences
            (the output of split_text).
        config_path: Path to the JSON config file to mutate.
    """
    with open(config_path, "r", encoding="utf-8") as f:
        config = json.load(f)
    n_paragraphs = len(splitted_text)
    n_total_sentences = sum(len(p) for p in splitted_text)
    logging.debug(f"{n_paragraphs=}{n_total_sentences=}")
    markup = []
    start = 2
    for paragraph in splitted_text[1:]:  # Skip title
        end = start + len(paragraph) - 1
        entry = {
            "type": "p",
            "children": [{"type": "cueRange", "start": start, "end": end}],
        }
        markup.append(entry)
        start = end + 1
    config["markup"] = markup
    with open(config_path, "w", encoding="utf-8") as f:
        json.dump(config, f, ensure_ascii=False, indent=2)
    logging.info(f"Updated config cues in {config_path}")
def format_text(text, config_path):
    """Return *text* reformatted so every sentence sits on its own line.

    Side effect: updates the cue ranges in the JSON config at
    *config_path* to match the sentence/paragraph structure.
    """
    splitted_text = split_text(text)
    # Probably should go elsewhere — mixing formatting with config
    # mutation makes this function do two jobs.
    update_config_cues(splitted_text, config_path)
    paragraphs = ["\n".join(sentences) for sentences in splitted_text]
    fmt_text = "\n".join(paragraphs)
    return fmt_text


def main():
    # Careful: the jpg has a different encoding for this lesson!
    folder = "Το δελφίνι"
    if not os.path.exists(folder):
        print(f"Could not find the folder {folder}. Exiting")
        return
    text_path = os.path.join(folder, "text.txt")
    audio_path = os.path.join(folder, "audio.mp3")
    config_path = os.path.join(folder, "config.json")
    # Context manager closes the handle; explicit UTF-8 since the text is Greek.
    with open(text_path, "r", encoding="utf-8") as f:
        text = f.read()
    # Write the one-sentence-per-line text to a temporary file so aeneas
    # emits one cue per sentence.
    text_formatted_path = os.path.join(folder, "text_fmt.txt")
    with open(text_formatted_path, "w", encoding="utf-8") as f:
        new_text = format_text(text, config_path)
        f.write(new_text)
    opath = os.path.join(folder, "transcript.vtt")
    align_audio_to_text(audio_path, text_formatted_path, opath)
    os.remove(text_formatted_path)
    logging.debug("Removed temporary text file")


if __name__ == "__main__":
    main()
The text was updated successfully, but these errors were encountered:
gbroques
changed the title
Research automatically generating VTT file give text and audio
Research automatically generating VTT file given text and audio
Oct 22, 2024
A Python library called aeneas can do this, and Rafa (Ράφας), a member of the Learning Greek Discord server, has experience with this.
aeneas
installation instructions can be found here:https://github.com/daxida/lingq/blob/924fc982351a4a6492c483e95af8662a957364d6/etc/forced_alignment/forced_alignment.py#L55-L81
Rafa contributed the following script, which is hardcoded for the dolphin text and expects to be run from the repository root.
It splits on sentences so there is an easy way to get the cues by looking at the original text which assumes some particular structure of the original text.
The text was updated successfully, but these errors were encountered: