include optional word time codes in forced alignment and transcription

baxtree · Feb 17, 2025 · a0a9e99 · a0a9e99
1 parent 6ce24da
commit a0a9e99
Show file tree

Hide file tree

Showing 13 changed files with 293 additions and 62 deletions.
diff --git a/README.md b/README.md
@@ -104,13 +104,16 @@ $ subaligner -m dual -v https://example.com/video.mp4 -s https://example.com/sub
 $ subaligner -m transcribe -v video.mp4 -ml eng -mr whisper -mf small -o subtitle_aligned.srt
 $ subaligner -m transcribe -v video.mp4 -ml zho -mr whisper -mf medium -o subtitle_aligned.srt
 $ subaligner -m transcribe -v video.mp4 -ml eng -mr whisper -mf turbo -ip "your initial prompt" -o subtitle_aligned.srt
+$ subaligner -m transcribe -v video.mp4 -ml eng -mr whisper -mf turbo -ip "your initial prompt" --word_time_codes -o raw_subtitle.json
 $ subaligner -m transcribe -v video.mp4 -s subtitle.srt -ml eng -mr whisper -mf turbo -o subtitle_aligned.srt
-$ subaligner -m transcribe -v video.mp4 -s subtitle.srt -upp -ml eng -mr whisper -mf turbo -o subtitle_aligned.srt
+$ subaligner -m transcribe -v video.mp4 -s subtitle.srt --use_prior_prompting -ml eng -mr whisper -mf turbo -o subtitle_aligned.srt
+
 ```
 ```
 # Alignment on segmented plain texts (double newlines as the delimiter)
 
 $ subaligner -m script -v video.mp4 -s subtitle.txt -o subtitle_aligned.srt
+$ subaligner -m script -v video.mp4 -s subtitle.txt --word_time_codes -o raw_subtitle.json
 $ subaligner -m script -v https://example.com/video.mp4 -s https://example.com/subtitle.txt -o subtitle_aligned.srt
 ```
 ```
@@ -175,7 +178,9 @@ $ docker run -v `pwd`:`pwd` -w `pwd` -it baxtree/subaligner subaligner -m dual -
 $ docker run -it baxtree/subaligner subaligner -m single -v https://example.com/video.mp4 -s https://example.com/subtitle.srt -o subtitle_aligned.srt
 $ docker run -it baxtree/subaligner subaligner -m dual -v https://example.com/video.mp4 -s https://example.com/subtitle.srt -o subtitle_aligned.srt
 ```
-The aligned subtitle will be saved at `subtitle_aligned.srt`. For details on CLIs, run `subaligner -h` or `subaligner_batch -h`, `subaligner_convert -h`, `subaligner_train -h` and `subaligner_tune -h` for additional utilities. `subaligner_1pass` and `subaligner_2pass` are shortcuts for running `subaligner` with `-m single` and `-m dual` options, respectively.
+The aligned subtitle will be saved at `subtitle_aligned.srt`. To obtain the subtitle in raw JSON format for downstream
+processing, replace the output file extension with `.json`. For details on the main CLI, run `subaligner -h`; for the additional
+utilities, run `subaligner_batch -h`, `subaligner_convert -h`, `subaligner_train -h` or `subaligner_tune -h`. `subaligner_1pass` and `subaligner_2pass` are shortcuts for running `subaligner` with the `-m single` and `-m dual` options, respectively.
 
 ![](figures/screencast.gif)
 

diff --git a/site/source/usage.rst b/site/source/usage.rst
@@ -27,12 +27,14 @@ Make sure you have got the virtual environment activated upfront.
     (.venv) $ subaligner -m transcribe -v video.mp4 -ml eng -mr whisper -mf small -o subtitle_aligned.srt
     (.venv) $ subaligner -m transcribe -v video.mp4 -ml zho -mr whisper -mf medium -o subtitle_aligned.srt
     (.venv) $ subaligner -m transcribe -v video.mp4 -ml eng -mr whisper -mf turbo -ip "your initial prompt" -o subtitle_aligned.srt
+    (.venv) $ subaligner -m transcribe -v video.mp4 -ml eng -mr whisper -mf turbo -ip "your initial prompt" --word_time_codes -o raw_subtitle.json
     (.venv) $ subaligner -m transcribe -v video.mp4 -s subtitle.srt -ml eng -mr whisper -mf turbo -o subtitle_aligned.srt
-    (.venv) $ subaligner -m transcribe -v video.mp4 -s subtitle.srt -upp -ml eng -mr whisper -mf turbo -o subtitle_aligned.srt
+    (.venv) $ subaligner -m transcribe -v video.mp4 -s subtitle.srt --use_prior_prompting -ml eng -mr whisper -mf turbo -o subtitle_aligned.srt
 
 **Alignment on segmented plain texts (double newlines as the delimiter)**::
 
     (.venv) $ subaligner -m script -v video.mp4 -s subtitle.txt -o subtitle_aligned.srt
+    (.venv) $ subaligner -m script -v video.mp4 -s subtitle.txt --word_time_codes -o raw_subtitle.json
     (.venv) $ subaligner -m script -v https://example.com/video.mp4 -s https://example.com/subtitle.txt -o subtitle_aligned.srt
 
 **Alignment on multiple subtitles against the single media file**::
@@ -80,6 +82,11 @@ Make sure you have got the virtual environment activated upfront.
     $ docker run -it baxtree/subaligner subaligner -m single -v https://example.com/video.mp4 -s https://example.com/subtitle.srt -o subtitle_aligned.srt
     $ docker run -it baxtree/subaligner subaligner -m dual -v https://example.com/video.mp4 -s https://example.com/subtitle.srt -o subtitle_aligned.srt
 
+The aligned subtitle will be saved at ``subtitle_aligned.srt``. To obtain the subtitle in raw JSON format for downstream
+processing, replace the output file extension with ``.json``. For details on the main CLI, run ``subaligner -h``; for the
+additional utilities, run ``subaligner_batch -h``, ``subaligner_convert -h``, ``subaligner_train -h`` or ``subaligner_tune -h``.
+``subaligner_1pass`` and ``subaligner_2pass`` are shortcuts for running ``subaligner`` with the ``-m single`` and ``-m dual`` options, respectively.
+
 **Run alignments with pipx**::
 
     $ pipx run subaligner -m single -v video.mp4 -s subtitle.srt

diff --git a/subaligner/__main__.py b/subaligner/__main__.py
@@ -5,14 +5,14 @@
                   [-fos] [-tod TRAINING_OUTPUT_DIRECTORY] [-o OUTPUT] [-t TRANSLATE] [-os OFFSET_SECONDS]
                   [-ml {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}]
                   [-mr {whisper}] [-mf {tiny,tiny.en,small,medium,medium.en,base,base.en,large-v1,large-v2,large-v3,large,turbo}] [-ip INITIAL_PROMPT] [-mcl MAX_CHAR_LENGTH]
-                  [-tr {helsinki-nlp,whisper,facebook-mbart,facebook-m2m100}] [-tf TRANSLATION_FLAVOUR] [-mpt MEDIA_PROCESS_TIMEOUT] [-sat SEGMENT_ALIGNMENT_TIMEOUT] [-wt] [-upp] [-lgs] [-d] [-q] [-ver]
+                  [-tr {helsinki-nlp,whisper,facebook-mbart,facebook-m2m100}] [-tf TRANSLATION_FLAVOUR] [-mpt MEDIA_PROCESS_TIMEOUT] [-sat SEGMENT_ALIGNMENT_TIMEOUT] [-upp] [-wtc] [-lgs] [-d] [-q] [-ver]
 
 Subaligner command line interface
 
 optional arguments:
   -h, --help            show this help message and exit
   -s SUBTITLE_PATH [SUBTITLE_PATH ...], --subtitle_path SUBTITLE_PATH [SUBTITLE_PATH ...]
-                        File path or URL to the subtitle file (Extensions of supported subtitles: .vtt, .srt, .ass, .sbv, .sub, .txt, .ttml, .ssa, .dfxp, .ytt, .stl, .tmp, .smi, .scc, .sami, .xml) or selector for the embedded subtitle (e.g., embedded:page_num=888 or embedded:stream_index=0)
+                        File path or URL to the subtitle file (Extensions of supported subtitles: .ssa, .tmp, .srt, .sbv, .stl, .json, .sami, .ttml, .smi, .txt, .scc, .sub, .ass, .vtt, .xml, .ytt, .dfxp) or selector for the embedded subtitle (e.g., embedded:page_num=888 or embedded:stream_index=0)
   -l MAX_LOGLOSS, --max_logloss MAX_LOGLOSS
                         Max global log loss for alignment
   -so, --stretch_on     Switch on stretch on subtitles)
@@ -48,6 +48,8 @@
                         Maximum waiting time in seconds when aligning each segment
   -upp, --use_prior_prompting
                         Whether to use the previous subtitle cue as the current prompt.
+  -wtc, --word_time_codes
+                        Whether to output time codes for each word in the subtitle file.
   -lgs, --languages     Print out language codes used for stretch and translation
   -d, --debug           Print out debugging information
   -q, --quiet           Switch off logging information
@@ -231,6 +233,8 @@ def main():
     )
     parser.add_argument("-upp", "--use_prior_prompting", action="store_true",
                         help="Whether to use the previous subtitle cue as the current prompt.")
+    parser.add_argument("-wtc", "--word_time_codes", action="store_true",
+                        help="Whether to output time codes for each word in the subtitle file.")
     parser.add_argument("-lgs", "--languages", action="store_true",
                         help="Print out language codes used for stretch and translation")
     parser.add_argument("-d", "--debug", action="store_true",
@@ -364,6 +368,7 @@ def main():
                         video_file_path=local_video_path,
                         subtitle_file_path=local_subtitle_path,
                         stretch_in_lang=stretch_in_lang,
+                        with_word_time_codes=FLAGS.word_time_codes,
                     )
                 elif FLAGS.mode == "transcribe":
                     from subaligner.transcriber import Transcriber
@@ -372,13 +377,15 @@ def main():
                         subtitle, frame_rate = transcriber.transcribe(video_file_path=local_video_path,
                                                                       language_code=stretch_in_lang,
                                                                       initial_prompt=FLAGS.initial_prompt,
-                                                                      max_char_length=FLAGS.max_char_length)
+                                                                      max_char_length=FLAGS.max_char_length,
+                                                                      with_word_time_codes=FLAGS.word_time_codes)
                     else:
                         subtitle, frame_rate = transcriber.transcribe_with_subtitle_as_prompts(video_file_path=local_video_path,
                                                                                                subtitle_file_path=local_subtitle_path,
                                                                                                language_code=stretch_in_lang,
                                                                                                max_char_length=FLAGS.max_char_length,
-                                                                                               use_prior_prompting=FLAGS.use_prior_prompting)
+                                                                                               use_prior_prompting=FLAGS.use_prior_prompting,
+                                                                                               with_word_time_codes=FLAGS.word_time_codes,)
                     aligned_subs = subtitle.subs
                 else:
                     print("ERROR: Unknown mode {}".format(FLAGS.mode))

diff --git a/subaligner/predictor.py b/subaligner/predictor.py
@@ -140,13 +140,18 @@ def predict_dual_pass(
             if os.path.exists(audio_file_path):
                 os.remove(audio_file_path)
 
-    def predict_plain_text(self, video_file_path: str, subtitle_file_path: str, stretch_in_lang: str = "eng") -> tuple:
+    def predict_plain_text(self,
+                           video_file_path: str,
+                           subtitle_file_path: str,
+                           stretch_in_lang: str = "eng",
+                           with_word_time_codes: bool = False) -> tuple:
         """Predict time to shift with plain texts
 
             Arguments:
                 video_file_path {string} -- The input video file path.
                 subtitle_file_path {string} -- The path to the subtitle file.
                 stretch_in_lang {str} -- The language used for stretching subtitles (default: {"eng"}).
+                with_word_time_codes {bool} -- True to output time codes for each word (default: {False}).
 
             Returns:
                 tuple: The shifted subtitles, the audio file path (None) and the voice probabilities of the original audio (None).
@@ -178,9 +183,22 @@ def predict_plain_text(self, video_file_path: str, subtitle_file_path: str, stre
         runtime_config_string = "dtw_algorithm=stripe"  # stripe or exact
         task = Task(config_string=task_config_string)
 
+        path = None
+        if with_word_time_codes:
+            _, path = tempfile.mkstemp()
+            processed = []
+            with open(subtitle_file_path, "r", encoding="utf-8") as f:
+                for line in f.readlines():
+                    # TODO: Use tokenizers to process languages that do not use spaces as word delimiters
+                    processed.append((os.linesep * 2).join(line.strip().split()) if line.strip() else os.linesep)
+
+            with open(path, "w", encoding="utf-8") as f:
+                f.write((os.linesep * 2).join(processed))
+                f.flush()
+
         try:
             task.audio_file_path_absolute = audio_file_path
-            task.text_file_path_absolute = subtitle_file_path
+            task.text_file_path_absolute = subtitle_file_path if not with_word_time_codes else path
             task.sync_map_file_path_absolute = "{}.srt".format(root)
 
             tee = False if self.__LOGGER.level == getattr(logging, 'DEBUG') else True
@@ -205,7 +223,8 @@ def predict_plain_text(self, video_file_path: str, subtitle_file_path: str, stre
             try:
                 frame_rate = self.__media_helper.get_frame_rate(video_file_path)
                 self.__feature_embedder.step_sample = 1 / frame_rate
-                self.__on_frame_timecodes(adjusted_subs)
+                if not with_word_time_codes:
+                    self.__on_frame_timecodes(adjusted_subs)
             except NoFrameRateException:
                 self.__LOGGER.warning("Cannot detect the frame rate for %s" % video_file_path)
 
@@ -220,6 +239,8 @@ def predict_plain_text(self, video_file_path: str, subtitle_file_path: str, stre
                 os.remove(task.audio_file_path_absolute)
             if task.sync_map_file_path_absolute is not None and os.path.exists(task.sync_map_file_path_absolute):
                 os.remove(task.sync_map_file_path_absolute)
+            if path is not None and os.path.exists(path):
+                os.remove(path)
 
     def get_log_loss(self, voice_probabilities: np.ndarray, subs: List[SubRipItem]) -> float:
         """Returns a single loss value on voice prediction

diff --git a/subaligner/subtitle.py b/subaligner/subtitle.py
@@ -45,6 +45,7 @@ class Subtitle(object):
     SCC_EXTENSIONS = [".scc"]
     SBV_EXTENSIONS = [".sbv"]
     YT_TRANSCRIPT_EXTENSIONS = [".ytt"]
+    JSON_RAW_EXTENSIONS = [".json"]
 
     def __init__(self, secret: object, subtitle_file_path: str, subtitle_format: str) -> None:
         assert (
@@ -81,6 +82,8 @@ def __init__(self, secret: object, subtitle_file_path: str, subtitle_format: str
             self.__subs = self.__convert_sbv_to_subs(subtitle_file_path)
         elif subtitle_format == "ytt":
             self.__subs = self.__convert_ytt_to_subs(subtitle_file_path)
+        elif subtitle_format == "json":
+            self.__subs = self.__convert_json_raw_to_subs(subtitle_file_path)
         else:
             raise UnsupportedFormatException(
                 "Unknown subtitle format for file: {}".format(subtitle_file_path)
@@ -272,6 +275,19 @@ def load_ytt(cls, subtitle_file_path: str) -> "Subtitle":
 
         return cls(cls.__secret, subtitle_file_path, "ytt")
 
+    @classmethod
+    def load_json(cls, subtitle_file_path: str) -> "Subtitle":
+        """Build a Subtitle from a file stored in the raw JSON subtitle format.
+
+        Arguments:
+            subtitle_file_path {string} -- Location of the JSON subtitle file to read.
+
+        Returns:
+            Subtitle: The object constructed with the "json" subtitle format.
+        """
+
+        return cls(cls.__secret, subtitle_file_path=subtitle_file_path, subtitle_format="json")
+
     @classmethod
     def load(cls, subtitle_file_path: str) -> "Subtitle":
         """Load a SubRip or TTML subtitle file based on the file extension.
@@ -310,6 +326,8 @@ def load(cls, subtitle_file_path: str) -> "Subtitle":
             return cls(cls.__secret, subtitle_file_path, "sbv")
         elif file_extension in cls.YT_TRANSCRIPT_EXTENSIONS:
             return cls(cls.__secret, subtitle_file_path, "ytt")
+        elif file_extension in cls.JSON_RAW_EXTENSIONS:
+            return cls(cls.__secret, subtitle_file_path, "json")
         else:
             return cls(cls.__secret, subtitle_file_path, "unknown")
 
@@ -380,6 +398,8 @@ def shift_subtitle(
                 subs = cls(cls.__secret, subtitle_file_path, "sbv").subs
             elif file_extension.lower() in cls.YT_TRANSCRIPT_EXTENSIONS:
                 subs = cls(cls.__secret, subtitle_file_path, "ytt").subs
+            elif file_extension.lower() in cls.JSON_RAW_EXTENSIONS:
+                subs = cls(cls.__secret, subtitle_file_path, "json").subs
             else:
                 raise UnsupportedFormatException(
                     "Unknown subtitle format for file: {}".format(subtitle_file_path)
@@ -493,7 +513,7 @@ def subtitle_extensions() -> set:
                    + Subtitle.SSA_EXTENTIONS + Subtitle.ADVANCED_SSA_EXTENTIONS + Subtitle.MICRODVD_EXTENSIONS
                    + Subtitle.MPL2_EXTENSIONS + Subtitle.TMP_EXTENSIONS + Subtitle.SAMI_EXTENSIONS
                    + Subtitle.STL_EXTENSIONS + Subtitle.SCC_EXTENSIONS + Subtitle.SBV_EXTENSIONS
-                   + Subtitle.YT_TRANSCRIPT_EXTENSIONS)
+                   + Subtitle.YT_TRANSCRIPT_EXTENSIONS + Subtitle.JSON_RAW_EXTENSIONS)
 
     @property
     def subtitle_file_path(self) -> str:
@@ -712,6 +732,21 @@ def __convert_ytt_to_subs(ytt_file_path: str) -> SubRipFile:
 
         return Subtitle._get_srt_subs(path, housekeep=True)
 
+    @staticmethod
+    def __convert_json_raw_to_subs(json_file_path: str) -> SubRipFile:
+        """Convert a subtitle file from the JSON raw format to the SubRip format
+
+        The JSON content is first written out as SubRip into a temporary file, which is
+        then parsed into SubRip items.
+
+        Arguments:
+            json_file_path {string} -- The path to the JSON subtitle file.
+
+        Returns:
+            SubRipFile: A list of SubRipItems.
+        """
+        # mkstemp() hands back an already-open OS-level file descriptor along with the path.
+        # Only the path is used below, so close the descriptor immediately to avoid leaking it.
+        handle, path = tempfile.mkstemp()
+        os.close(handle)
+        Utils.json2srt(json_file_path, path)
+
+        # NOTE(review): housekeep=True presumably lets _get_srt_subs delete the temporary file
+        # once it has been parsed -- confirm against that helper.
+        return Subtitle._get_srt_subs(path, housekeep=True)
+
     @staticmethod
     def __export_with_format(subs: List[SubRipItem], source_file_path: str, target_file_path: Optional[str], file_extension: str, suffix: str) -> None:
         if target_file_path is None:
@@ -833,6 +868,13 @@ def __save_subtitle_by_extension(file_extension: str,
                 Utils.srt2ytt(path, target_file_path)
             finally:
                 os.remove(path)
+        elif file_extension in Subtitle.JSON_RAW_EXTENSIONS:
+            try:
+                _, path = tempfile.mkstemp()
+                SubRipFile(subs).save(path, encoding=encoding)
+                Utils.srt2json(path, target_file_path)
+            finally:
+                os.remove(path)
         else:
             raise UnsupportedFormatException(
                 "Unknown subtitle format for file: {}".format(source_file_path)