[detectors] Implement Koala-36M

Breakthrough · Breakthrough · commit 1b757a3ce2bb · 2025-02-19T21:44:18.000-05:00
Implement algorithm similar to that described in Koala-36M. Add `KoalaDetector` and `detect-koala` command. #441
diff --git a/benchmarks/README.md b/benchmarks/README.md
@@ -37,6 +37,7 @@ The following results indicate that ContentDetector achieves the highest perform
 |    HashDetector   |  92.96 |   76.27   | 83.79 |         16.26         |
 | HistogramDetector |  90.55 |   72.76   | 80.68 |         16.13         |
 | ThresholdDetector |  0.00  |   0.00    |  0.00 |         18.95         |
+|   KoalaDetector   |  86.83 |   78.38   | 82.39 |         97.75         |
 
 ## Citation
 ### BBC
diff --git a/benchmarks/bbc_dataset.py b/benchmarks/bbc_dataset.py
@@ -12,15 +12,19 @@ class BBCDataset:
 
     def __init__(self, dataset_dir: str):
         self._video_files = [
-            file for file in sorted(glob.glob(os.path.join(dataset_dir, "videos", "*.mp4")))
+            file
+            for file in sorted(
+                glob.glob(os.path.join("benchmarks", dataset_dir, "videos", "*.mp4"))
+            )
         ]
         self._scene_files = [
-            file for file in sorted(glob.glob(os.path.join(dataset_dir, "fixed", "*.txt")))
+            file
+            for file in sorted(glob.glob(os.path.join("benchmarks", dataset_dir, "fixed", "*.txt")))
         ]
         assert len(self._video_files) == len(self._scene_files)
         for video_file, scene_file in zip(self._video_files, self._scene_files):
             video_id = os.path.basename(video_file).replace("bbc_", "").split(".")[0]
-            scene_id = os.path.basename(scene_file).split("_")[0]
+            scene_id = os.path.basename(scene_file).split("-")[0]
             assert video_id == scene_id
 
     def __getitem__(self, index):
diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py
@@ -10,8 +10,10 @@
     ContentDetector,
     HashDetector,
     HistogramDetector,
+    KoalaDetector,
+    SceneManager,
     ThresholdDetector,
-    detect,
+    open_video,
 )
 
 
@@ -22,6 +24,7 @@ def make_detector(detector_name: str):
         "detect-hash": HashDetector(),
         "detect-hist": HistogramDetector(),
         "detect-threshold": ThresholdDetector(),
+        "detect-koala": KoalaDetector(),
     }
     return detector_map[detector_name]
 
@@ -31,7 +34,19 @@ def _detect_scenes(detector_type: str, dataset):
     for video_file, scene_file in tqdm(dataset):
         start = time.time()
         detector = make_detector(detector_type)
-        pred_scene_list = detect(video_file, detector)
+
+        video = open_video(video_file)
+        scene_manager = SceneManager()
+        scene_manager.add_detector(detector)
+        # TODO: We should also do this for detect-hash.
+        if detector_type == "detect-koala":
+            scene_manager.auto_downscale = False
+        scene_manager.detect_scenes(
+            video=video,
+            show_progress=True,
+        )
+        pred_scene_list = scene_manager.get_scene_list()
+
         elapsed = time.time() - start
         scenes = {
             scene_file: {
@@ -74,6 +89,7 @@ def main(args):
             "detect-hash",
             "detect-hist",
             "detect-threshold",
+            "detect-koala",
         ],
         default="detect-content",
         help="Detector name. Implemented detectors are listed: https://www.scenedetect.com/docs/latest/cli.html",
diff --git a/dist/requirements_windows.txt b/dist/requirements_windows.txt
@@ -7,6 +7,7 @@ moviepy==2.1.1
 numpy==2.1.3
 platformdirs==4.3.6
 tqdm==4.67.1
+scikit-image==0.24.0
 
 # Build-only and test-only requirements.
 pyinstaller
diff --git a/requirements.txt b/requirements.txt
@@ -8,3 +8,4 @@ opencv-python
 platformdirs
 pytest>=7.0
 tqdm
+scikit-image
diff --git a/requirements_headless.txt b/requirements_headless.txt
@@ -7,4 +7,5 @@ numpy
 opencv-python-headless
 platformdirs
 pytest>=7.0
-tqdm
+scikit-image
+tqdm
diff --git a/scenedetect/__init__.py b/scenedetect/__init__.py
@@ -42,6 +42,7 @@
     ThresholdDetector,
     HistogramDetector,
     HashDetector,
+    KoalaDetector,
 )
 from scenedetect.backends import (
     AVAILABLE_BACKENDS,
diff --git a/scenedetect/_cli/__init__.py b/scenedetect/_cli/__init__.py
@@ -42,6 +42,7 @@
     ContentDetector,
     HashDetector,
     HistogramDetector,
+    KoalaDetector,
     ThresholdDetector,
 )
 from scenedetect.platform import get_cv2_imwrite_params, get_system_version_info
@@ -1590,3 +1591,16 @@ def save_qp_command(
 scenedetect.add_command(list_scenes_command)
 scenedetect.add_command(save_images_command)
 scenedetect.add_command(split_video_command)
+
+
+@click.command("detect-koala", cls=Command, help="""WIP""")
+@click.pass_context
+def detect_koala_command(
+    ctx: click.Context,
+):
+    ctx = ctx.obj
+    assert isinstance(ctx, CliContext)
+    ctx.add_detector(KoalaDetector, {"min_scene_len": None})
+
+
+scenedetect.add_command(detect_koala_command)
diff --git a/scenedetect/detectors/__init__.py b/scenedetect/detectors/__init__.py
@@ -40,6 +40,7 @@
 from scenedetect.detectors.adaptive_detector import AdaptiveDetector
 from scenedetect.detectors.hash_detector import HashDetector
 from scenedetect.detectors.histogram_detector import HistogramDetector
+from scenedetect.detectors.koala_detector import KoalaDetector
 
 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
 #                                                                             #
diff --git a/scenedetect/detectors/koala_detector.py b/scenedetect/detectors/koala_detector.py
@@ -0,0 +1,88 @@
+#
+#            PySceneDetect: Python-Based Video Scene Detector
+#   -------------------------------------------------------------------
+#     [  Site:    https://scenedetect.com                           ]
+#     [  Docs:    https://scenedetect.com/docs/                     ]
+#     [  Github:  https://github.com/Breakthrough/PySceneDetect/    ]
+#
+# Copyright (C) 2014-2024 Brandon Castellano <http://www.bcastell.com>.
+# PySceneDetect is licensed under the BSD 3-Clause License; see the
+# included LICENSE file, or visit one of the above pages for details.
+#
+""":class:`KoalaDetector` uses the detection method described by Koala-36M.
+See https://koala36m.github.io/ for details.
+
+TODO: Cite correctly.
+
+This detector is available from the command-line as the `detect-koala` command.
+"""
+
+import typing as ty
+
+import cv2
+import numpy as np
+from skimage.metrics import structural_similarity
+
+from scenedetect.scene_detector import SceneDetector
+
+
+class KoalaDetector(SceneDetector):
+    def __init__(self, min_scene_len: int = None):
+        self._start_frame_num: int = None
+        self._min_scene_len: int = min_scene_len if min_scene_len else 0
+        self._last_histogram: np.ndarray = None
+        self._last_edges: np.ndarray = None
+        self._scores: ty.List[ty.List[int]] = []
+
+        # Tunables (TODO: Make these config params):
+
+        # Boxcar filter size (should be <= window size)
+        self._filter_size: int = 3
+        # Window to use for calculating threshold (should be >= filter size).
+        self._window_size: int = 8
+        # Multiplier for standard deviations when calculating threshold.
+        self._deviation: float = 3.0
+
+    def process_frame(self, frame_num: int, frame_img: np.ndarray) -> ty.List[int]:
+        # TODO: frame_img is already downscaled here. The same problem exists in HashDetector.
+        # For now we can just set downscale factor to 1 in SceneManager to work around the issue.
+        frame_img = cv2.resize(frame_img, (256, 256))
+        histogram = np.asarray(
+            [cv2.calcHist([c], [0], None, [254], [1, 255]) for c in cv2.split(frame_img)]
+        )
+        # TODO: Make the parameters below tunable.
+        frame_gray = cv2.resize(cv2.cvtColor(frame_img, cv2.COLOR_BGR2GRAY), (128, 128))
+        edges = np.maximum(frame_gray, cv2.Canny(frame_gray, 100, 200))
+        if self._start_frame_num is not None:
+            delta_histogram = cv2.compareHist(self._last_histogram, histogram, cv2.HISTCMP_CORREL)
+            delta_edges = structural_similarity(self._last_edges, edges, data_range=255)
+            score = 4.61480465 * delta_histogram + 3.75211168 * delta_edges - 5.485968377115124
+            self._scores.append(score)
+        if self._start_frame_num is None:
+            self._start_frame_num = frame_num
+        self._last_histogram = histogram
+        self._last_edges = edges
+        return []
+
+    def post_process(self, frame_num: int) -> ty.List[int]:
+        cut_found = [score < 0.0 for score in self._scores]
+        cut_found.append(True)
+        filter = [1] * self._filter_size
+        cutoff = float(self._filter_size) / float(self._filter_size + 1)
+        filtered = np.convolve(self._scores, filter, mode="same")
+        for frame_num in range(len(self._scores)):
+            if frame_num >= self._window_size and filtered[frame_num] < cutoff:
+                # TODO: Should we discard the N most extreme values before calculating threshold?
+                window = filtered[frame_num - self._window_size : frame_num]
+                threshold = window.mean() - (self._deviation * window.std())
+                if filtered[frame_num] < threshold:
+                    cut_found[frame_num] = True
+
+        cuts = []
+        last_cut = 0
+        for frame_num in range(len(cut_found)):
+            if cut_found[frame_num]:
+                if (frame_num - last_cut) > self._window_size:
+                    cuts.append(last_cut)
+                last_cut = frame_num + 1
+        return [cut + self._start_frame_num for cut in cuts][1:]
diff --git a/tests/test_detectors.py b/tests/test_detectors.py
@@ -29,6 +29,7 @@
     ContentDetector,
     HashDetector,
     HistogramDetector,
+    KoalaDetector,
     ThresholdDetector,
 )
 
@@ -37,6 +38,7 @@
     ContentDetector,
     HashDetector,
     HistogramDetector,
+    KoalaDetector,
 )
 
 ALL_DETECTORS: ty.Tuple[ty.Type[SceneDetector]] = (*FAST_CUT_DETECTORS, ThresholdDetector)
@@ -123,7 +125,9 @@ def get_fast_cut_test_cases():
             ),
             id="%s/m=30" % detector_type.__name__,
         )
+        # TODO: Make this work, right now min_scene_len isn't used by the detector.
         for detector_type in FAST_CUT_DETECTORS
+        if detector_type != KoalaDetector
     ]
     return test_cases
 

Original file line number	Diff line number	Diff line change
`@@ -42,6 +42,7 @@`
`42`	`42`	`ThresholdDetector,`
`43`	`43`	`HistogramDetector,`
`44`	`44`	`HashDetector,`
	`45`	`+ KoalaDetector,`
`45`	`46`	`)`
`46`	`47`	`from scenedetect.backends import (`
`47`	`48`	`AVAILABLE_BACKENDS,`
Original file line number	Diff line number	Diff line change
`@@ -29,6 +29,7 @@`
`29`	`29`	`ContentDetector,`
`30`	`30`	`HashDetector,`
`31`	`31`	`HistogramDetector,`
	`32`	`+ KoalaDetector,`
`32`	`33`	`ThresholdDetector,`
`33`	`34`	`)`
`34`	`35`
`@@ -37,6 +38,7 @@`
`37`	`38`	`ContentDetector,`
`38`	`39`	`HashDetector,`
`39`	`40`	`HistogramDetector,`
	`41`	`+ KoalaDetector,`
`40`	`42`	`)`
`41`	`43`
`42`	`44`	`ALL_DETECTORS: ty.Tuple[ty.Type[SceneDetector]] = (*FAST_CUT_DETECTORS, ThresholdDetector)`
`@@ -123,7 +125,9 @@ def get_fast_cut_test_cases():`
`123`	`125`	`),`
`124`	`126`	`id="%s/m=30" % detector_type.__name__,`
`125`	`127`	`)`
	`128`	`+ # TODO: Make this work, right now min_scene_len isn't used by the detector.`
`126`	`129`	`for detector_type in FAST_CUT_DETECTORS`
	`130`	`+ if detector_type != KoalaDetector`
`127`	`131`	`]`
`128`	`132`	`return test_cases`
`129`	`133`