src/modules/core/transition_mix.c

/*
 * transition_mix.c -- mix two audio streams
 * Copyright (C) 2003-2020 Meltytech, LLC
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 */

#include <framework/mlt_frame.h>
#include <framework/mlt_log.h>
#include <framework/mlt_transition.h>

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_SAMPLES (192000)
#define SAMPLE_BYTES(samples, channels) ((samples) * (channels) * sizeof(float))
#define MAX_BYTES SAMPLE_BYTES(MAX_SAMPLES, MAX_CHANNELS)

typedef struct transition_mix_s
{
    mlt_transition parent;
    float src_buffer[MAX_SAMPLES * MAX_CHANNELS];
    float dest_buffer[MAX_SAMPLES * MAX_CHANNELS];
    int src_buffer_count;
    int dest_buffer_count;
    mlt_position previous_frame_a;
    mlt_position previous_frame_b;
} * transition_mix;

static void mix_audio(double weight_start,
                      double weight_end,
                      float *buffer_a,
                      float *buffer_b,
                      int channels_a,
                      int channels_b,
                      int channels_out,
                      int samples)
{
    int i, j;
    double a, b, v;

    // Compute a smooth ramp over start to end
    double mix = weight_start;
    double mix_step = (weight_end - weight_start) / samples;

    for (i = 0; i < samples; i++) {
        for (j = 0; j < channels_out; j++) {
            a = (double) buffer_a[i * channels_a + j];
            b = (double) buffer_b[i * channels_b + j];
            v = mix * b + (1.0 - mix) * a;
            buffer_a[i * channels_a + j] = v;
        }
        mix += mix_step;
    }
}

static void sum_audio(double weight_start,
                      double weight_end,
                      float *buffer_a,
                      float *buffer_b,
                      int channels_a,
                      int channels_b,
                      int channels_out,
                      int samples)
{
    int i, j;
    double a, b;

    // Compute a smooth ramp over start to end
    double mix = weight_start;
    double mix_step = (weight_end - weight_start) / samples;

    for (i = 0; i < samples; i++) {
        for (j = 0; j < channels_out; j++) {
            a = (double) buffer_a[i * channels_a + j];
            b = (double) buffer_b[i * channels_b + j];
            buffer_a[i * channels_a + j] = mix * b + a;
        }
        mix += mix_step;
    }
}

// This filter uses an inline low pass filter to allow mixing without volume hacking.
static void combine_audio(double weight,
                          float *buffer_a,
                          float *buffer_b,
                          int channels_a,
                          int channels_b,
                          int channels_out,
                          int samples)
{
    int i, j;
    double Fc = 0.5;
    double B = exp(-2.0 * M_PI * Fc);
    double A = 1.0 - B;
    double a, b, v;
    double v_prev[MAX_CHANNELS];

    for (j = 0; j < channels_out; j++)
        v_prev[j] = (double) buffer_a[j];

    for (i = 0; i < samples; i++) {
        for (j = 0; j < channels_out; j++) {
            a = (double) buffer_a[i * channels_a + j];
            b = (double) buffer_b[i * channels_b + j];
            v = weight * a + b;
            v_prev[j] = buffer_a[i * channels_a + j] = v * A + v_prev[j] * B;
        }
    }
}

/** Get the audio.
*/

static int transition_get_audio(mlt_frame frame_a,
                                void **buffer,
                                mlt_audio_format *format,
                                int *frequency,
                                int *channels,
                                int *samples)
{
    int error = 0;

    // Get the b frame from the stack
    mlt_frame frame_b = mlt_frame_pop_audio(frame_a);

    // Get the effect
    mlt_transition transition = mlt_frame_pop_audio(frame_a);

    // Get the properties of the b frame
    mlt_properties b_props = MLT_FRAME_PROPERTIES(frame_b);

    transition_mix self = transition->child;
    float *buffer_b, *buffer_a;
    int frequency_b = *frequency, frequency_a = *frequency;
    int channels_b = *channels, channels_a = *channels;
    int samples_b = *samples, samples_a = *samples;

    // We can only mix interleaved 32-bit float.
    *format = mlt_audio_f32le;
    // Get the audio from our producers
    mlt_frame_get_audio(frame_b, (void **) &buffer_b, format, &frequency_b, &channels_b, &samples_b);
    mlt_frame_get_audio(frame_a, (void **) &buffer_a, format, &frequency_a, &channels_a, &samples_a);

    // Prevent dividing by zero.
    if (!channels_a || !channels_b || !buffer_a || !buffer_b)
        return 1;

    if (buffer_b == buffer_a) {
        *samples = samples_b;
        *channels = channels_b;
        *buffer = buffer_b;
        *frequency = frequency_b;
        return error;
    }

    // I do not recall what these silent_audio properties are about.
    int silent = mlt_properties_get_int(MLT_FRAME_PROPERTIES(frame_a), "silent_audio");
    mlt_properties_set_int(MLT_FRAME_PROPERTIES(frame_a), "silent_audio", 0);
    if (silent)
        memset(buffer_a, 0, samples_a * channels_a * sizeof(float));
    silent = mlt_properties_get_int(b_props, "silent_audio");
    mlt_properties_set_int(b_props, "silent_audio", 0);
    if (silent)
        memset(buffer_b, 0, samples_b * channels_b * sizeof(float));

        // At this point we have two frames of audio with possibly differing sample
        // counts. How to reconcile this?

#ifdef KEEP_IT_SIMPLE_AND_STUPID
    // The simple and stupid way to deal with different sample counts was to
    // use the lesser of the two. This sounds good. You can #define SIMPLE_AND_STUPID
    // and hear what it sounds like.
    *samples = MIN(samples_a, samples_b);
    *channels = MIN(MIN(channels_b, channels_a), MAX_CHANNELS);
    *frequency = frequency_a;
    // Note this direct call to sum_audio() skips ramping and the alternative
    // mixing methods.
    sum_audio(1, 1, buffer_a, buffer_b, channels_a, channels_b, *channels, *samples);
    *buffer = buffer_a;

    return error;
#endif

    // However, the simple and stupid approach drops samples. Over time, this
    // can accumulate and cause an A/V sync drift, which addressed in b2640656
    // by saving the unused samples in a buffer and then using them first on the
    // next iteration.

    // determine number of samples to process
    *samples = MIN(self->src_buffer_count + samples_b, self->dest_buffer_count + samples_a);
    *channels = MIN(MIN(channels_b, channels_a), MAX_CHANNELS);
    *frequency = frequency_a;

    // Prevent src buffer overflow by discarding oldest samples.
    samples_b = MIN(samples_b, MAX_SAMPLES * MAX_CHANNELS / channels_b);
    size_t bytes = SAMPLE_BYTES(samples_b, channels_b);
    if (SAMPLE_BYTES(self->src_buffer_count + samples_b, channels_b) > MAX_BYTES) {
        mlt_log_verbose(MLT_TRANSITION_SERVICE(transition),
                        "buffer overflow: src_buffer_count %d\n",
                        self->src_buffer_count);
        self->src_buffer_count = MAX_SAMPLES * MAX_CHANNELS / channels_b - samples_b;
        memmove(self->src_buffer,
                &self->src_buffer[MAX_SAMPLES * MAX_CHANNELS - samples_b * channels_b],
                SAMPLE_BYTES(samples_b, channels_b));
    }

    // Silence src buffer if discontinuity
    if (self->src_buffer_count > 0 && mlt_frame_get_position(frame_b) != self->previous_frame_b + 1)
        memset(self->src_buffer, 0, SAMPLE_BYTES(self->src_buffer_count, channels_b));
    self->previous_frame_b = mlt_frame_get_position(frame_b);

    // Append the new samples from frame B to the src buffer
    memcpy(&self->src_buffer[self->src_buffer_count * channels_b], buffer_b, bytes);
    self->src_buffer_count += samples_b;
    buffer_b = self->src_buffer;

    // Prevent dest buffer overflow by discarding oldest samples.
    samples_a = MIN(samples_a, MAX_SAMPLES * MAX_CHANNELS / channels_a);
    bytes = SAMPLE_BYTES(samples_a, channels_a);
    if (SAMPLE_BYTES(self->dest_buffer_count + samples_a, channels_a) > MAX_BYTES) {
        mlt_log_verbose(MLT_TRANSITION_SERVICE(transition),
                        "buffer overflow: dest_buffer_count %d\n",
                        self->dest_buffer_count);
        self->dest_buffer_count = MAX_SAMPLES * MAX_CHANNELS / channels_a - samples_a;
        memmove(self->dest_buffer,
                &self->dest_buffer[MAX_SAMPLES * MAX_CHANNELS - samples_a * channels_a],
                SAMPLE_BYTES(samples_a, channels_a));
    }

    // Silence dest buffer if discontinuity
    if (self->dest_buffer_count > 0 && mlt_frame_get_position(frame_a) != self->previous_frame_a + 1)
        memset(self->dest_buffer, 0, SAMPLE_BYTES(self->dest_buffer_count, channels_a));
    self->previous_frame_a = mlt_frame_get_position(frame_a);

    // Append the new samples from frame A to the dest buffer
    memcpy(&self->dest_buffer[self->dest_buffer_count * channels_a], buffer_a, bytes);
    self->dest_buffer_count += samples_a;
    buffer_a = self->dest_buffer;

    // Do the mixing.
    if (mlt_properties_get_int(MLT_TRANSITION_PROPERTIES(transition), "sum")) {
        double mix_start = 1.0, mix_end = 1.0;
        if (mlt_properties_get(b_props, "audio.previous_mix"))
            mix_start = mlt_properties_get_double(b_props, "audio.previous_mix");
        if (mlt_properties_get(b_props, "audio.mix"))
            mix_end = mlt_properties_get_double(b_props, "audio.mix");
        if (mlt_properties_get_int(b_props, "audio.reverse")) {
            mix_start = 1.0 - mix_start;
            mix_end = 1.0 - mix_end;
        }
        sum_audio(mix_start, mix_end, buffer_a, buffer_b, channels_a, channels_b, *channels, *samples);
    } else if (mlt_properties_get_int(MLT_TRANSITION_PROPERTIES(transition), "combine")) {
        double weight = 1.0;
        if (mlt_properties_get_int(MLT_FRAME_PROPERTIES(frame_a), "meta.mixdown"))
            weight = 1.0 - mlt_properties_get_double(MLT_FRAME_PROPERTIES(frame_a), "meta.volume");
        combine_audio(weight, buffer_a, buffer_b, channels_a, channels_b, *channels, *samples);
    } else {
        double mix_start = 0.5, mix_end = 0.5;
        if (mlt_properties_get(b_props, "audio.previous_mix"))
            mix_start = mlt_properties_get_double(b_props, "audio.previous_mix");
        if (mlt_properties_get(b_props, "audio.mix"))
            mix_end = mlt_properties_get_double(b_props, "audio.mix");
        if (mlt_properties_get_int(b_props, "audio.reverse")) {
            mix_start = 1.0 - mix_start;
            mix_end = 1.0 - mix_end;
        }
        mix_audio(mix_start, mix_end, buffer_a, buffer_b, channels_a, channels_b, *channels, *samples);
    }

    // Copy the audio from the dest buffer into the frame.
    bytes = SAMPLE_BYTES(*samples, *channels);
    *buffer = mlt_pool_alloc(bytes);
    memcpy(*buffer, buffer_a, bytes);
    mlt_frame_set_audio(frame_a, *buffer, *format, bytes, mlt_pool_release);

    if (mlt_properties_get_int(b_props, "_speed") == 0) {
        // Flush the buffer when paused and scrubbing.
        samples_b = self->src_buffer_count;
        samples_a = self->dest_buffer_count;
    } else {
        // It is also not good for A/V sync to let many samples accumulate in
        // the buffer. This part provides a time-based buffer limit.

        // Determine the maximum amount of latency permitted in the buffer.
        int max_latency = CLAMP(*frequency / 1000, 0, MAX_SAMPLES); // samples in 1ms
        // samples_b becomes the new target src buffer count.
        samples_b = CLAMP(self->src_buffer_count - *samples, 0, max_latency);
        // samples_b becomes the number of samples to consume: difference between actual and the target.
        samples_b = self->src_buffer_count - samples_b;
        // samples_a becomes the new target dest buffer count.
        samples_a = CLAMP(self->dest_buffer_count - *samples, 0, max_latency);
        // samples_a becomes the number of samples to consume: difference between actual and the target.
        samples_a = self->dest_buffer_count - samples_a;
    }

    // Consume the src buffer.
    self->src_buffer_count -= samples_b;
    if (self->src_buffer_count) {
        memmove(self->src_buffer,
                &self->src_buffer[samples_b * channels_b],
                SAMPLE_BYTES(self->src_buffer_count, channels_b));
    }
    // Consume the dest buffer.
    self->dest_buffer_count -= samples_a;
    if (self->dest_buffer_count > 0) {
        memmove(self->dest_buffer,
                &self->dest_buffer[samples_a * channels_a],
                SAMPLE_BYTES(self->dest_buffer_count, channels_a));
    }

    return error;
}

/** Mix transition processing.
*/

static mlt_frame transition_process(mlt_transition transition, mlt_frame a_frame, mlt_frame b_frame)
{
    mlt_properties properties = MLT_TRANSITION_PROPERTIES(transition);
    mlt_properties b_props = MLT_FRAME_PROPERTIES(b_frame);

    // Only if mix is specified, otherwise a producer may set the mix
    if (mlt_properties_get(properties, "start")) {
        // Determine the time position of this frame in the transition duration
        mlt_properties props = mlt_properties_get_data(MLT_FRAME_PROPERTIES(b_frame),
                                                       "_producer",
                                                       NULL);
        mlt_position in = mlt_properties_get_int(props, "in");
        mlt_position out = mlt_properties_get_int(props, "out");
        int length = mlt_properties_get_int(properties, "length");
        mlt_position time = mlt_properties_get_int(props, "_frame");
        double mix = mlt_transition_get_progress(transition, b_frame);
        if (mlt_properties_get_int(properties, "always_active"))
            mix = (double) (time - in) / (double) (out - in + 1);

        // TODO: Check the logic here - shouldn't we be computing current and next mixing levels in all cases?
        if (length == 0) {
            // If there is an end mix level adjust mix to the range
            if (mlt_properties_get(properties, "end")) {
                double start = mlt_properties_get_double(properties, "start");
                double end = mlt_properties_get_double(properties, "end");
                mix = start + (end - start) * mix;
            }
            // A negative means total crossfade (uses position)
            else if (mlt_properties_get_double(properties, "start") >= 0) {
                // Otherwise, start/constructor is a constant mix level
                mix = mlt_properties_get_double(properties, "start");
            }

            // Finally, set the mix property on the frame
            mlt_properties_set_double(b_props, "audio.mix", mix);

            // Initialise transition previous mix value to prevent an inadvertent jump from 0
            mlt_position last_position = mlt_properties_get_position(properties, "_last_position");
            mlt_position current_position = mlt_frame_get_position(b_frame);
            mlt_properties_set_position(properties, "_last_position", current_position);
            if (!mlt_properties_get(properties, "_previous_mix")
                || current_position != last_position + 1)
                mlt_properties_set_double(properties, "_previous_mix", mix);

            // Tell b frame what the previous mix level was
            mlt_properties_set_double(b_props,
                                      "audio.previous_mix",
                                      mlt_properties_get_double(properties, "_previous_mix"));

            // Save the current mix level for the next iteration
            mlt_properties_set_double(properties,
                                      "_previous_mix",
                                      mlt_properties_get_double(b_props, "audio.mix"));

            mlt_properties_set_double(b_props,
                                      "audio.reverse",
                                      mlt_properties_get_double(properties, "reverse"));
        } else {
            double level = mlt_properties_get_double(properties, "start");
            double mix_start = level;
            double mix_end = mix_start;
            double mix_increment = 1.0 / length;
            if (time - in < length) {
                mix_start = mix_start * ((double) (time - in) / length);
                mix_end = mix_start + mix_increment;
            } else if (time > out - length) {
                mix_end = mix_start * ((double) (out - time - in) / length);
                mix_start = mix_end - mix_increment;
            }

            mix_start = mix_start < 0 ? 0 : mix_start > level ? level : mix_start;
            mix_end = mix_end < 0 ? 0 : mix_end > level ? level : mix_end;
            mlt_properties_set_double(b_props, "audio.previous_mix", mix_start);
            mlt_properties_set_double(b_props, "audio.mix", mix_end);
        }
    }

    // Override the get_audio method
    mlt_frame_push_audio(a_frame, transition);
    mlt_frame_push_audio(a_frame, b_frame);
    mlt_frame_push_audio(a_frame, transition_get_audio);

    // Ensure transition_get_audio is called if test_audio=1.
    if (mlt_properties_get_int(properties, "accepts_blanks"))
        mlt_properties_set_int(MLT_FRAME_PROPERTIES(a_frame), "test_audio", 0);

    return a_frame;
}

static void transition_close(mlt_transition transition)
{
    free(transition->child);
    transition->close = NULL;
    mlt_transition_close(transition);
}

/** Constructor for the transition.
*/

mlt_transition transition_mix_init(mlt_profile profile,
                                   mlt_service_type type,
                                   const char *id,
                                   char *arg)
{
    transition_mix mix = calloc(1, sizeof(struct transition_mix_s));
    mlt_transition transition = calloc(1, sizeof(struct mlt_transition_s));
    if (mix && transition && !mlt_transition_init(transition, mix)) {
        mix->parent = transition;
        transition->close = transition_close;
        transition->process = transition_process;
        if (arg) {
            mlt_properties_set_double(MLT_TRANSITION_PROPERTIES(transition), "start", atof(arg));
            if (atof(arg) < 0)
                mlt_properties_set_int(MLT_TRANSITION_PROPERTIES(transition), "accepts_blanks", 1);
        }
        // Inform apps and framework that this is an audio only transition
        mlt_properties_set_int(MLT_TRANSITION_PROPERTIES(transition), "_transition_type", 2);
    } else {
        if (transition)
            mlt_transition_close(transition);
        if (mix)
            free(mix);
    }
    return transition;
}