@@ -172,6 +172,7 @@ def __init__(
         self._content_being_received: Optional[CurrentContent] = None
         self._assistant_is_responding = False
         self._ready_to_send_context = False
+        self._handling_bot_stopped_speaking = False
         self._triggering_assistant_response = False
         self._assistant_response_trigger_audio: Optional[bytes] = (
             None  # Not cleared on _disconnect()
@@ -205,7 +206,7 @@ async def cancel(self, frame: CancelFrame):
 
     async def reset_conversation(self):
         logger.debug("Resetting conversation")
-        await self._handle_bot_stopped_speaking()
+        await self._handle_bot_stopped_speaking(delay_to_catch_trailing_assistant_text=False)
 
         # Carry over previous context through disconnect
         context = self._context
@@ -226,7 +227,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
         elif isinstance(frame, InputAudioRawFrame):
             await self._handle_input_audio_frame(frame)
         elif isinstance(frame, BotStoppedSpeakingFrame):
-            await self._handle_bot_stopped_speaking()
+            await self._handle_bot_stopped_speaking(delay_to_catch_trailing_assistant_text=True)
         elif isinstance(frame, AWSNovaSonicFunctionCallResultFrame):
             await self._handle_function_call_result(frame)
 
@@ -248,25 +249,45 @@ async def _handle_input_audio_frame(self, frame: InputAudioRawFrame):
 
         await self._send_user_audio_event(frame.audio)
 
-    async def _handle_bot_stopped_speaking(self):
-        if self._assistant_is_responding:
-            # Consider the assistant finished with their response (after a short delay, to allow for
-            # any FINAL text block to come in).
-            #
-            # TODO: ideally we could base this solely on the LLM output events, but I couldn't
-            # figure out a reliable way to determine when we've gotten our last FINAL text block
-            # after the LLM is done talking.
-            #
-            # First I looked at stopReason, but it doesn't seem like the last FINAL text block is
-            # reliably marked END_TURN (sometimes the *first* one is, but not the last...bug?)
-            #
-            # Then I considered schemes where we tally or match up SPECULATIVE text blocks with
-            # FINAL text blocks to know how many or which FINAL blocks to expect, but user
-            # interruptions throw a wrench in these schemes: depending on the exact timing of the
-            # interruption, we should or shouldn't expect some FINAL blocks.
-            await asyncio.sleep(0.25)
-            self._assistant_is_responding = False
-            await self._report_assistant_response_ended()
+    async def _handle_bot_stopped_speaking(self, delay_to_catch_trailing_assistant_text: bool):
+        # Protect against back-to-back BotStoppedSpeaking calls, which I've observed
+        if self._handling_bot_stopped_speaking:
+            return
+        self._handling_bot_stopped_speaking = True
+
+        async def finalize_assistant_response():
+            if self._assistant_is_responding:
+                # Consider the assistant finished with their response (possibly after a short delay,
+                # to allow for any trailing FINAL assistant text block to come in that needs to make
+                # it into context).
+                #
+                # TODO: ideally we could base this solely on the LLM output events, but I couldn't
+                # figure out a reliable way to determine when we've gotten our last FINAL text block
+                # after the LLM is done talking.
+                #
+                # First I looked at stopReason, but it doesn't seem like the last FINAL text block
+                # is reliably marked END_TURN (sometimes the *first* one is, but not the last...
+                # bug?)
+                #
+                # Then I considered schemes where we tally or match up SPECULATIVE text blocks with
+                # FINAL text blocks to know how many or which FINAL blocks to expect, but user
+                # interruptions throw a wrench in these schemes: depending on the exact timing of
+                # the interruption, we should or shouldn't expect some FINAL blocks.
+                if delay_to_catch_trailing_assistant_text:
+                    # This delay length is a balancing act between "catching" trailing assistant
+                    # text that is quite delayed but not waiting so long that user text comes in
+                    # first and results in a bit of context message order scrambling.
+                    await asyncio.sleep(1.25)
+                self._assistant_is_responding = False
+                await self._report_assistant_response_ended()
+
+            self._handling_bot_stopped_speaking = False
+
+        # Finalize the assistant response, either now or after a delay
+        if delay_to_catch_trailing_assistant_text:
+            self.create_task(finalize_assistant_response())
+        else:
+            await finalize_assistant_response()
 
     async def _handle_function_call_result(self, frame: AWSNovaSonicFunctionCallResultFrame):
         result = frame.result_frame
@@ -391,6 +412,7 @@ async def _disconnect(self):
         self._content_being_received = None
         self._assistant_is_responding = False
         self._ready_to_send_context = False
+        self._handling_bot_stopped_speaking = False
         self._triggering_assistant_response = False
         self._disconnecting = False
         self._connected_time = None
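
For readers skimming the diff: the change amounts to a debounce-plus-delayed-finalize pattern around BotStoppedSpeakingFrame handling, with the delay skipped on the reset path. Below is a minimal standalone sketch of that pattern, not the PR's actual code; the Responder class, its attribute names, and the print stand-in for _report_assistant_response_ended() are illustrative assumptions.

import asyncio


class Responder:
    def __init__(self):
        self.assistant_is_responding = True
        self._handling_stop = False

    async def handle_bot_stopped_speaking(self, delay_to_catch_trailing_text: bool):
        # Debounce: drop back-to-back events while a previous one is still being handled.
        if self._handling_stop:
            return
        self._handling_stop = True

        async def finalize():
            if self.assistant_is_responding:
                if delay_to_catch_trailing_text:
                    # Wait briefly so a late FINAL text block can still be recorded
                    # before the response is marked as ended.
                    await asyncio.sleep(1.25)
                self.assistant_is_responding = False
                print("assistant response ended")
            self._handling_stop = False

        if delay_to_catch_trailing_text:
            # Fire-and-forget so the frame pipeline isn't blocked by the delay.
            asyncio.create_task(finalize())
        else:
            # Reset paths (like reset_conversation in the PR) finalize immediately.
            await finalize()


async def main():
    responder = Responder()
    await responder.handle_bot_stopped_speaking(delay_to_catch_trailing_text=True)
    # A duplicate event arriving right away is ignored thanks to the debounce flag.
    await responder.handle_bot_stopped_speaking(delay_to_catch_trailing_text=True)
    await asyncio.sleep(1.5)  # give the delayed finalization time to run


asyncio.run(main())

The fire-and-forget task mirrors the PR's create_task() call: the pipeline keeps processing frames while the finalization waits out the delay, whereas the reset path awaits it so the conversation state is settled before reconnecting.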