
Commit fb5438e

Merge pull request #1770 from pipecat-ai/pk/amazon-nova-sonic-interruption-reliability
AWS Nova Sonic service - make interruption handling more reliable, in…
2 parents 7da9f66 + 84d040c commit fb5438e

File tree

1 file changed: +43, -21 lines
  • src/pipecat/services/aws_nova_sonic


src/pipecat/services/aws_nova_sonic/aws.py

Lines changed: 43 additions & 21 deletions
@@ -172,6 +172,7 @@ def __init__(
         self._content_being_received: Optional[CurrentContent] = None
         self._assistant_is_responding = False
         self._ready_to_send_context = False
+        self._handling_bot_stopped_speaking = False
         self._triggering_assistant_response = False
         self._assistant_response_trigger_audio: Optional[bytes] = (
             None  # Not cleared on _disconnect()
@@ -205,7 +206,7 @@ async def cancel(self, frame: CancelFrame):
 
     async def reset_conversation(self):
         logger.debug("Resetting conversation")
-        await self._handle_bot_stopped_speaking()
+        await self._handle_bot_stopped_speaking(delay_to_catch_trailing_assistant_text=False)
 
         # Carry over previous context through disconnect
         context = self._context
@@ -226,7 +227,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
         elif isinstance(frame, InputAudioRawFrame):
             await self._handle_input_audio_frame(frame)
         elif isinstance(frame, BotStoppedSpeakingFrame):
-            await self._handle_bot_stopped_speaking()
+            await self._handle_bot_stopped_speaking(delay_to_catch_trailing_assistant_text=True)
         elif isinstance(frame, AWSNovaSonicFunctionCallResultFrame):
             await self._handle_function_call_result(frame)
 
@@ -248,25 +249,45 @@ async def _handle_input_audio_frame(self, frame: InputAudioRawFrame):
 
         await self._send_user_audio_event(frame.audio)
 
-    async def _handle_bot_stopped_speaking(self):
-        if self._assistant_is_responding:
-            # Consider the assistant finished with their response (after a short delay, to allow for
-            # any FINAL text block to come in).
-            #
-            # TODO: ideally we could base this solely on the LLM output events, but I couldn't
-            # figure out a reliable way to determine when we've gotten our last FINAL text block
-            # after the LLM is done talking.
-            #
-            # First I looked at stopReason, but it doesn't seem like the last FINAL text block is
-            # reliably marked END_TURN (sometimes the *first* one is, but not the last...bug?)
-            #
-            # Then I considered schemes where we tally or match up SPECULATIVE text blocks with
-            # FINAL text blocks to know how many or which FINAL blocks to expect, but user
-            # interruptions throw a wrench in these schemes: depending on the exact timing of the
-            # interruption, we should or shouldn't expect some FINAL blocks.
-            await asyncio.sleep(0.25)
-            self._assistant_is_responding = False
-            await self._report_assistant_response_ended()
+    async def _handle_bot_stopped_speaking(self, delay_to_catch_trailing_assistant_text: bool):
+        # Protect against back-to-back BotStoppedSpeaking calls, which I've observed
+        if self._handling_bot_stopped_speaking:
+            return
+        self._handling_bot_stopped_speaking = True
+
+        async def finalize_assistant_response():
+            if self._assistant_is_responding:
+                # Consider the assistant finished with their response (possibly after a short delay,
+                # to allow for any trailing FINAL assistant text block to come in that needs to make
+                # it into context).
+                #
+                # TODO: ideally we could base this solely on the LLM output events, but I couldn't
+                # figure out a reliable way to determine when we've gotten our last FINAL text block
+                # after the LLM is done talking.
+                #
+                # First I looked at stopReason, but it doesn't seem like the last FINAL text block
+                # is reliably marked END_TURN (sometimes the *first* one is, but not the last...
+                # bug?)
+                #
+                # Then I considered schemes where we tally or match up SPECULATIVE text blocks with
+                # FINAL text blocks to know how many or which FINAL blocks to expect, but user
+                # interruptions throw a wrench in these schemes: depending on the exact timing of
+                # the interruption, we should or shouldn't expect some FINAL blocks.
+                if delay_to_catch_trailing_assistant_text:
+                    # This delay length is a balancing act between "catching" trailing assistant
+                    # text that is quite delayed but not waiting so long that user text comes in
+                    # first and results in a bit of context message order scrambling.
+                    await asyncio.sleep(1.25)
+                self._assistant_is_responding = False
+                await self._report_assistant_response_ended()
+
+            self._handling_bot_stopped_speaking = False
+
+        # Finalize the assistant response, either now or after a delay
+        if delay_to_catch_trailing_assistant_text:
+            self.create_task(finalize_assistant_response())
+        else:
+            await finalize_assistant_response()
 
     async def _handle_function_call_result(self, frame: AWSNovaSonicFunctionCallResultFrame):
         result = frame.result_frame
@@ -391,6 +412,7 @@ async def _disconnect(self):
         self._content_being_received = None
         self._assistant_is_responding = False
         self._ready_to_send_context = False
+        self._handling_bot_stopped_speaking = False
         self._triggering_assistant_response = False
         self._disconnecting = False
         self._connected_time = None
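
For readers skimming the diff, here is a minimal, standalone sketch of the pattern the new _handle_bot_stopped_speaking follows: a guard flag deduplicates back-to-back BotStoppedSpeaking events, and finalization runs either immediately or on a background task after a delay, so a trailing FINAL text block can still land in context. The ResponseFinalizer class, its method names, and the print stand-in for _report_assistant_response_ended are illustrative only, not part of the pipecat API.

import asyncio


class ResponseFinalizer:
    """Illustrative stand-in for the service's bot-stopped-speaking handling."""

    def __init__(self):
        self._assistant_is_responding = True
        self._handling_bot_stopped_speaking = False

    async def handle_bot_stopped_speaking(self, delay_to_catch_trailing_assistant_text: bool):
        # Guard: ignore back-to-back BotStoppedSpeaking events.
        if self._handling_bot_stopped_speaking:
            return
        self._handling_bot_stopped_speaking = True

        async def finalize_assistant_response():
            if self._assistant_is_responding:
                if delay_to_catch_trailing_assistant_text:
                    # Brief wait so a trailing FINAL text block can still make it into context.
                    await asyncio.sleep(1.25)
                self._assistant_is_responding = False
                print("assistant response ended")  # stands in for _report_assistant_response_ended()
            self._handling_bot_stopped_speaking = False

        if delay_to_catch_trailing_assistant_text:
            # Finalize in the background so the frame pipeline isn't blocked for the delay.
            asyncio.create_task(finalize_assistant_response())
        else:
            await finalize_assistant_response()


async def main():
    finalizer = ResponseFinalizer()
    # Two events in quick succession: the second is dropped by the guard.
    await finalizer.handle_bot_stopped_speaking(delay_to_catch_trailing_assistant_text=True)
    await finalizer.handle_bot_stopped_speaking(delay_to_catch_trailing_assistant_text=True)
    await asyncio.sleep(1.5)  # give the background finalization time to run


asyncio.run(main())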
