@@ -172,6 +172,7 @@ def __init__(
         self._content_being_received: Optional[CurrentContent] = None
         self._assistant_is_responding = False
         self._ready_to_send_context = False
+        self._handling_bot_stopped_speaking = False
         self._triggering_assistant_response = False
         self._assistant_response_trigger_audio: Optional[bytes] = (
             None  # Not cleared on _disconnect()
@@ -205,7 +206,7 @@ async def cancel(self, frame: CancelFrame):
 
     async def reset_conversation(self):
         logger.debug("Resetting conversation")
-        await self._handle_bot_stopped_speaking()
+        await self._handle_bot_stopped_speaking(delay_to_catch_trailing_assistant_text=False)
 
         # Carry over previous context through disconnect
         context = self._context
@@ -226,7 +227,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
         elif isinstance(frame, InputAudioRawFrame):
             await self._handle_input_audio_frame(frame)
         elif isinstance(frame, BotStoppedSpeakingFrame):
-            await self._handle_bot_stopped_speaking()
+            await self._handle_bot_stopped_speaking(delay_to_catch_trailing_assistant_text=True)
         elif isinstance(frame, AWSNovaSonicFunctionCallResultFrame):
             await self._handle_function_call_result(frame)
 
@@ -248,25 +249,45 @@ async def _handle_input_audio_frame(self, frame: InputAudioRawFrame):
 
         await self._send_user_audio_event(frame.audio)
 
-    async def _handle_bot_stopped_speaking(self):
-        if self._assistant_is_responding:
-            # Consider the assistant finished with their response (after a short delay, to allow for
-            # any FINAL text block to come in).
-            #
-            # TODO: ideally we could base this solely on the LLM output events, but I couldn't
-            # figure out a reliable way to determine when we've gotten our last FINAL text block
-            # after the LLM is done talking.
-            #
-            # First I looked at stopReason, but it doesn't seem like the last FINAL text block is
-            # reliably marked END_TURN (sometimes the *first* one is, but not the last...bug?)
-            #
-            # Then I considered schemes where we tally or match up SPECULATIVE text blocks with
-            # FINAL text blocks to know how many or which FINAL blocks to expect, but user
-            # interruptions throw a wrench in these schemes: depending on the exact timing of the
-            # interruption, we should or shouldn't expect some FINAL blocks.
-            await asyncio.sleep(0.25)
-            self._assistant_is_responding = False
-            await self._report_assistant_response_ended()
+    async def _handle_bot_stopped_speaking(self, delay_to_catch_trailing_assistant_text: bool):
+        # Protect against back-to-back BotStoppedSpeaking calls, which I've observed
+        if self._handling_bot_stopped_speaking:
+            return
+        self._handling_bot_stopped_speaking = True
+
+        async def finalize_assistant_response():
+            if self._assistant_is_responding:
+                # Consider the assistant finished with their response (possibly after a short delay,
+                # to allow for any trailing FINAL assistant text block to come in that needs to make
+                # it into context).
+                #
+                # TODO: ideally we could base this solely on the LLM output events, but I couldn't
+                # figure out a reliable way to determine when we've gotten our last FINAL text block
+                # after the LLM is done talking.
+                #
+                # First I looked at stopReason, but it doesn't seem like the last FINAL text block
+                # is reliably marked END_TURN (sometimes the *first* one is, but not the last...
+                # bug?)
+                #
+                # Then I considered schemes where we tally or match up SPECULATIVE text blocks with
+                # FINAL text blocks to know how many or which FINAL blocks to expect, but user
+                # interruptions throw a wrench in these schemes: depending on the exact timing of
+                # the interruption, we should or shouldn't expect some FINAL blocks.
+                if delay_to_catch_trailing_assistant_text:
+                    # This delay length is a balancing act between "catching" trailing assistant
+                    # text that is quite delayed but not waiting so long that user text comes in
+                    # first and results in a bit of context message order scrambling.
+                    await asyncio.sleep(1.25)
+                self._assistant_is_responding = False
+                await self._report_assistant_response_ended()
+
+            self._handling_bot_stopped_speaking = False
+
+        # Finalize the assistant response, either now or after a delay
+        if delay_to_catch_trailing_assistant_text:
+            self.create_task(finalize_assistant_response())
+        else:
+            await finalize_assistant_response()
 
     async def _handle_function_call_result(self, frame: AWSNovaSonicFunctionCallResultFrame):
         result = frame.result_frame
@@ -391,6 +412,7 @@ async def _disconnect(self):
         self._content_being_received = None
         self._assistant_is_responding = False
         self._ready_to_send_context = False
+        self._handling_bot_stopped_speaking = False
         self._triggering_assistant_response = False
         self._disconnecting = False
         self._connected_time = None
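
For readers skimming the diff: the change amounts to a debounce-plus-delayed-finalize pattern around BotStoppedSpeakingFrame handling, with the delay skipped on the reset path. Below is a minimal standalone sketch of that pattern, not the PR's actual code; the Responder class, its attribute names, and the print stand-in for _report_assistant_response_ended() are illustrative assumptions.

import asyncio


class Responder:
    def __init__(self):
        self.assistant_is_responding = True
        self._handling_stop = False

    async def handle_bot_stopped_speaking(self, delay_to_catch_trailing_text: bool):
        # Debounce: drop back-to-back events while a previous one is still being handled.
        if self._handling_stop:
            return
        self._handling_stop = True

        async def finalize():
            if self.assistant_is_responding:
                if delay_to_catch_trailing_text:
                    # Wait briefly so a late FINAL text block can still be recorded
                    # before the response is marked as ended.
                    await asyncio.sleep(1.25)
                self.assistant_is_responding = False
                print("assistant response ended")
            self._handling_stop = False

        if delay_to_catch_trailing_text:
            # Fire-and-forget so the frame pipeline isn't blocked by the delay.
            asyncio.create_task(finalize())
        else:
            # Reset paths (like reset_conversation in the PR) finalize immediately.
            await finalize()


async def main():
    responder = Responder()
    await responder.handle_bot_stopped_speaking(delay_to_catch_trailing_text=True)
    # A duplicate event arriving right away is ignored thanks to the debounce flag.
    await responder.handle_bot_stopped_speaking(delay_to_catch_trailing_text=True)
    await asyncio.sleep(1.5)  # give the delayed finalization time to run


asyncio.run(main())

The fire-and-forget task mirrors the PR's create_task() call: the pipeline keeps processing frames while the finalization waits out the delay, whereas the reset path awaits it so the conversation state is settled before reconnecting.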