From d63c1776b5c72ce5a60cd93b25abbea50b676f58 Mon Sep 17 00:00:00 2001 From: "j.foucher" Date: Fri, 20 Feb 2026 18:39:34 +0100 Subject: [PATCH] last commit before new session --- Unreal/PS_AI_Agent/Bug en suspend.txt | 359 ++++++++++++++++++ ...ElevenLabsConversationalAgentComponent.cpp | 191 ++++++++-- .../Private/ElevenLabsWebSocketProxy.cpp | 34 ++ .../ElevenLabsConversationalAgentComponent.h | 63 ++- 4 files changed, 621 insertions(+), 26 deletions(-) create mode 100644 Unreal/PS_AI_Agent/Bug en suspend.txt diff --git a/Unreal/PS_AI_Agent/Bug en suspend.txt b/Unreal/PS_AI_Agent/Bug en suspend.txt new file mode 100644 index 0000000..358e117 --- /dev/null +++ b/Unreal/PS_AI_Agent/Bug en suspend.txt @@ -0,0 +1,359 @@ +seem to less stuck but timeout often : LogDebuggerCommands: Repeating last play command: Selected Viewport +LogPlayLevel: PlayLevel: No blueprints needed recompiling +LogPlayLevel: Creating play world package: /Game/UEDPIE_0_TestMap +LogPlayLevel: PIE: StaticDuplicateObject took: (0.005478s) +LogPlayLevel: PIE: Created PIE world by copying editor world from /Game/TestMap.TestMap to /Game/UEDPIE_0_TestMap.TestMap (0.005520s) +LogUObjectHash: Compacting FUObjectHashTables data took 0.64ms +LogChaosDD: Creating Chaos Debug Draw Scene for world TestMap +LogPlayLevel: PIE: World Init took: (0.001821s) +LogAudio: Display: Creating Audio Device: Id: 4, Scope: Unique, Realtime: True +LogAudioMixer: Display: Audio Mixer Platform Settings: +LogAudioMixer: Display: Sample Rate: 48000 +LogAudioMixer: Display: Callback Buffer Frame Size Requested: 1024 +LogAudioMixer: Display: Callback Buffer Frame Size To Use: 1024 +LogAudioMixer: Display: Number of buffers to queue: 1 +LogAudioMixer: Display: Max Channels (voices): 32 +LogAudioMixer: Display: Number of Async Source Workers: 4 +LogAudio: Display: AudioDevice MaxSources: 32 +LogAudio: Display: Audio Spatialization Plugin: None (built-in). +LogAudio: Display: Audio Reverb Plugin: None (built-in). 
+LogAudio: Display: Audio Occlusion Plugin: None (built-in). +LogAudioMixer: Display: Initializing audio mixer using platform API: 'XAudio2' +LogAudioMixer: Display: Using Audio Hardware Device Speakers (Realtek(R) Audio) +LogAudioMixer: Display: Initializing Sound Submixes... +LogAudioMixer: Display: Creating Master Submix 'MasterSubmixDefault' +LogAudioMixer: Display: Creating Master Submix 'MasterReverbSubmixDefault' +LogAudioMixer: FMixerPlatformXAudio2::StartAudioStream() called. InstanceID=4 +LogAudioMixer: Display: Output buffers initialized: Frames=1024, Channels=2, Samples=2048, InstanceID=4 +LogAudioMixer: Display: Starting AudioMixerPlatformInterface::RunInternal(), InstanceID=4 +LogAudioMixer: Display: FMixerPlatformXAudio2::SubmitBuffer() called for the first time. InstanceID=4 +LogInit: FAudioDevice initialized with ID 4. +LogAudio: Display: Audio Device (ID: 4) registered with world 'TestMap'. +LogAudioMixer: Initializing Audio Bus Subsystem for audio device with ID 4 +LogLoad: Game class is 'GameModeBase' +LogWorld: Bringing World /Game/UEDPIE_0_TestMap.TestMap up for play (max tick rate 60) at 2026.02.20-17.38.08 +LogWorld: Bringing up level for play took: 0.000919 +LogOnline: OSS: Created online subsystem instance for: :Context_3 +LogElevenLabsWS: Connecting to ElevenLabs: wss://api.elevenlabs.io/v1/convai/conversation?agent_id=agent_5301kc1qkq49fn2av43nrbsar65k +PIE: Server logged in +PIE: Play in editor total start time 0,075 seconds. +LogElevenLabsWS: WebSocket connected. Sending conversation_initiation_client_data... +LogElevenLabsWS: Sending initiation: { +"type": "conversation_initiation_client_data", +"conversation_config_override": +{ +"agent": +{ +"turn": +{ +"turn_timeout": 1 +} +}, +"tts": +{ +"optimize_streaming_latency": 3 +} +}, +"custom_llm_extra_body": +{ +"enable_intermediate_response": true +} +} +LogElevenLabsWS: Received message type: conversation_initiation_metadata +LogElevenLabsWS: [T+0.00s] Conversation initiated. 
ID=conv_2901khxymx7xed88dq8tw35b2pfk +LogElevenLabsAgent: [T+0.00s] Agent connected. ConversationID=conv_2901khxymx7xed88dq8tw35b2pfk +LogBlueprintUserMessages: [test_AI_Actor_C_2] Connected +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Start Listening +LogElevenLabsWS: [T+2.18s] User turn started — mic open, audio chunks will follow. +LogAudioCaptureCore: Display: WasapiCapture AudioFormat SampeRate: 48000, BitDepth: 32-Bit Floating Point +LogElevenLabsMic: Capture device: Microphone Array (Intel® Smart Sound Technology for Digital Microphones) | Rate=48000 | Channels=2 +LogElevenLabsMic: Audio capture started. +LogElevenLabsAgent: [T+2.18s] [Turn 1] Mic opened — user speaking. +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Stop Listening +LogElevenLabsMic: Audio capture stopped. +LogElevenLabsWS: [T+4.60s] User turn ended — server VAD silence detection started (turn_timeout=1s). +LogElevenLabsAgent: [T+4.60s] [Turn 1] Mic closed — user spoke 2.42s. Waiting for server response (timeout 10s)... +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: [T+5.66s] Agent started generating (1061 ms after turn end — includes VAD silence timeout + LLM start). +LogElevenLabsAgent: [T+5.66s] [Turn 1] Agent generating. (1.06s after turn end) +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Agent Start Generating +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: audio +LogElevenLabsWS: Warning: [T+7.95s] [LATENCY] First audio: 3346 ms after turn end (3346 ms after last chunk) +LogElevenLabsAgent: [T+7.95s] [Turn 1] Agent speaking — first audio chunk. 
(3.35s after turn end) +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Start Speaking +LogElevenLabsWS: Received message type: user_transcript +LogElevenLabsWS: Warning: [T+7.95s] [LATENCY] User transcript: 3346 ms after turn end +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Transcript : Hello, how are you? +LogElevenLabsWS: Received message type: audio +LogElevenLabsWS: Received message type: agent_response +LogElevenLabsWS: Warning: [T+9.60s] [LATENCY] Agent text response: 4994 ms after turn end +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Text Response : Hello! I'm doing well, thank you for asking. How can I assist you today? +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Stop Speaking +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Start Listening +LogElevenLabsWS: [T+18.84s] User turn started — mic open, audio chunks will follow. +LogAudioCaptureCore: Display: WasapiCapture AudioFormat SampeRate: 48000, BitDepth: 32-Bit Floating Point +LogElevenLabsMic: Capture device: Microphone Array (Intel® Smart Sound Technology for Digital Microphones) | Rate=48000 | Channels=2 +LogElevenLabsMic: Audio capture started. +LogElevenLabsAgent: [T+18.84s] [Turn 2] Mic opened — user speaking. +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Stop Listening +LogElevenLabsMic: Audio capture stopped. +LogElevenLabsWS: [T+21.67s] User turn ended — server VAD silence detection started (turn_timeout=1s). +LogElevenLabsAgent: [T+21.67s] [Turn 2] Mic closed — user spoke 2.83s. Waiting for server response (timeout 10s)... +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: [T+22.45s] Agent started generating (777 ms after turn end — includes VAD silence timeout + LLM start). +LogElevenLabsAgent: [T+22.45s] [Turn 2] Agent generating. 
(0.78s after turn end) +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Agent Start Generating +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: audio +LogElevenLabsWS: Warning: [T+23.85s] [LATENCY] First audio: 2180 ms after turn end (2180 ms after last chunk) +LogElevenLabsAgent: [T+23.85s] [Turn 2] Agent speaking — first audio chunk. (2.18s after turn end) +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Start Speaking +LogElevenLabsWS: Received message type: user_transcript +LogElevenLabsWS: Warning: [T+23.90s] [LATENCY] User transcript: 2227 ms after turn end +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Transcript : Nothing special, just talking. +LogElevenLabsAgent: Warning: [Turn 2] Agent silence hard-timeout (2s) without agent_response — declaring agent stopped. +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Stop Speaking +LogElevenLabsWS: Received message type: audio +LogElevenLabsAgent: [T+28.55s] [Turn 2] Agent speaking — first audio chunk. (6.88s after turn end) +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Start Speaking +LogElevenLabsWS: Received message type: agent_response +LogElevenLabsWS: Warning: [T+28.58s] [LATENCY] Agent text response: 6910 ms after turn end +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Text Response : That's perfectly fine! I'm here to chat if you have any questions or just want to talk. What's on your mind? 
+LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Stop Speaking +LogEOSSDK: LogEOS: Updating Product SDK Config, Time: 347.575653 +LogEOSSDK: LogEOS: SDK Config Product Update Request Completed - No Change +LogEOSSDK: LogEOS: ScheduleNextSDKConfigDataUpdate - Time: 347.774902, Update Interval: 340.359497 +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Start Listening +LogElevenLabsWS: [T+41.83s] User turn started — mic open, audio chunks will follow. +LogAudioCaptureCore: Display: WasapiCapture AudioFormat SampeRate: 48000, BitDepth: 32-Bit Floating Point +LogElevenLabsMic: Capture device: Microphone Array (Intel® Smart Sound Technology for Digital Microphones) | Rate=48000 | Channels=2 +LogElevenLabsMic: Audio capture started. +LogElevenLabsAgent: [T+41.83s] [Turn 3] Mic opened — user speaking. +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Stop Listening +LogElevenLabsMic: Audio capture stopped. +LogElevenLabsWS: [T+44.86s] User turn ended — server VAD silence detection started (turn_timeout=1s). +LogElevenLabsAgent: [T+44.86s] [Turn 3] Mic closed — user spoke 3.03s. Waiting for server response (timeout 10s)... +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: [T+45.20s] Agent started generating (343 ms after turn end — includes VAD silence timeout + LLM start). +LogElevenLabsAgent: [T+45.20s] [Turn 3] Agent generating. (0.34s after turn end) +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Agent Start Generating +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: audio +LogElevenLabsWS: Warning: [T+48.22s] [LATENCY] First audio: 3361 ms after turn end (3361 ms after last chunk) +LogElevenLabsAgent: [T+48.22s] [Turn 3] Agent speaking — first audio chunk. 
(3.36s after turn end) +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Start Speaking +LogElevenLabsWS: Received message type: user_transcript +LogElevenLabsWS: Warning: [T+48.22s] [LATENCY] User transcript: 3361 ms after turn end +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Transcript : I wanna... Hi agent! +LogElevenLabsWS: Received message type: agent_response +LogElevenLabsWS: Warning: [T+48.25s] [LATENCY] Agent text response: 3393 ms after turn end +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Text Response : Hello! It's good to hear from you. How can I help you today? +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Stop Speaking +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Start Listening +LogElevenLabsWS: [T+57.55s] User turn started — mic open, audio chunks will follow. +LogAudioCaptureCore: Display: WasapiCapture AudioFormat SampeRate: 48000, BitDepth: 32-Bit Floating Point +LogElevenLabsMic: Capture device: Microphone Array (Intel® Smart Sound Technology for Digital Microphones) | Rate=48000 | Channels=2 +LogElevenLabsMic: Audio capture started. +LogElevenLabsAgent: [T+57.55s] [Turn 4] Mic opened — user speaking. +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Stop Listening +LogElevenLabsMic: Audio capture stopped. +LogElevenLabsWS: [T+60.33s] User turn ended — server VAD silence detection started (turn_timeout=1s). +LogElevenLabsAgent: [T+60.33s] [Turn 4] Mic closed — user spoke 2.78s. Waiting for server response (timeout 10s)... +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: [T+60.39s] Agent started generating (61 ms after turn end — includes VAD silence timeout + LLM start). +LogElevenLabsAgent: [T+60.39s] [Turn 4] Agent generating. 
(0.06s after turn end) +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Agent Start Generating +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: audio +LogElevenLabsWS: Warning: [T+62.44s] [LATENCY] First audio: 2111 ms after turn end (2111 ms after last chunk) +LogElevenLabsAgent: [T+62.44s] [Turn 4] Agent speaking — first audio chunk. (2.11s after turn end) +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Start Speaking +LogElevenLabsWS: Received message type: user_transcript +LogElevenLabsWS: Warning: [T+62.44s] [LATENCY] User transcript: 2112 ms after turn end +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Transcript : Do you speak French? +LogElevenLabsAgent: Warning: [Turn 4] Agent silence hard-timeout (2s) without agent_response — declaring agent stopped. +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Stop Speaking +LogElevenLabsWS: Received message type: audio +LogElevenLabsAgent: [T+66.24s] [Turn 4] Agent speaking — first audio chunk. (5.91s after turn end) +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Start Speaking +LogElevenLabsWS: Received message type: agent_response +LogElevenLabsWS: Warning: [T+66.27s] [LATENCY] Agent text response: 5944 ms after turn end +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Text Response : Yes, I can communicate in French. Would you like to ask me something in French or perhaps practice a bit? +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Stop Speaking +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Start Listening +LogElevenLabsWS: [T+74.14s] User turn started — mic open, audio chunks will follow. 
+LogAudioCaptureCore: Display: WasapiCapture AudioFormat SampeRate: 48000, BitDepth: 32-Bit Floating Point +LogElevenLabsMic: Capture device: Microphone Array (Intel® Smart Sound Technology for Digital Microphones) | Rate=48000 | Channels=2 +LogElevenLabsMic: Audio capture started. +LogElevenLabsAgent: [T+74.14s] [Turn 5] Mic opened — user speaking. +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Stop Listening +LogElevenLabsMic: Audio capture stopped. +LogElevenLabsWS: [T+76.88s] User turn ended — server VAD silence detection started (turn_timeout=1s). +LogElevenLabsAgent: [T+76.88s] [Turn 5] Mic closed — user spoke 2.75s. Waiting for server response (timeout 10s)... +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: [T+77.28s] Agent started generating (393 ms after turn end — includes VAD silence timeout + LLM start). +LogElevenLabsAgent: [T+77.28s] [Turn 5] Agent generating. (0.39s after turn end) +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Agent Start Generating +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: audio +LogElevenLabsWS: Warning: [T+78.31s] [LATENCY] First audio: 1428 ms after turn end (1428 ms after last chunk) +LogElevenLabsAgent: [T+78.31s] [Turn 5] Agent speaking — first audio chunk. (1.43s after turn end) +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Start Speaking +LogElevenLabsWS: Received message type: user_transcript +LogElevenLabsWS: Warning: [T+78.34s] [LATENCY] User transcript: 1460 ms after turn end +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Transcript : Yes, tell me something in French. 
+LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: audio +LogElevenLabsWS: Received message type: audio +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Start Listening +LogElevenLabsAgent: StartListening: interrupting agent (speaking) to allow user to speak. +LogElevenLabsWS: Sending interrupt — ignoring incoming content until server acks. +LogElevenLabsAgent: [T+84.24s] [Turn 5] Agent stopped speaking (spoke 5.93s, full turn round-trip 7.36s). +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Stop Speaking +LogElevenLabsWS: [T+84.24s] User turn started — mic open, audio chunks will follow. +LogAudioCaptureCore: Display: WasapiCapture AudioFormat SampeRate: 48000, BitDepth: 32-Bit Floating Point +LogElevenLabsMic: Capture device: Microphone Array (Intel® Smart Sound Technology for Digital Microphones) | Rate=48000 | Channels=2 +LogElevenLabsMic: Audio capture started. +LogElevenLabsAgent: [T+84.24s] [Turn 6] Mic opened — user speaking. +LogElevenLabsWS: Received message type: audio +LogElevenLabsWS: Received message type: agent_response +LogElevenLabsWS: Warning: [T+84.43s] [LATENCY] Agent text response: 7549 ms after turn end +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Text Response : Bien sûr! Je peux vous dire: "Bonjour, comment allez-vous aujourd'hui?" This means "Hello, how are you today?" +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: agent_chat_response_part +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Stop Listening +LogElevenLabsMic: Audio capture stopped. +LogElevenLabsWS: [T+88.56s] User turn ended — server VAD silence detection started (turn_timeout=1s). 
+LogElevenLabsAgent: [T+88.56s] [Turn 6] Mic closed — user spoke 4.32s. Waiting for server response (timeout 10s)... +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: audio +LogElevenLabsWS: Warning: [T+92.37s] [LATENCY] First audio: 3811 ms after turn end (3811 ms after last chunk) +LogElevenLabsWS: Received message type: user_transcript +LogElevenLabsWS: Warning: [T+92.37s] [LATENCY] User transcript: 3811 ms after turn end +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Transcript : Mais c'est super! Tu parles très bien! +LogElevenLabsWS: Received message type: audio +LogElevenLabsWS: Received message type: agent_response +LogElevenLabsWS: Warning: [T+97.23s] [LATENCY] Agent text response: 8677 ms after turn end +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Text Response : Merci beaucoup! Je suis ravie que vous le pensiez. Y a-t-il autre chose que vous aimeriez savoir ou discuter en français? +LogElevenLabsAgent: Warning: [T+98.56s] [Turn 6] Response timeout — server did not start generating after 10.0s. Firing OnAgentResponseTimeout. +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Start Listening +LogElevenLabsWS: [T+110.60s] User turn started — mic open, audio chunks will follow. +LogAudioCaptureCore: Display: WasapiCapture AudioFormat SampeRate: 48000, BitDepth: 32-Bit Floating Point +LogElevenLabsMic: Capture device: Microphone Array (Intel® Smart Sound Technology for Digital Microphones) | Rate=48000 | Channels=2 +LogElevenLabsMic: Audio capture started. +LogElevenLabsAgent: [T+110.60s] [Turn 7] Mic opened — user speaking. 
+LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Stop Listening +LogElevenLabsMic: Audio capture stopped. +LogElevenLabsWS: [T+113.82s] User turn ended — server VAD silence detection started (turn_timeout=1s). +LogElevenLabsAgent: [T+113.82s] [Turn 7] Mic closed — user spoke 3.22s. Waiting for server response (timeout 10s)... +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: audio +LogElevenLabsWS: Warning: [T+119.37s] [LATENCY] First audio: 5546 ms after turn end (5546 ms after last chunk) +LogElevenLabsWS: Received message type: user_transcript +LogElevenLabsWS: Warning: [T+119.37s] [LATENCY] User transcript: 5546 ms after turn end +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Transcript : And do you speak German? +LogElevenLabsWS: Received message type: agent_response +LogElevenLabsWS: Warning: [T+119.40s] [LATENCY] Agent text response: 5577 ms after turn end +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Text Response : Yes, I can also communicate in German. Would you like me to say something in German for you? +LogElevenLabsAgent: Warning: [T+123.83s] [Turn 7] Response timeout — server did not start generating after 10.0s. Firing OnAgentResponseTimeout. +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Start Listening +LogElevenLabsWS: [T+131.53s] User turn started — mic open, audio chunks will follow. +LogAudioCaptureCore: Display: WasapiCapture AudioFormat SampeRate: 48000, BitDepth: 32-Bit Floating Point +LogElevenLabsMic: Capture device: Microphone Array (Intel® Smart Sound Technology for Digital Microphones) | Rate=48000 | Channels=2 +LogElevenLabsMic: Audio capture started. 
+LogElevenLabsAgent: [T+131.53s] [Turn 8] Mic opened — user speaking. +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Stop Listening +LogElevenLabsMic: Audio capture stopped. +LogElevenLabsWS: [T+134.42s] User turn ended — server VAD silence detection started (turn_timeout=1s). +LogElevenLabsAgent: [T+134.42s] [Turn 8] Mic closed — user spoke 2.88s. Waiting for server response (timeout 10s)... +LogElevenLabsAgent: Warning: [T+144.42s] [Turn 8] Response timeout — server did not start generating after 10.0s. Firing OnAgentResponseTimeout. +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Start Listening +LogElevenLabsWS: [T+148.56s] User turn started — mic open, audio chunks will follow. +LogAudioCaptureCore: Display: WasapiCapture AudioFormat SampeRate: 48000, BitDepth: 32-Bit Floating Point +LogElevenLabsMic: Capture device: Microphone Array (Intel® Smart Sound Technology for Digital Microphones) | Rate=48000 | Channels=2 +LogElevenLabsMic: Audio capture started. +LogElevenLabsAgent: [T+148.56s] [Turn 9] Mic opened — user speaking. +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Stop Listening +LogElevenLabsMic: Audio capture stopped. +LogElevenLabsWS: [T+150.35s] User turn ended — server VAD silence detection started (turn_timeout=1s). +LogElevenLabsAgent: [T+150.35s] [Turn 9] Mic closed — user spoke 1.79s. Waiting for server response (timeout 10s)... 
+LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: audio +LogElevenLabsWS: Warning: [T+151.56s] [LATENCY] First audio: 1211 ms after turn end (1211 ms after last chunk) +LogElevenLabsWS: Received message type: user_transcript +LogElevenLabsWS: Warning: [T+151.59s] [LATENCY] User transcript: 1245 ms after turn end +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Transcript : Hello. +LogElevenLabsWS: Received message type: audio +LogElevenLabsWS: Received message type: agent_response +LogElevenLabsWS: Warning: [T+152.87s] [LATENCY] Agent text response: 2526 ms after turn end +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Text Response : Hello! How can I help you today? +LogElevenLabsAgent: Warning: [T+160.36s] [Turn 9] Response timeout — server did not start generating after 10.0s. Firing OnAgentResponseTimeout. +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Start Listening +LogElevenLabsWS: [T+162.59s] User turn started — mic open, audio chunks will follow. +LogAudioCaptureCore: Display: WasapiCapture AudioFormat SampeRate: 48000, BitDepth: 32-Bit Floating Point +LogElevenLabsMic: Capture device: Microphone Array (Intel® Smart Sound Technology for Digital Microphones) | Rate=48000 | Channels=2 +LogElevenLabsMic: Audio capture started. +LogElevenLabsAgent: [T+162.59s] [Turn 10] Mic opened — user speaking. +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Stop Listening +LogElevenLabsMic: Audio capture stopped. +LogElevenLabsWS: [T+165.69s] User turn ended — server VAD silence detection started (turn_timeout=1s). +LogElevenLabsAgent: [T+165.69s] [Turn 10] Mic closed — user spoke 3.10s. 
Waiting for server response (timeout 10s)... +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: agent_chat_response_part +LogElevenLabsWS: Received message type: audio +LogElevenLabsWS: Warning: [T+168.47s] [LATENCY] First audio: 2778 ms after turn end (2778 ms after last chunk) +LogElevenLabsWS: Received message type: user_transcript +LogElevenLabsWS: Warning: [T+168.50s] [LATENCY] User transcript: 2810 ms after turn end +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Transcript : Why are you so slow? +LogElevenLabsAgent: Warning: [T+175.70s] [Turn 10] Response timeout — server did not start generating after 10.0s. Firing OnAgentResponseTimeout. +LogElevenLabsWS: Received message type: audio +LogElevenLabsWS: Received message type: agent_response +LogElevenLabsWS: Warning: [T+175.85s] [LATENCY] Agent text response: 10161 ms after turn end +LogBlueprintUserMessages: [test_AI_Actor_C_2] [AI Agent] Text Response : I apologize if my response seemed slow. I'm designed to be as quick as possible while still being helpful. Is there something specific I can do to improve your experience? +LogSlate: Updating window title bar state: overlay mode, drag disabled, window buttons hidden, title bar hidden +LogWorld: BeginTearingDown for /Game/UEDPIE_0_TestMap + +Show less + + +Excellent logs — je vois deux bugs distincts : + +Bug A — bIgnoreIncomingContent jamais effacé : après l'interruption au Turn 5, le serveur n'envoie pas d'interruption_event (ou on ne le reçoit pas). Le flag reste true → tout le contenu du Turn 6, 7, 8, 9, 10 est silencieusement ignoré → timeout systématique. 
Preuve : les [LATENCY] First audio du WebSocketProxy s'affichent (l'audio arrive) mais JAMAIS le Agent speaking — first audio chunk de l'AgentComponent (l'audio est jeté). + +Bug B — HardSilenceTimeoutTicks = 120 (2s) trop court : l'agent répondait en deux batches audio avec 4.7s de gap (Turn 2 : T+23.85s puis T+28.55s). Le timeout de 2s s'est déclenché pendant le gap → faux "Stop Speaking" → Blueprint ouvre le mic → puis le 2e batch arrive → "Start Speaking" à nouveau. + +Fixes : \ No newline at end of file diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp index a948b06..01836b7 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp @@ -42,6 +42,52 @@ void UElevenLabsConversationalAgentComponent::TickComponent(float DeltaTime, ELe { Super::TickComponent(DeltaTime, TickType, ThisTickFunction); + // Response timeout: if the server hasn't started generating within ResponseTimeoutSeconds + // after the user stopped speaking, notify Blueprint so it can react (e.g. show "try again"). + if (bWaitingForAgentResponse && ResponseTimeoutSeconds > 0.0f && TurnEndTime > 0.0) + { + const double WaitTime = FPlatformTime::Seconds() - TurnEndTime; + if (WaitTime >= static_cast<double>(ResponseTimeoutSeconds)) + { + bWaitingForAgentResponse = false; + const double T = FPlatformTime::Seconds() - SessionStartTime; + UE_LOG(LogElevenLabsAgent, Warning, + TEXT("[T+%.2fs] [Turn %d] Response timeout — server did not start generating after %.1fs. 
Firing OnAgentResponseTimeout."), + T, LastClosedTurnIndex, WaitTime); + OnAgentResponseTimeout.Broadcast(); + } + } + + // Generating timeout (ISSUE-1): if the server sent agent_chat_response_part + // (bAgentGenerating=true) but no audio ever arrived (bAgentSpeaking=false), + // force-clear bAgentGenerating after 10s so StartListening() is no longer blocked. + // Normal path: first audio chunk → EnqueueAgentAudio → bAgentGenerating=false. + // This fallback covers the rare case where TTS produces nothing (e.g. empty response). + if (bAgentGenerating && !bAgentSpeaking) + { + if (++GeneratingTickCount >= HardSilenceTimeoutTicks) + { + bAgentGenerating = false; + GeneratingTickCount = 0; + const double T = FPlatformTime::Seconds() - SessionStartTime; + UE_LOG(LogElevenLabsAgent, Warning, + TEXT("[T+%.2fs] [Turn %d] Generating timeout (10s) — server generated but no audio arrived. Clearing bAgentGenerating."), + T, LastClosedTurnIndex); + } + } + else + { + GeneratingTickCount = 0; + } + + // Silence detection. + // ISSUE-8: broadcast OnAgentStoppedSpeaking OUTSIDE AudioQueueLock. + // OnProceduralUnderflow (audio thread) also acquires AudioQueueLock — if we broadcast + // while holding the lock, the audio thread blocks for the full duration of any Blueprint + // handler bound to OnAgentStoppedSpeaking, causing audio glitches / starvation. + // Fix: set a local flag inside the lock, release the lock, then broadcast. + bool bShouldBroadcastStopped = false; + bool bHardTimeoutFired = false; if (bAgentSpeaking) { FScopeLock Lock(&AudioQueueLock); @@ -57,20 +103,17 @@ void UElevenLabsConversationalAgentComponent::TickComponent(float DeltaTime, ELe const bool bResponseConfirmed = bAgentResponseReceived && SilentTickCount >= SilenceThresholdTicks; // Hard-timeout fallback: if agent_response never arrives (or is very late), - // stop after 2s of silence to avoid leaving the state machine stuck. + // stop after 10s of silence to avoid leaving the state machine stuck. 
+ // 10s was chosen to bridge observed inter-batch TTS gaps of up to ~5s. const bool bHardTimeout = SilentTickCount >= HardSilenceTimeoutTicks; if (bResponseConfirmed || bHardTimeout) { - if (bHardTimeout && !bAgentResponseReceived) - { - UE_LOG(LogElevenLabsAgent, Warning, - TEXT("Agent silence hard-timeout (2s) without agent_response — declaring agent stopped.")); - } + bHardTimeoutFired = bHardTimeout && !bAgentResponseReceived; bAgentSpeaking = false; bAgentResponseReceived = false; SilentTickCount = 0; - OnAgentStoppedSpeaking.Broadcast(); + bShouldBroadcastStopped = true; } } else @@ -78,6 +121,17 @@ void UElevenLabsConversationalAgentComponent::TickComponent(float DeltaTime, ELe SilentTickCount = 0; } } + // Broadcast OUTSIDE the lock — Blueprint handlers can execute for arbitrary time. + if (bShouldBroadcastStopped) + { + if (bHardTimeoutFired) + { + UE_LOG(LogElevenLabsAgent, Warning, + TEXT("[Turn %d] Agent silence hard-timeout (10s) without agent_response — declaring agent stopped."), + LastClosedTurnIndex); + } + OnAgentStoppedSpeaking.Broadcast(); + } } // ───────────────────────────────────────────────────────────────────────────── @@ -116,6 +170,10 @@ void UElevenLabsConversationalAgentComponent::StartConversation() void UElevenLabsConversationalAgentComponent::EndConversation() { StopListening(); + // ISSUE-4: StopListening() may set bWaitingForAgentResponse=true (normal turn end path). + // Cancel it immediately — there is no response coming because we are ending the session. + // Without this, TickComponent could fire OnAgentResponseTimeout after EndConversation(). + bWaitingForAgentResponse = false; StopAgentAudio(); if (WebSocketProxy) @@ -161,7 +219,10 @@ void UElevenLabsConversationalAgentComponent::StartListening() return; } } + bWaitingForAgentResponse = false; // New user turn — cancel any pending response timeout. 
+ ++TurnIndex; bIsListening = true; + TurnStartTime = FPlatformTime::Seconds(); if (TurnMode == EElevenLabsTurnMode::Client) { @@ -186,7 +247,8 @@ void UElevenLabsConversationalAgentComponent::StartListening() &UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured); Mic->StartCapture(); - UE_LOG(LogElevenLabsAgent, Log, TEXT("Microphone capture started.")); + const double T = TurnStartTime - SessionStartTime; + UE_LOG(LogElevenLabsAgent, Log, TEXT("[T+%.2fs] [Turn %d] Mic opened — user speaking."), T, TurnIndex); } void UElevenLabsConversationalAgentComponent::StopListening() @@ -215,7 +277,34 @@ void UElevenLabsConversationalAgentComponent::StopListening() WebSocketProxy->SendUserTurnEnd(); } - UE_LOG(LogElevenLabsAgent, Log, TEXT("Microphone capture stopped.")); + TurnEndTime = FPlatformTime::Seconds(); + // Start the response timeout clock — but only when the server hasn't already started + // generating. When StopListening() is called from HandleAgentResponseStarted() as part + // of collision avoidance, bAgentGenerating is already true, meaning the server IS already + // generating; starting a timeout here would be incorrect. + // Also update LastClosedTurnIndex only for normal turn ends (not collision avoidance), + // so server-response log lines always show which user turn they are answering. + if (!bAgentGenerating) + { + bWaitingForAgentResponse = true; + LastClosedTurnIndex = TurnIndex; + } + const double T = TurnEndTime - SessionStartTime; + const double TurnDuration = TurnStartTime > 0.0 ? TurnEndTime - TurnStartTime : 0.0; + if (bWaitingForAgentResponse) + { + UE_LOG(LogElevenLabsAgent, Log, + TEXT("[T+%.2fs] [Turn %d] Mic closed — user spoke %.2fs. Waiting for server response (timeout %.0fs)..."), + T, TurnIndex, TurnDuration, ResponseTimeoutSeconds); + } + else + { + // Collision avoidance: StopListening was called from HandleAgentResponseStarted + // while server was already generating — no need to wait or time out. 
+ UE_LOG(LogElevenLabsAgent, Log, + TEXT("[T+%.2fs] [Turn %d] Mic closed (collision avoidance) — user spoke %.2fs. Server is already generating Turn %d response."), + T, TurnIndex, TurnDuration, LastClosedTurnIndex); + } } void UElevenLabsConversationalAgentComponent::SendTextMessage(const FString& Text) @@ -230,6 +319,7 @@ void UElevenLabsConversationalAgentComponent::SendTextMessage(const FString& Tex void UElevenLabsConversationalAgentComponent::InterruptAgent() { + bWaitingForAgentResponse = false; // Interrupting — no response expected from previous turn. if (WebSocketProxy) WebSocketProxy->SendInterrupt(); StopAgentAudio(); } @@ -253,7 +343,10 @@ const FElevenLabsConversationInfo& UElevenLabsConversationalAgentComponent::GetC // ───────────────────────────────────────────────────────────────────────────── void UElevenLabsConversationalAgentComponent::HandleConnected(const FElevenLabsConversationInfo& Info) { - UE_LOG(LogElevenLabsAgent, Log, TEXT("Agent connected. ConversationID=%s"), *Info.ConversationID); + SessionStartTime = FPlatformTime::Seconds(); + TurnIndex = 0; + LastClosedTurnIndex = 0; + UE_LOG(LogElevenLabsAgent, Log, TEXT("[T+0.00s] Agent connected. ConversationID=%s"), *Info.ConversationID); OnAgentConnected.Broadcast(Info); // In Client turn mode (push-to-talk), the user controls listening manually via @@ -270,10 +363,19 @@ void UElevenLabsConversationalAgentComponent::HandleConnected(const FElevenLabsC void UElevenLabsConversationalAgentComponent::HandleDisconnected(int32 StatusCode, const FString& Reason) { UE_LOG(LogElevenLabsAgent, Log, TEXT("Agent disconnected. Code=%d Reason=%s"), StatusCode, *Reason); + + // ISSUE-13: stop audio playback and clear the queue if the WebSocket drops while the + // agent is speaking. Without this the audio component kept playing buffered PCM after + // disconnect. StopAgentAudio also fires OnAgentStoppedSpeaking if bAgentSpeaking was true, + // giving Blueprint a chance to clean up (e.g. 
show disconnected state). + StopAgentAudio(); + bIsListening = false; - bAgentSpeaking = false; - bAgentGenerating = false; - bAgentResponseReceived = false; + // bAgentSpeaking / bAgentGenerating / bAgentResponseReceived already cleared by StopAgentAudio. + bWaitingForAgentResponse = false; + GeneratingTickCount = 0; + TurnIndex = 0; + LastClosedTurnIndex = 0; MicAccumulationBuffer.Reset(); OnAgentDisconnected.Broadcast(StatusCode, Reason); } @@ -312,6 +414,7 @@ void UElevenLabsConversationalAgentComponent::HandleAgentResponse(const FString& void UElevenLabsConversationalAgentComponent::HandleInterrupted() { + bWaitingForAgentResponse = false; // Interrupted — no response expected from previous turn. StopAgentAudio(); OnAgentInterrupted.Broadcast(); } @@ -322,13 +425,26 @@ void UElevenLabsConversationalAgentComponent::HandleAgentResponseStarted() // Set bAgentGenerating BEFORE StopListening so that any StartListening call // triggered by the Blueprint's OnAgentStartedGenerating handler is blocked. bAgentGenerating = true; + bWaitingForAgentResponse = false; // Server is generating — response timeout cancelled. + const double Now = FPlatformTime::Seconds(); + const double T = Now - SessionStartTime; + const double LatencyFromTurnEnd = TurnEndTime > 0.0 ? Now - TurnEndTime : 0.0; if (bIsListening) { + // Collision: server started generating Turn N's response while Turn M (M>N) mic was open. + // Log both turn indices so the timeline is unambiguous. UE_LOG(LogElevenLabsAgent, Log, - TEXT("Agent started generating while mic was open — stopping listening to avoid turn collision.")); + TEXT("[T+%.2fs] [Turn %d → Turn %d collision] Agent generating Turn %d response — mic (Turn %d) was open, stopping. (%.2fs after turn end)"), + T, LastClosedTurnIndex, TurnIndex, LastClosedTurnIndex, TurnIndex, LatencyFromTurnEnd); StopListening(); } + else + { + UE_LOG(LogElevenLabsAgent, Log, + TEXT("[T+%.2fs] [Turn %d] Agent generating. 
(%.2fs after turn end)"), + T, LastClosedTurnIndex, LatencyFromTurnEnd); + } OnAgentStartedGenerating.Broadcast(); } @@ -383,10 +499,18 @@ void UElevenLabsConversationalAgentComponent::EnqueueAgentAudio(const TArray 0.0 ? AgentSpeakStart - TurnEndTime : 0.0; + UE_LOG(LogElevenLabsAgent, Log, + TEXT("[T+%.2fs] [Turn %d] Agent speaking — first audio chunk. (%.2fs after turn end)"), + T, LastClosedTurnIndex, LatencyFromTurnEnd); + OnAgentStartedSpeaking.Broadcast(); if (AudioPlaybackComponent && !AudioPlaybackComponent->IsPlaying()) @@ -403,16 +527,37 @@ void UElevenLabsConversationalAgentComponent::StopAgentAudio() AudioPlaybackComponent->Stop(); } - FScopeLock Lock(&AudioQueueLock); - AudioQueue.Empty(); - - bAgentGenerating = false; // Always clear — covers interruptions during generation phase. - bAgentResponseReceived = false; // Reset — next response will re-confirm when done. - - if (bAgentSpeaking) + // ISSUE-8: capture the "was speaking" state inside the lock, then broadcast OUTSIDE. + // OnProceduralUnderflow (audio thread) also acquires AudioQueueLock — broadcasting + // while holding it would block the audio thread for the full Blueprint handler duration. + bool bWasSpeaking = false; + double Now = 0.0; { - bAgentSpeaking = false; - SilentTickCount = 0; + FScopeLock Lock(&AudioQueueLock); + AudioQueue.Empty(); + + bAgentGenerating = false; // Always clear — covers interruptions during generation phase. + bAgentResponseReceived = false; // Reset — next response will re-confirm when done. + + if (bAgentSpeaking) + { + bAgentSpeaking = false; + SilentTickCount = 0; + bWasSpeaking = true; + Now = FPlatformTime::Seconds(); + } + } + + // Broadcast outside the lock. + if (bWasSpeaking) + { + const double T = Now - SessionStartTime; + const double AgentSpokeDuration = AgentSpeakStart > 0.0 ? Now - AgentSpeakStart : 0.0; + const double TotalTurnDuration = TurnEndTime > 0.0 ? 
Now - TurnEndTime : 0.0; + UE_LOG(LogElevenLabsAgent, Log, + TEXT("[T+%.2fs] [Turn %d] Agent stopped speaking (spoke %.2fs, full turn round-trip %.2fs)."), + T, LastClosedTurnIndex, AgentSpokeDuration, TotalTurnDuration); + OnAgentStoppedSpeaking.Broadcast(); } } diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp index d8b3f04..23e1a8b 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp @@ -157,6 +157,21 @@ void UElevenLabsWebSocketProxy::SendUserTurnEnd() // If we reset the flag here, the next agent_chat_response_part would re-fire OnAgentResponseStarted // in a loop: part arrives → event → StopListening → SendUserTurnEnd → flag reset → part arrives → loop. // The flag is only reset in SendUserTurnStart() at the beginning of a new user turn. + + // Clear the interrupt-ignore flag if it was never cleared by an "interruption" server ack. + // The ElevenLabs server does not always send the "interruption" acknowledgement reliably. + // By the time the user has spoken a full new turn (seconds of audio), any in-flight content + // from the previously interrupted generation has long since arrived — it is safe to resume + // normal content processing so the server's response to this new turn is not silently discarded. 
+ if (bIgnoreIncomingContent) + { + bIgnoreIncomingContent = false; + const double T = UserTurnEndTime - SessionStartTime; + UE_LOG(LogElevenLabsWS, Log, + TEXT("[T+%.2fs] Cleared interrupt-ignore flag at turn end (server 'interruption' ack was not received — resuming content processing)."), + T); + } + const double T = UserTurnEndTime - SessionStartTime; UE_LOG(LogElevenLabsWS, Log, TEXT("[T+%.2fs] User turn ended — server VAD silence detection started (turn_timeout=1s)."), T); } @@ -554,6 +569,25 @@ void UElevenLabsWebSocketProxy::HandleTranscript(const TSharedPtr& void UElevenLabsWebSocketProxy::HandleAgentResponse(const TSharedPtr& Root) { + // ISSUE-19: discard agent_response that belongs to an interrupted generation. + // A stale agent_response from the cancelled turn would set bAgentResponseReceived=true + // on the component, allowing the silence-detection Tick to fire OnAgentStoppedSpeaking + // at the wrong time (no audio is currently playing for the new turn yet). + if (bIgnoreIncomingContent) + { + UE_LOG(LogElevenLabsWS, Verbose, TEXT("Discarding agent_response (interrupt pending server ack).")); + return; + } + + // ISSUE-22: reset bAgentResponseStartedFired so OnAgentResponseStarted fires again on + // the next turn. In Server VAD mode SendUserTurnStart() is never called — it is the only + // other place that resets this flag — so without this reset, OnAgentResponseStarted fires + // only on the very first turn of the session, and all subsequent turns in Server VAD mode + // never receive it (bAgentGenerating stays false, StartListening is never blocked, etc.). + // agent_response marks the definitive end of the current turn's generation: every + // subsequent agent_chat_response_part is guaranteed to belong to a new turn. + bAgentResponseStartedFired = false; + // { "type": "agent_response", // "agent_response_event": { "agent_response": "..." 
} // } diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h index bee6ec3..83346fd 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h @@ -43,6 +43,15 @@ DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnAgentInterrupted); */ DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnAgentStartedGenerating); +/** + * Fired when the server has not started generating a response within ResponseTimeoutSeconds + * after the user stopped speaking (StopListening was called). + * This typically indicates a server-side latency spike or a dropped turn. + * Blueprint can react by showing a "please try again" prompt or automatically + * re-opening the microphone so the user can repeat their message. + */ +DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnAgentResponseTimeout); + // ───────────────────────────────────────────────────────────────────────────── // UElevenLabsConversationalAgentComponent // @@ -129,6 +138,17 @@ public: UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events") bool bEnableAgentTextResponse = true; + /** + * How many seconds to wait for the server to start generating a response + * after the user stops speaking (StopListening) before firing OnAgentResponseTimeout. + * Set to 0 to disable. Default: 10 seconds. + * + * A typical healthy round-trip is 0.1–0.8s to first agent_chat_response_part. + * Values above 10s are extremely unusual and almost always indicate a server issue. 
+ */ + UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs", meta = (ClampMin = "0.0")) + float ResponseTimeoutSeconds = 10.0f; + // ── Events ──────────────────────────────────────────────────────────────── UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events") @@ -166,6 +186,14 @@ public: UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events") FOnAgentStartedGenerating OnAgentStartedGenerating; + /** + * Fired when the server has not started generating within ResponseTimeoutSeconds + * after StopListening was called. Bind here to give the user feedback such as + * "I didn't get a response, please try again" or to automatically re-open the mic. + */ + UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events") + FOnAgentResponseTimeout OnAgentResponseTimeout; + // ── Control ─────────────────────────────────────────────────────────────── /** @@ -285,6 +313,27 @@ private: // True from the first agent_chat_response_part until the first audio chunk arrives. // Used to block StartListening() while the server is processing the previous turn. bool bAgentGenerating = false; + // True between StopListening() and HandleAgentResponseStarted() (first chat response part). + // Used to fire OnAgentResponseTimeout if the server takes too long to start generating. + bool bWaitingForAgentResponse = false; + + // ── Turn tracking ───────────────────────────────────────────────────────── + // Monotonically increasing counter. Incremented each time StartListening() successfully + // opens the microphone. Appears as [Turn N] in all log lines so every mic session and + // its corresponding server response can be correlated at a glance. + int32 TurnIndex = 0; + // The TurnIndex of the last normal (non-collision-avoidance) StopListening() call. + // Server-response events (generating, speaking, stopped) log with THIS index so you can + // see which user turn the server is answering — even if a newer turn has since opened. 
+ // Example: "[Turn 2] Agent generating" while "[Turn 3] mic open" = late server response. + int32 LastClosedTurnIndex = 0; + + // ── Session timing ──────────────────────────────────────────────────────── + // All timestamps are from FPlatformTime::Seconds(), used to compute [T+Xs] log prefixes. + double SessionStartTime = 0.0; // Set in HandleConnected — start of conversation. + double TurnStartTime = 0.0; // Set in StartListening — when mic opens. + double TurnEndTime = 0.0; // Set in StopListening — when mic closes. + double AgentSpeakStart = 0.0; // Set in EnqueueAgentAudio (first chunk) — when audio begins. // Accumulates incoming PCM bytes until the audio component needs data. TArray AudioQueue; @@ -293,14 +342,22 @@ private: // Silence detection: how many consecutive ticks with an empty audio queue. int32 SilentTickCount = 0; + // Generating timeout: how many consecutive ticks bAgentGenerating has been true + // without any audio arriving. If this reaches HardSilenceTimeoutTicks, bAgentGenerating + // is force-cleared so StartListening() is no longer blocked. This covers the edge case + // where the server sends agent_chat_response_part but the TTS pipeline produces no audio. + int32 GeneratingTickCount = 0; + // Primary threshold: fire OnAgentStoppedSpeaking after this many silent ticks // once the server has confirmed the full response (bAgentResponseReceived=true). // 30 ticks ≈ 0.5s at 60fps — enough to bridge brief inter-chunk gaps in the TTS stream. static constexpr int32 SilenceThresholdTicks = 30; - // Hard-timeout fallback: fire even without agent_response confirmation after 2s - // of silence (handles edge cases where agent_response is very late or missing). - static constexpr int32 HardSilenceTimeoutTicks = 120; // 2s at 60fps + // Hard-timeout fallback: fire even without agent_response confirmation after 10s + // of silence. 
This covers edge cases where agent_response is very late or missing, + // while being long enough to bridge inter-batch TTS gaps (observed up to ~5s). + // Previously 2s — raised after logs showed premature firing during multi-batch responses. + static constexpr int32 HardSilenceTimeoutTicks = 600; // 10s at 60fps // True once the server sends agent_response for the current turn. // The server sends the full text when generation is complete — this is the