diff --git a/Unreal/PS_AI_Agent/Content/test_AI_Actor.uasset b/Unreal/PS_AI_Agent/Content/test_AI_Actor.uasset
index d0e446f..83bff19 100644
Binary files a/Unreal/PS_AI_Agent/Content/test_AI_Actor.uasset and b/Unreal/PS_AI_Agent/Content/test_AI_Actor.uasset differ
diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp
index 01836b7..a505247 100644
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp
@@ -158,6 +158,8 @@ void UElevenLabsConversationalAgentComponent::StartConversation()
 			&UElevenLabsConversationalAgentComponent::HandleInterrupted);
 		WebSocketProxy->OnAgentResponseStarted.AddDynamic(this,
 			&UElevenLabsConversationalAgentComponent::HandleAgentResponseStarted);
+		WebSocketProxy->OnAgentResponsePart.AddDynamic(this,
+			&UElevenLabsConversationalAgentComponent::HandleAgentResponsePart);
 	}
 
 	// Pass configuration to the proxy before connecting.
@@ -266,7 +268,23 @@ void UElevenLabsConversationalAgentComponent::StopListening()
 	// Flush any partially-accumulated mic audio before signalling end-of-turn.
 	// This ensures the final words aren't discarded just because the last callback
 	// didn't push the buffer over the MicChunkMinBytes threshold.
-	if (MicAccumulationBuffer.Num() > 0 && WebSocketProxy && IsConnected())
+	//
+	// EXCEPT during collision avoidance: bAgentGenerating is already true when
+	// HandleAgentResponseStarted calls StopListening (it sets the flag before calling us).
+	// Flushing audio to a server that is mid-generation can cause it to re-enter
+	// "user speaking" state and stall waiting for more audio that never arrives,
+	// leaving both sides stuck — no audio for the collision response and no response
+	// for subsequent turns.
+	if (bAgentGenerating)
+	{
+		if (MicAccumulationBuffer.Num() > 0)
+		{
+			UE_LOG(LogElevenLabsAgent, Log,
+				TEXT("StopListening: discarding %d bytes of accumulated mic audio (collision — server is mid-generation)."),
+				MicAccumulationBuffer.Num());
+		}
+	}
+	else if (MicAccumulationBuffer.Num() > 0 && WebSocketProxy && IsConnected())
 	{
 		WebSocketProxy->SendAudioChunk(MicAccumulationBuffer);
 	}
@@ -423,7 +441,8 @@ void UElevenLabsConversationalAgentComponent::HandleAgentResponseStarted()
 {
 	// The server has started generating a response (first agent_chat_response_part).
 	// Set bAgentGenerating BEFORE StopListening so that any StartListening call
-	// triggered by the Blueprint's OnAgentStartedGenerating handler is blocked.
+	// triggered by the Blueprint's OnAgentStartedGenerating handler is blocked,
+	// and so that StopListening knows to skip the mic buffer flush (collision path).
 	bAgentGenerating = true;
 	bWaitingForAgentResponse = false; // Server is generating — response timeout cancelled.
 
@@ -433,21 +452,39 @@ void UElevenLabsConversationalAgentComponent::HandleAgentResponseStarted()
 	if (bIsListening)
 	{
 		// Collision: server started generating Turn N's response while Turn M (M>N) mic was open.
-		// Log both turn indices so the timeline is unambiguous.
+		// The server's VAD detected a pause in the user's speech and started generating
+		// prematurely — the user hasn't finished speaking yet.
+		//
+		// Stop the mic WITHOUT flushing the accumulated audio buffer (see StopListening's
+		// bAgentGenerating guard). Flushing would send audio to a server that is mid-generation,
+		// causing it to re-enter "user speaking" state and stall — both sides stuck.
+		//
+		// Do NOT send an interrupt here: the ElevenLabs server does not always send the
+		// interruption ack, which would leave bIgnoreIncomingContent=true and silently
+		// discard all subsequent content. Instead, let the server's response play out:
+		//   - If audio arrives → EnqueueAgentAudio sets bAgentSpeaking, response plays normally.
+		//   - If audio never arrives → generating timeout (10s) clears bAgentGenerating.
+		// Either way the state machine recovers and Blueprint can reopen the mic.
 		UE_LOG(LogElevenLabsAgent, Log,
 			TEXT("[T+%.2fs] [Turn %d → Turn %d collision] Agent generating Turn %d response — mic (Turn %d) was open, stopping. (%.2fs after turn end)"),
 			T, LastClosedTurnIndex, TurnIndex, LastClosedTurnIndex, TurnIndex, LatencyFromTurnEnd);
 		StopListening();
 	}
-	else
-	{
-		UE_LOG(LogElevenLabsAgent, Log,
-			TEXT("[T+%.2fs] [Turn %d] Agent generating. (%.2fs after turn end)"),
-			T, LastClosedTurnIndex, LatencyFromTurnEnd);
-	}
+
+	UE_LOG(LogElevenLabsAgent, Log,
+		TEXT("[T+%.2fs] [Turn %d] Agent generating. (%.2fs after turn end)"),
+		T, LastClosedTurnIndex, LatencyFromTurnEnd);
 
 	OnAgentStartedGenerating.Broadcast();
 }
+void UElevenLabsConversationalAgentComponent::HandleAgentResponsePart(const FString& PartialText)
+{
+	if (bEnableAgentPartialResponse)
+	{
+		OnAgentPartialResponse.Broadcast(PartialText);
+	}
+}
+
 // ─────────────────────────────────────────────────────────────────────────────
 // Audio playback
 // ─────────────────────────────────────────────────────────────────────────────
diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp
index 23e1a8b..de6ffee 100644
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp
@@ -400,7 +400,7 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message)
 	}
 	else if (MsgType == ElevenLabsMessageType::AgentChatResponsePart)
 	{
-		HandleAgentChatResponsePart();
+		HandleAgentChatResponsePart(Root);
 	}
 	else if (MsgType == ElevenLabsMessageType::AgentResponseCorrection)
 	{
@@ -602,7 +602,7 @@ void UElevenLabsWebSocketProxy::HandleAgentResponse(const TSharedPtr<FJsonObject>& Root)
 	}
 }
 
-void UElevenLabsWebSocketProxy::HandleAgentChatResponsePart()
+void UElevenLabsWebSocketProxy::HandleAgentChatResponsePart(const TSharedPtr<FJsonObject>& Root)
 {
 	// Ignore response parts that belong to a generation we have already interrupted.
 	// Without this guard, old parts arriving after SendInterrupt() would re-trigger
@@ -628,7 +628,21 @@ void UElevenLabsWebSocketProxy::HandleAgentChatResponsePart()
 			T, LatencyFromTurnEnd);
 		OnAgentResponseStarted.Broadcast();
 	}
-	// Subsequent parts logged at Verbose only (can be dozens per response).
+
+	// Extract the streaming text fragment and broadcast it.
+	// API structure:
+	//   { "type": "agent_chat_response_part",
+	//     "agent_chat_response_part_event": { "agent_response_part": "partial text" }
+	//   }
+	const TSharedPtr<FJsonObject>* PartEvent = nullptr;
+	if (Root->TryGetObjectField(TEXT("agent_chat_response_part_event"), PartEvent) && PartEvent)
+	{
+		FString PartText;
+		if ((*PartEvent)->TryGetStringField(TEXT("agent_response_part"), PartText) && !PartText.IsEmpty())
+		{
+			OnAgentResponsePart.Broadcast(PartText);
+		}
+	}
 }
 
 void UElevenLabsWebSocketProxy::HandleInterruption(const TSharedPtr<FJsonObject>& Root)
diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h
index 83346fd..05d0665 100644
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h
@@ -43,6 +43,15 @@ DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnAgentInterrupted);
 */
 DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnAgentStartedGenerating);
 
+/**
+ * Fired for every agent_chat_response_part — streams the agent's text as the LLM
+ * generates it, token by token. Use this for real-time subtitles / text display.
+ * Each call provides the text fragment from that individual part (NOT accumulated).
+ * The final complete text is still available via OnAgentTextResponse (agent_response).
+ */
+DECLARE_DYNAMIC_MULTICAST_DELEGATE_OneParam(FOnAgentPartialResponse,
+	const FString&, PartialText);
+
 /**
  * Fired when the server has not started generating a response within ResponseTimeoutSeconds
  * after the user stopped speaking (StopListening was called).
@@ -138,6 +147,15 @@ public:
 	UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events")
 	bool bEnableAgentTextResponse = true;
 
+	/**
+	 * Forward streaming text parts (agent_chat_response_part events) to the
+	 * OnAgentPartialResponse delegate. Each part is a text fragment as the LLM
+	 * generates it — use this for real-time subtitles that appear while the agent
+	 * speaks, instead of waiting for the full text (OnAgentTextResponse).
+	 */
+	UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events")
+	bool bEnableAgentPartialResponse = false;
+
 	/**
 	 * How many seconds to wait for the server to start generating a response
 	 * after the user stops speaking (StopListening) before firing OnAgentResponseTimeout.
@@ -168,6 +186,14 @@ public:
 	UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
 	FOnAgentTextResponse OnAgentTextResponse;
 
+	/**
+	 * Streaming text fragments as the LLM generates them.
+	 * Fires for every agent_chat_response_part — each call gives one text chunk.
+	 * Enable with bEnableAgentPartialResponse.
+	 */
+	UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
+	FOnAgentPartialResponse OnAgentPartialResponse;
+
 	UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
 	FOnAgentStartedSpeaking OnAgentStartedSpeaking;
 
@@ -285,6 +311,9 @@ private:
 	UFUNCTION()
 	void HandleAgentResponseStarted();
 
+	UFUNCTION()
+	void HandleAgentResponsePart(const FString& PartialText);
+
 	// ── Audio playback ────────────────────────────────────────────────────────
 	void InitAudioPlayback();
 	void EnqueueAgentAudio(const TArray<uint8>& PCMData);
diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h
index 09a4d28..b75d6e7 100644
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h
@@ -43,6 +43,11 @@ DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnElevenLabsInterrupted);
 */
 DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnElevenLabsAgentResponseStarted);
 
+/** Fired for every agent_chat_response_part — streams the LLM text as it is generated.
+ *  PartialText is the text fragment from this individual part (NOT accumulated). */
+DECLARE_DYNAMIC_MULTICAST_DELEGATE_OneParam(FOnElevenLabsAgentResponsePart,
+	const FString&, PartialText);
+
 // ─────────────────────────────────────────────────────────────────────────────
 // WebSocket Proxy
 // ─────────────────────────────────────────────────────────────────────────────
@@ -94,6 +99,10 @@ public:
 	UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
 	FOnElevenLabsAgentResponseStarted OnAgentResponseStarted;
 
+	/** Fired for every agent_chat_response_part with the streaming text fragment. */
+	UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
+	FOnElevenLabsAgentResponsePart OnAgentResponsePart;
+
 	// ── Lifecycle ─────────────────────────────────────────────────────────────
 
 	/**
@@ -182,7 +191,7 @@ private:
 	void HandleAudioResponse(const TSharedPtr<FJsonObject>& Payload);
 	void HandleTranscript(const TSharedPtr<FJsonObject>& Payload);
 	void HandleAgentResponse(const TSharedPtr<FJsonObject>& Payload);
-	void HandleAgentChatResponsePart();
+	void HandleAgentChatResponsePart(const TSharedPtr<FJsonObject>& Payload);
 	void HandleInterruption(const TSharedPtr<FJsonObject>& Payload);
 	void HandlePing(const TSharedPtr<FJsonObject>& Payload);
 