diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp
index 296e037..268e9e2 100644
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp
@@ -109,18 +109,21 @@ void UElevenLabsWebSocketProxy::SendAudioChunk(const TArray<uint8>& PCMData)
 
 void UElevenLabsWebSocketProxy::SendUserTurnStart()
 {
+	// In client turn mode, signal that the user is active/speaking.
+	// API message: { "type": "user_activity" }
 	if (!IsConnected()) return;
 	TSharedPtr<FJsonObject> Msg = MakeShareable(new FJsonObject());
-	Msg->SetStringField(TEXT("type"), ElevenLabsMessageType::UserTurnStart);
+	Msg->SetStringField(TEXT("type"), ElevenLabsMessageType::UserActivity);
 	SendJsonMessage(Msg);
 }
 
 void UElevenLabsWebSocketProxy::SendUserTurnEnd()
 {
-	if (!IsConnected()) return;
-	TSharedPtr<FJsonObject> Msg = MakeShareable(new FJsonObject());
-	Msg->SetStringField(TEXT("type"), ElevenLabsMessageType::UserTurnEnd);
-	SendJsonMessage(Msg);
+	// In client turn mode, stopping user_activity signals end of user turn.
+	// The API uses user_activity for ongoing speech; simply stop sending it.
+	// No explicit end message is required — silence is detected server-side.
+	// We still log for debug visibility.
+	UE_LOG(LogElevenLabsWS, Log, TEXT("User turn ended (client mode) — stopped sending user_activity."));
 }
 
 void UElevenLabsWebSocketProxy::SendInterrupt()
@@ -189,7 +192,7 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message)
 	{
 		HandleAudioResponse(Root);
 	}
-	else if (MsgType == ElevenLabsMessageType::Transcript)
+	else if (MsgType == ElevenLabsMessageType::UserTranscript)
 	{
 		HandleTranscript(Root);
 	}
@@ -197,6 +200,11 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message)
 	{
 		HandleAgentResponse(Root);
 	}
+	else if (MsgType == ElevenLabsMessageType::AgentResponseCorrection)
+	{
+		// Silently ignore for now — corrected text after interruption.
+		UE_LOG(LogElevenLabsWS, Verbose, TEXT("agent_response_correction received (ignored)."));
+	}
 	else if (MsgType == ElevenLabsMessageType::InterruptionEvent)
 	{
 		HandleInterruption(Root);
 	}
@@ -273,22 +281,23 @@ void UElevenLabsWebSocketProxy::HandleAudioResponse(const TSharedPtr<FJsonObject>& Root)
 {
-	// Expected structure:
-	// { "type": "transcript",
-	//   "transcript_event": { "speaker": "user"|"agent", "message": "...", "event_id": 1 }
+	// API structure:
+	// { "type": "user_transcript",
+	//   "user_transcription_event": { "user_transcript": "Hello" }
 	// }
+	// This message only carries the user's speech-to-text — speaker is always "user".
 	const TSharedPtr<FJsonObject>* TranscriptEvent = nullptr;
-	if (!Root->TryGetObjectField(TEXT("transcript_event"), TranscriptEvent) || !TranscriptEvent)
+	if (!Root->TryGetObjectField(TEXT("user_transcription_event"), TranscriptEvent) || !TranscriptEvent)
 	{
+		UE_LOG(LogElevenLabsWS, Warning, TEXT("user_transcript message missing 'user_transcription_event' field."));
 		return;
 	}
 
 	FElevenLabsTranscriptSegment Segment;
-	(*TranscriptEvent)->TryGetStringField(TEXT("speaker"), Segment.Speaker);
-	(*TranscriptEvent)->TryGetStringField(TEXT("message"), Segment.Text);
-
-	// ElevenLabs marks final vs. interim via "is_final"
-	(*TranscriptEvent)->TryGetBoolField(TEXT("is_final"), Segment.bIsFinal);
+	Segment.Speaker = TEXT("user");
+	(*TranscriptEvent)->TryGetStringField(TEXT("user_transcript"), Segment.Text);
+	// user_transcript messages are always final (interim results are not sent for user speech)
+	Segment.bIsFinal = true;
 
 	OnTranscript.Broadcast(Segment);
 }
@@ -318,7 +327,8 @@ void UElevenLabsWebSocketProxy::HandleInterruption(const TSharedPtr<FJsonObject>& Root)
 void UElevenLabsWebSocketProxy::HandlePing(const TSharedPtr<FJsonObject>& Root)
 {
 	// Reply with a pong to keep the connection alive.
-	// { "type": "ping", "ping_event": { "event_id": 1 } }
+	// Incoming: { "type": "ping", "ping_event": { "event_id": 1, "ping_ms": 150 } }
+	// Reply: { "type": "pong", "event_id": 1 } ← event_id is top-level, no wrapper object
 	int32 EventID = 0;
 	const TSharedPtr<FJsonObject>* PingEvent = nullptr;
 	if (Root->TryGetObjectField(TEXT("ping_event"), PingEvent) && PingEvent)
@@ -328,9 +338,7 @@ void UElevenLabsWebSocketProxy::HandlePing(const TSharedPtr<FJsonObject>& Root)
 
 	TSharedPtr<FJsonObject> Pong = MakeShareable(new FJsonObject());
 	Pong->SetStringField(TEXT("type"), TEXT("pong"));
-	TSharedPtr<FJsonObject> PongEvent = MakeShareable(new FJsonObject());
-	PongEvent->SetNumberField(TEXT("event_id"), EventID);
-	Pong->SetObjectField(TEXT("pong_event"), PongEvent);
+	Pong->SetNumberField(TEXT("event_id"), EventID); // top-level, not nested
 
 	SendJsonMessage(Pong);
 }
diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsDefinitions.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsDefinitions.h
index 7f3dcd8..b2af349 100644
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsDefinitions.h
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsDefinitions.h
@@ -36,16 +36,21 @@ namespace ElevenLabsMessageType
 {
 	// Client → Server
 	static const FString AudioChunk             = TEXT("user_audio_chunk");
-	static const FString UserTurnStart          = TEXT("user_turn_start");
-	static const FString UserTurnEnd            = TEXT("user_turn_end");
-	static const FString Interrupt              = TEXT("interrupt");
-	static const FString ClientToolResult       = TEXT("client_tool_result");
+	// Client turn mode: signal user is currently active/speaking
+	static const FString UserActivity           = TEXT("user_activity");
+	// Client turn mode: send a text message without audio
+	static const FString UserMessage            = TEXT("user_message");
+	static const FString Interrupt              = TEXT("interrupt");
+	static const FString ClientToolResult       = TEXT("client_tool_result");
+	static const FString ConversationClientData = TEXT("conversation_initiation_client_data");
 
 	// Server → Client
 	static const FString ConversationInitiation = TEXT("conversation_initiation_metadata");
 	static const FString AudioResponse          = TEXT("audio");
-	static const FString Transcript             = TEXT("transcript");
+	// User speech-to-text transcript (speaker is always the user)
+	static const FString UserTranscript         = TEXT("user_transcript");
 	static const FString AgentResponse          = TEXT("agent_response");
+	static const FString AgentResponseCorrection= TEXT("agent_response_correction");
 	static const FString InterruptionEvent      = TEXT("interruption");
 	static const FString PingEvent              = TEXT("ping");
 	static const FString ClientToolCall         = TEXT("client_tool_call");
diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h
index 2e86a21..f9d545a 100644
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h
@@ -120,11 +120,19 @@ public:
 	// ── Turn control (only relevant in Client turn mode) ──────────────────────
 
-	/** Signal that the user has started speaking (Client turn mode). */
+	/**
+	 * Signal that the user is actively speaking (Client turn mode).
+	 * Sends a { "type": "user_activity" } message to the server.
+	 * Call this periodically while the user is speaking (e.g. every audio chunk).
+	 */
 	UFUNCTION(BlueprintCallable, Category = "ElevenLabs")
 	void SendUserTurnStart();
 
-	/** Signal that the user has finished speaking (Client turn mode). */
+	/**
+	 * Signal that the user has finished speaking (Client turn mode).
+	 * No explicit API message — simply stop sending user_activity.
+	 * The server detects silence and hands the turn to the agent.
+	 */
 	UFUNCTION(BlueprintCallable, Category = "ElevenLabs")
 	void SendUserTurnEnd();