Fix 3 WebSocket protocol bugs found by API cross-check

Bug 1 — Transcript handler: wrong type string + wrong JSON fields
  - type was "transcript", API sends "user_transcript"
  - event key was "transcript_event", API uses "user_transcription_event"
  - field was "message", API uses "user_transcript"
  - removed non-existent "speaker"/"is_final" fields; speaker is always "user"

Bug 2 — Pong format: event_id must be top-level, not nested in pong_event
  - Was: {"type":"pong","pong_event":{"event_id":1}}
  - Fixed: {"type":"pong","event_id":1}

Bug 3 — Client turn mode: user_turn_start/end don't exist in the API
  - SendUserTurnStart now sends {"type":"user_activity"} (correct API message)
  - SendUserTurnEnd now a no-op with log (no explicit end message in API)
  - Renamed constants in ElevenLabsDefinitions.h accordingly

Also added AgentResponseCorrection and ConversationClientData constants.
Compiles cleanly on UE 5.5 Win64.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
j.foucher 2026-02-19 13:43:08 +01:00
parent dbd61615a9
commit ae2c9b92e8
3 changed files with 47 additions and 26 deletions

View File

@ -109,18 +109,21 @@ void UElevenLabsWebSocketProxy::SendAudioChunk(const TArray<uint8>& PCMData)
void UElevenLabsWebSocketProxy::SendUserTurnStart()
{
	// In client turn mode, signal that the user is active/speaking.
	// API message: { "type": "user_activity" }
	// NOTE: call this periodically while the user speaks; there is no
	// corresponding explicit "end" message in the API (see SendUserTurnEnd).
	if (!IsConnected()) return;
	TSharedPtr<FJsonObject> Msg = MakeShareable(new FJsonObject());
	Msg->SetStringField(TEXT("type"), ElevenLabsMessageType::UserActivity);
	SendJsonMessage(Msg);
}
void UElevenLabsWebSocketProxy::SendUserTurnEnd()
{
	// In client turn mode, stopping user_activity signals end of user turn.
	// The API uses user_activity for ongoing speech; simply stop sending it.
	// No explicit end message is required — silence is detected server-side.
	// Intentionally a no-op on the wire; we only log for debug visibility.
	UE_LOG(LogElevenLabsWS, Log, TEXT("User turn ended (client mode) — stopped sending user_activity."));
}
void UElevenLabsWebSocketProxy::SendInterrupt()
@ -189,7 +192,7 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message)
{
HandleAudioResponse(Root);
}
else if (MsgType == ElevenLabsMessageType::Transcript)
else if (MsgType == ElevenLabsMessageType::UserTranscript)
{
HandleTranscript(Root);
}
@ -197,6 +200,11 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message)
{
HandleAgentResponse(Root);
}
else if (MsgType == ElevenLabsMessageType::AgentResponseCorrection)
{
// Silently ignore for now — corrected text after interruption.
UE_LOG(LogElevenLabsWS, Verbose, TEXT("agent_response_correction received (ignored)."));
}
else if (MsgType == ElevenLabsMessageType::InterruptionEvent)
{
HandleInterruption(Root);
@ -273,22 +281,23 @@ void UElevenLabsWebSocketProxy::HandleAudioResponse(const TSharedPtr<FJsonObject
void UElevenLabsWebSocketProxy::HandleTranscript(const TSharedPtr<FJsonObject>& Root)
{
	// API structure:
	// { "type": "user_transcript",
	//   "user_transcription_event": { "user_transcript": "Hello" }
	// }
	// This message only carries the user's speech-to-text — speaker is always "user".
	const TSharedPtr<FJsonObject>* TranscriptEvent = nullptr;
	if (!Root->TryGetObjectField(TEXT("user_transcription_event"), TranscriptEvent) || !TranscriptEvent)
	{
		UE_LOG(LogElevenLabsWS, Warning, TEXT("user_transcript message missing 'user_transcription_event' field."));
		return;
	}
	FElevenLabsTranscriptSegment Segment;
	Segment.Speaker = TEXT("user");
	(*TranscriptEvent)->TryGetStringField(TEXT("user_transcript"), Segment.Text);
	// user_transcript messages are always final (interim results are not sent for user speech)
	Segment.bIsFinal = true;
	OnTranscript.Broadcast(Segment);
}
@ -318,7 +327,8 @@ void UElevenLabsWebSocketProxy::HandleInterruption(const TSharedPtr<FJsonObject>
void UElevenLabsWebSocketProxy::HandlePing(const TSharedPtr<FJsonObject>& Root)
{
	// Reply with a pong to keep the connection alive.
	// Incoming: { "type": "ping", "ping_event": { "event_id": 1, "ping_ms": 150 } }
	// Reply:    { "type": "pong", "event_id": 1 }  — event_id is top-level, no wrapper object
	int32 EventID = 0;
	const TSharedPtr<FJsonObject>* PingEvent = nullptr;
	if (Root->TryGetObjectField(TEXT("ping_event"), PingEvent) && PingEvent)
	{
		// Echo the server's event_id so it can match our pong to its ping.
		// NOTE(review): extraction reconstructed across an elided diff hunk — confirm
		// against the full file that the original reads "event_id" from ping_event.
		(*PingEvent)->TryGetNumberField(TEXT("event_id"), EventID);
	}
	TSharedPtr<FJsonObject> Pong = MakeShareable(new FJsonObject());
	Pong->SetStringField(TEXT("type"), TEXT("pong"));
	Pong->SetNumberField(TEXT("event_id"), EventID); // top-level, not nested
	SendJsonMessage(Pong);
}

View File

@ -36,16 +36,21 @@ namespace ElevenLabsMessageType
{
// Client → Server
static const FString AudioChunk = TEXT("user_audio_chunk");
// NOTE(review): stale pre-rename constants. user_turn_start / user_turn_end do not
// exist in the ElevenLabs API; superseded by UserActivity below. Still referenced
// by the old SendUserTurnStart body — remove both together. TODO confirm no callers remain.
static const FString UserTurnStart = TEXT("user_turn_start");
static const FString UserTurnEnd = TEXT("user_turn_end");
// Client turn mode: signal user is currently active/speaking
static const FString UserActivity = TEXT("user_activity");
// Client turn mode: send a text message without audio
static const FString UserMessage = TEXT("user_message");
static const FString Interrupt = TEXT("interrupt");
static const FString ClientToolResult = TEXT("client_tool_result");
// First message sent after connecting: conversation configuration overrides
static const FString ConversationClientData = TEXT("conversation_initiation_client_data");
// Server → Client
static const FString ConversationInitiation = TEXT("conversation_initiation_metadata");
static const FString AudioResponse = TEXT("audio");
// NOTE(review): stale — the API sends "user_transcript", not "transcript";
// superseded by UserTranscript below. Remove once no callers remain.
static const FString Transcript = TEXT("transcript");
// User speech-to-text transcript (speaker is always the user)
static const FString UserTranscript = TEXT("user_transcript");
static const FString AgentResponse = TEXT("agent_response");
// Corrected agent text after an interruption truncated the spoken response
static const FString AgentResponseCorrection= TEXT("agent_response_correction");
static const FString InterruptionEvent = TEXT("interruption");
static const FString PingEvent = TEXT("ping");
static const FString ClientToolCall = TEXT("client_tool_call");

View File

@ -120,11 +120,19 @@ public:
// ── Turn control (only relevant in Client turn mode) ──────────────────────
/**
 * Signal that the user is actively speaking (Client turn mode).
 * Sends a { "type": "user_activity" } message to the server.
 * Call this periodically while the user is speaking (e.g. every audio chunk).
 */
UFUNCTION(BlueprintCallable, Category = "ElevenLabs")
void SendUserTurnStart();
/**
 * Signal that the user has finished speaking (Client turn mode).
 * No explicit API message is sent — simply stop sending user_activity.
 * The server detects silence and hands the turn to the agent.
 */
UFUNCTION(BlueprintCallable, Category = "ElevenLabs")
void SendUserTurnEnd();