Fix 3 WebSocket protocol bugs found by API cross-check

Bug 1 — Transcript handler: wrong type string + wrong JSON fields
  - type was "transcript", API sends "user_transcript"
  - event key was "transcript_event", API uses "user_transcription_event"
  - field was "message", API uses "user_transcript"
  - removed non-existent "speaker"/"is_final" fields; speaker is always "user"

Bug 2 — Pong format: event_id must be top-level, not nested in pong_event
  - Was: {"type":"pong","pong_event":{"event_id":1}}
  - Fixed: {"type":"pong","event_id":1}

Bug 3 — Client turn mode: user_turn_start/end don't exist in the API
  - SendUserTurnStart now sends {"type":"user_activity"} (correct API message)
  - SendUserTurnEnd now a no-op with log (no explicit end message in API)
  - Renamed constants in ElevenLabsDefinitions.h accordingly

Also added AgentResponseCorrection and ConversationClientData constants.
Compiles cleanly on UE 5.5 Win64.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
j.foucher 2026-02-19 13:43:08 +01:00
parent dbd61615a9
commit ae2c9b92e8
3 changed files with 47 additions and 26 deletions

View File

@ -109,18 +109,21 @@ void UElevenLabsWebSocketProxy::SendAudioChunk(const TArray<uint8>& PCMData)
void UElevenLabsWebSocketProxy::SendUserTurnStart() void UElevenLabsWebSocketProxy::SendUserTurnStart()
{ {
// In client turn mode, signal that the user is active/speaking.
// API message: { "type": "user_activity" }
if (!IsConnected()) return; if (!IsConnected()) return;
TSharedPtr<FJsonObject> Msg = MakeShareable(new FJsonObject()); TSharedPtr<FJsonObject> Msg = MakeShareable(new FJsonObject());
Msg->SetStringField(TEXT("type"), ElevenLabsMessageType::UserTurnStart); Msg->SetStringField(TEXT("type"), ElevenLabsMessageType::UserActivity);
SendJsonMessage(Msg); SendJsonMessage(Msg);
} }
void UElevenLabsWebSocketProxy::SendUserTurnEnd() void UElevenLabsWebSocketProxy::SendUserTurnEnd()
{ {
if (!IsConnected()) return; // In client turn mode, stopping user_activity signals end of user turn.
TSharedPtr<FJsonObject> Msg = MakeShareable(new FJsonObject()); // The API uses user_activity for ongoing speech; simply stop sending it.
Msg->SetStringField(TEXT("type"), ElevenLabsMessageType::UserTurnEnd); // No explicit end message is required — silence is detected server-side.
SendJsonMessage(Msg); // We still log for debug visibility.
UE_LOG(LogElevenLabsWS, Log, TEXT("User turn ended (client mode) — stopped sending user_activity."));
} }
void UElevenLabsWebSocketProxy::SendInterrupt() void UElevenLabsWebSocketProxy::SendInterrupt()
@ -189,7 +192,7 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message)
{ {
HandleAudioResponse(Root); HandleAudioResponse(Root);
} }
else if (MsgType == ElevenLabsMessageType::Transcript) else if (MsgType == ElevenLabsMessageType::UserTranscript)
{ {
HandleTranscript(Root); HandleTranscript(Root);
} }
@ -197,6 +200,11 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message)
{ {
HandleAgentResponse(Root); HandleAgentResponse(Root);
} }
else if (MsgType == ElevenLabsMessageType::AgentResponseCorrection)
{
// Silently ignore for now — corrected text after interruption.
UE_LOG(LogElevenLabsWS, Verbose, TEXT("agent_response_correction received (ignored)."));
}
else if (MsgType == ElevenLabsMessageType::InterruptionEvent) else if (MsgType == ElevenLabsMessageType::InterruptionEvent)
{ {
HandleInterruption(Root); HandleInterruption(Root);
@ -273,22 +281,23 @@ void UElevenLabsWebSocketProxy::HandleAudioResponse(const TSharedPtr<FJsonObject
void UElevenLabsWebSocketProxy::HandleTranscript(const TSharedPtr<FJsonObject>& Root) void UElevenLabsWebSocketProxy::HandleTranscript(const TSharedPtr<FJsonObject>& Root)
{ {
// Expected structure: // API structure:
// { "type": "transcript", // { "type": "user_transcript",
// "transcript_event": { "speaker": "user"|"agent", "message": "...", "event_id": 1 } // "user_transcription_event": { "user_transcript": "Hello" }
// } // }
// This message only carries the user's speech-to-text — speaker is always "user".
const TSharedPtr<FJsonObject>* TranscriptEvent = nullptr; const TSharedPtr<FJsonObject>* TranscriptEvent = nullptr;
if (!Root->TryGetObjectField(TEXT("transcript_event"), TranscriptEvent) || !TranscriptEvent) if (!Root->TryGetObjectField(TEXT("user_transcription_event"), TranscriptEvent) || !TranscriptEvent)
{ {
UE_LOG(LogElevenLabsWS, Warning, TEXT("user_transcript message missing 'user_transcription_event' field."));
return; return;
} }
FElevenLabsTranscriptSegment Segment; FElevenLabsTranscriptSegment Segment;
(*TranscriptEvent)->TryGetStringField(TEXT("speaker"), Segment.Speaker); Segment.Speaker = TEXT("user");
(*TranscriptEvent)->TryGetStringField(TEXT("message"), Segment.Text); (*TranscriptEvent)->TryGetStringField(TEXT("user_transcript"), Segment.Text);
// user_transcript messages are always final (interim results are not sent for user speech)
// ElevenLabs marks final vs. interim via "is_final" Segment.bIsFinal = true;
(*TranscriptEvent)->TryGetBoolField(TEXT("is_final"), Segment.bIsFinal);
OnTranscript.Broadcast(Segment); OnTranscript.Broadcast(Segment);
} }
@ -318,7 +327,8 @@ void UElevenLabsWebSocketProxy::HandleInterruption(const TSharedPtr<FJsonObject>
void UElevenLabsWebSocketProxy::HandlePing(const TSharedPtr<FJsonObject>& Root) void UElevenLabsWebSocketProxy::HandlePing(const TSharedPtr<FJsonObject>& Root)
{ {
// Reply with a pong to keep the connection alive. // Reply with a pong to keep the connection alive.
// { "type": "ping", "ping_event": { "event_id": 1 } } // Incoming: { "type": "ping", "ping_event": { "event_id": 1, "ping_ms": 150 } }
// Reply: { "type": "pong", "event_id": 1 } ← event_id is top-level, no wrapper object
int32 EventID = 0; int32 EventID = 0;
const TSharedPtr<FJsonObject>* PingEvent = nullptr; const TSharedPtr<FJsonObject>* PingEvent = nullptr;
if (Root->TryGetObjectField(TEXT("ping_event"), PingEvent) && PingEvent) if (Root->TryGetObjectField(TEXT("ping_event"), PingEvent) && PingEvent)
@ -328,9 +338,7 @@ void UElevenLabsWebSocketProxy::HandlePing(const TSharedPtr<FJsonObject>& Root)
TSharedPtr<FJsonObject> Pong = MakeShareable(new FJsonObject()); TSharedPtr<FJsonObject> Pong = MakeShareable(new FJsonObject());
Pong->SetStringField(TEXT("type"), TEXT("pong")); Pong->SetStringField(TEXT("type"), TEXT("pong"));
TSharedPtr<FJsonObject> PongEvent = MakeShareable(new FJsonObject()); Pong->SetNumberField(TEXT("event_id"), EventID); // top-level, not nested
PongEvent->SetNumberField(TEXT("event_id"), EventID);
Pong->SetObjectField(TEXT("pong_event"), PongEvent);
SendJsonMessage(Pong); SendJsonMessage(Pong);
} }

View File

@ -36,16 +36,21 @@ namespace ElevenLabsMessageType
{ {
// Client → Server // Client → Server
static const FString AudioChunk = TEXT("user_audio_chunk"); static const FString AudioChunk = TEXT("user_audio_chunk");
static const FString UserTurnStart = TEXT("user_turn_start"); // Client turn mode: signal user is currently active/speaking
static const FString UserTurnEnd = TEXT("user_turn_end"); static const FString UserActivity = TEXT("user_activity");
static const FString Interrupt = TEXT("interrupt"); // Client turn mode: send a text message without audio
static const FString ClientToolResult = TEXT("client_tool_result"); static const FString UserMessage = TEXT("user_message");
static const FString Interrupt = TEXT("interrupt");
static const FString ClientToolResult = TEXT("client_tool_result");
static const FString ConversationClientData = TEXT("conversation_initiation_client_data");
// Server → Client // Server → Client
static const FString ConversationInitiation = TEXT("conversation_initiation_metadata"); static const FString ConversationInitiation = TEXT("conversation_initiation_metadata");
static const FString AudioResponse = TEXT("audio"); static const FString AudioResponse = TEXT("audio");
static const FString Transcript = TEXT("transcript"); // User speech-to-text transcript (speaker is always the user)
static const FString UserTranscript = TEXT("user_transcript");
static const FString AgentResponse = TEXT("agent_response"); static const FString AgentResponse = TEXT("agent_response");
static const FString AgentResponseCorrection= TEXT("agent_response_correction");
static const FString InterruptionEvent = TEXT("interruption"); static const FString InterruptionEvent = TEXT("interruption");
static const FString PingEvent = TEXT("ping"); static const FString PingEvent = TEXT("ping");
static const FString ClientToolCall = TEXT("client_tool_call"); static const FString ClientToolCall = TEXT("client_tool_call");

View File

@ -120,11 +120,19 @@ public:
// ── Turn control (only relevant in Client turn mode) ────────────────────── // ── Turn control (only relevant in Client turn mode) ──────────────────────
/** Signal that the user has started speaking (Client turn mode). */ /**
* Signal that the user is actively speaking (Client turn mode).
* Sends a { "type": "user_activity" } message to the server.
* Call this periodically while the user is speaking (e.g. every audio chunk).
*/
UFUNCTION(BlueprintCallable, Category = "ElevenLabs") UFUNCTION(BlueprintCallable, Category = "ElevenLabs")
void SendUserTurnStart(); void SendUserTurnStart();
/** Signal that the user has finished speaking (Client turn mode). */ /**
* Signal that the user has finished speaking (Client turn mode).
* No explicit API message — simply stop sending user_activity.
* The server detects silence and hands the turn to the agent.
*/
UFUNCTION(BlueprintCallable, Category = "ElevenLabs") UFUNCTION(BlueprintCallable, Category = "ElevenLabs")
void SendUserTurnEnd(); void SendUserTurnEnd();