Fix 3 WebSocket protocol bugs found by API cross-check

Bug 1 — Transcript handler: wrong type string + wrong JSON fields
  - type was "transcript", API sends "user_transcript"
  - event key was "transcript_event", API uses "user_transcription_event"
  - field was "message", API uses "user_transcript"
  - removed non-existent "speaker"/"is_final" fields; speaker is always "user"

Bug 2 — Pong format: event_id must be top-level, not nested in pong_event
  - Was: {"type":"pong","pong_event":{"event_id":1}}
  - Fixed: {"type":"pong","event_id":1}

Bug 3 — Client turn mode: user_turn_start/end don't exist in the API
  - SendUserTurnStart now sends {"type":"user_activity"} (correct API message)
  - SendUserTurnEnd now a no-op with log (no explicit end message in API)
  - Renamed constants in ElevenLabsDefinitions.h accordingly

Also added AgentResponseCorrection and ConversationClientData constants.
Compiles cleanly on UE 5.5 Win64.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
j.foucher 2026-02-19 13:43:08 +01:00
parent dbd61615a9
commit ae2c9b92e8
3 changed files with 47 additions and 26 deletions

View File

@ -109,18 +109,21 @@ void UElevenLabsWebSocketProxy::SendAudioChunk(const TArray<uint8>& PCMData)
void UElevenLabsWebSocketProxy::SendUserTurnStart()
{
	// In client turn mode, signal that the user is active/speaking.
	// API message: { "type": "user_activity" }
	// NOTE: call this periodically while the user speaks; there is no
	// corresponding explicit "end" message in the API (see SendUserTurnEnd).
	if (!IsConnected()) return;
	TSharedPtr<FJsonObject> Msg = MakeShareable(new FJsonObject());
	Msg->SetStringField(TEXT("type"), ElevenLabsMessageType::UserActivity);
	SendJsonMessage(Msg);
}
void UElevenLabsWebSocketProxy::SendUserTurnEnd()
{
	// In client turn mode, stopping user_activity signals end of user turn.
	// The API uses user_activity for ongoing speech; simply stop sending it.
	// No explicit end message is required — silence is detected server-side.
	// Intentionally a no-op on the wire; we only log for debug visibility.
	UE_LOG(LogElevenLabsWS, Log, TEXT("User turn ended (client mode) — stopped sending user_activity."));
}
void UElevenLabsWebSocketProxy::SendInterrupt()
@ -189,7 +192,7 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message)
{
HandleAudioResponse(Root);
}
else if (MsgType == ElevenLabsMessageType::Transcript)
else if (MsgType == ElevenLabsMessageType::UserTranscript)
{
HandleTranscript(Root);
}
@ -197,6 +200,11 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message)
{
HandleAgentResponse(Root);
}
else if (MsgType == ElevenLabsMessageType::AgentResponseCorrection)
{
// Silently ignore for now — corrected text after interruption.
UE_LOG(LogElevenLabsWS, Verbose, TEXT("agent_response_correction received (ignored)."));
}
else if (MsgType == ElevenLabsMessageType::InterruptionEvent)
{
HandleInterruption(Root);
@ -273,22 +281,23 @@ void UElevenLabsWebSocketProxy::HandleAudioResponse(const TSharedPtr<FJsonObject
void UElevenLabsWebSocketProxy::HandleTranscript(const TSharedPtr<FJsonObject>& Root)
{
	// API structure:
	// { "type": "user_transcript",
	//   "user_transcription_event": { "user_transcript": "Hello" }
	// }
	// This message only carries the user's speech-to-text — speaker is always "user".
	const TSharedPtr<FJsonObject>* TranscriptEvent = nullptr;
	if (!Root->TryGetObjectField(TEXT("user_transcription_event"), TranscriptEvent) || !TranscriptEvent)
	{
		UE_LOG(LogElevenLabsWS, Warning, TEXT("user_transcript message missing 'user_transcription_event' field."));
		return;
	}
	FElevenLabsTranscriptSegment Segment;
	Segment.Speaker = TEXT("user");
	(*TranscriptEvent)->TryGetStringField(TEXT("user_transcript"), Segment.Text);
	// user_transcript messages are always final (interim results are not sent for user speech)
	Segment.bIsFinal = true;
	OnTranscript.Broadcast(Segment);
}
@ -318,7 +327,8 @@ void UElevenLabsWebSocketProxy::HandleInterruption(const TSharedPtr<FJsonObject>
void UElevenLabsWebSocketProxy::HandlePing(const TSharedPtr<FJsonObject>& Root)
{
	// Reply with a pong to keep the connection alive.
	// Incoming: { "type": "ping", "ping_event": { "event_id": 1, "ping_ms": 150 } }
	// Reply:    { "type": "pong", "event_id": 1 }  — event_id is top-level, no wrapper object
	int32 EventID = 0;
	const TSharedPtr<FJsonObject>* PingEvent = nullptr;
	if (Root->TryGetObjectField(TEXT("ping_event"), PingEvent) && PingEvent)
	{
		// Echo the server's event_id so it can match our pong to its ping.
		// NOTE(review): extraction reconstructed across an elided diff hunk — confirm
		// against the full file that the original reads "event_id" from ping_event.
		(*PingEvent)->TryGetNumberField(TEXT("event_id"), EventID);
	}
	TSharedPtr<FJsonObject> Pong = MakeShareable(new FJsonObject());
	Pong->SetStringField(TEXT("type"), TEXT("pong"));
	Pong->SetNumberField(TEXT("event_id"), EventID); // top-level, not nested
	SendJsonMessage(Pong);
}

View File

@ -36,16 +36,21 @@ namespace ElevenLabsMessageType
{
// Client → Server
static const FString AudioChunk = TEXT("user_audio_chunk");
// NOTE(review): stale pre-rename constants. user_turn_start / user_turn_end do not
// exist in the ElevenLabs API; superseded by UserActivity below. Still referenced
// by the old SendUserTurnStart body — remove both together. TODO confirm no callers remain.
static const FString UserTurnStart = TEXT("user_turn_start");
static const FString UserTurnEnd = TEXT("user_turn_end");
// Client turn mode: signal user is currently active/speaking
static const FString UserActivity = TEXT("user_activity");
// Client turn mode: send a text message without audio
static const FString UserMessage = TEXT("user_message");
static const FString Interrupt = TEXT("interrupt");
static const FString ClientToolResult = TEXT("client_tool_result");
// First message sent after connecting: conversation configuration overrides
static const FString ConversationClientData = TEXT("conversation_initiation_client_data");
// Server → Client
static const FString ConversationInitiation = TEXT("conversation_initiation_metadata");
static const FString AudioResponse = TEXT("audio");
// NOTE(review): stale — the API sends "user_transcript", not "transcript";
// superseded by UserTranscript below. Remove once no callers remain.
static const FString Transcript = TEXT("transcript");
// User speech-to-text transcript (speaker is always the user)
static const FString UserTranscript = TEXT("user_transcript");
static const FString AgentResponse = TEXT("agent_response");
// Corrected agent text after an interruption truncated the spoken response
static const FString AgentResponseCorrection= TEXT("agent_response_correction");
static const FString InterruptionEvent = TEXT("interruption");
static const FString PingEvent = TEXT("ping");
static const FString ClientToolCall = TEXT("client_tool_call");

View File

@ -120,11 +120,19 @@ public:
// ── Turn control (only relevant in Client turn mode) ──────────────────────
/**
 * Signal that the user is actively speaking (Client turn mode).
 * Sends a { "type": "user_activity" } message to the server.
 * Call this periodically while the user is speaking (e.g. every audio chunk).
 */
UFUNCTION(BlueprintCallable, Category = "ElevenLabs")
void SendUserTurnStart();
/**
 * Signal that the user has finished speaking (Client turn mode).
 * No explicit API message is sent — simply stop sending user_activity.
 * The server detects silence and hands the turn to the agent.
 */
UFUNCTION(BlueprintCallable, Category = "ElevenLabs")
void SendUserTurnEnd();