Fix 3 WebSocket protocol bugs found by API cross-check
Bug 1 — Transcript handler: wrong type string + wrong JSON fields
- type was "transcript", API sends "user_transcript"
- event key was "transcript_event", API uses "user_transcription_event"
- field was "message", API uses "user_transcript"
- removed reads of the non-existent "speaker"/"is_final" fields; speaker is now always "user" and bIsFinal is always true
Bug 2 — Pong format: event_id must be top-level, not nested in pong_event
- Was: {"type":"pong","pong_event":{"event_id":1}}
- Fixed: {"type":"pong","event_id":1}
Bug 3 — Client turn mode: user_turn_start/end don't exist in the API
- SendUserTurnStart now sends {"type":"user_activity"} (correct API message)
- SendUserTurnEnd is now a no-op that only logs (the API has no explicit end-of-turn message)
- Renamed constants in ElevenLabsDefinitions.h accordingly
Also added UserMessage, AgentResponseCorrection and ConversationClientData constants.
Compiles cleanly on UE 5.5 Win64.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
dbd61615a9
commit
ae2c9b92e8
@ -109,18 +109,21 @@ void UElevenLabsWebSocketProxy::SendAudioChunk(const TArray<uint8>& PCMData)
|
|||||||
|
|
||||||
void UElevenLabsWebSocketProxy::SendUserTurnStart()
|
void UElevenLabsWebSocketProxy::SendUserTurnStart()
|
||||||
{
|
{
|
||||||
|
// In client turn mode, signal that the user is active/speaking.
|
||||||
|
// API message: { "type": "user_activity" }
|
||||||
if (!IsConnected()) return;
|
if (!IsConnected()) return;
|
||||||
TSharedPtr<FJsonObject> Msg = MakeShareable(new FJsonObject());
|
TSharedPtr<FJsonObject> Msg = MakeShareable(new FJsonObject());
|
||||||
Msg->SetStringField(TEXT("type"), ElevenLabsMessageType::UserTurnStart);
|
Msg->SetStringField(TEXT("type"), ElevenLabsMessageType::UserActivity);
|
||||||
SendJsonMessage(Msg);
|
SendJsonMessage(Msg);
|
||||||
}
|
}
|
||||||
|
|
||||||
void UElevenLabsWebSocketProxy::SendUserTurnEnd()
|
void UElevenLabsWebSocketProxy::SendUserTurnEnd()
|
||||||
{
|
{
|
||||||
if (!IsConnected()) return;
|
// In client turn mode, stopping user_activity signals end of user turn.
|
||||||
TSharedPtr<FJsonObject> Msg = MakeShareable(new FJsonObject());
|
// The API uses user_activity for ongoing speech; simply stop sending it.
|
||||||
Msg->SetStringField(TEXT("type"), ElevenLabsMessageType::UserTurnEnd);
|
// No explicit end message is required — silence is detected server-side.
|
||||||
SendJsonMessage(Msg);
|
// We still log for debug visibility.
|
||||||
|
UE_LOG(LogElevenLabsWS, Log, TEXT("User turn ended (client mode) — stopped sending user_activity."));
|
||||||
}
|
}
|
||||||
|
|
||||||
void UElevenLabsWebSocketProxy::SendInterrupt()
|
void UElevenLabsWebSocketProxy::SendInterrupt()
|
||||||
@ -189,7 +192,7 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message)
|
|||||||
{
|
{
|
||||||
HandleAudioResponse(Root);
|
HandleAudioResponse(Root);
|
||||||
}
|
}
|
||||||
else if (MsgType == ElevenLabsMessageType::Transcript)
|
else if (MsgType == ElevenLabsMessageType::UserTranscript)
|
||||||
{
|
{
|
||||||
HandleTranscript(Root);
|
HandleTranscript(Root);
|
||||||
}
|
}
|
||||||
@ -197,6 +200,11 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message)
|
|||||||
{
|
{
|
||||||
HandleAgentResponse(Root);
|
HandleAgentResponse(Root);
|
||||||
}
|
}
|
||||||
|
else if (MsgType == ElevenLabsMessageType::AgentResponseCorrection)
|
||||||
|
{
|
||||||
|
// Silently ignore for now — corrected text after interruption.
|
||||||
|
UE_LOG(LogElevenLabsWS, Verbose, TEXT("agent_response_correction received (ignored)."));
|
||||||
|
}
|
||||||
else if (MsgType == ElevenLabsMessageType::InterruptionEvent)
|
else if (MsgType == ElevenLabsMessageType::InterruptionEvent)
|
||||||
{
|
{
|
||||||
HandleInterruption(Root);
|
HandleInterruption(Root);
|
||||||
@ -273,22 +281,23 @@ void UElevenLabsWebSocketProxy::HandleAudioResponse(const TSharedPtr<FJsonObject
|
|||||||
|
|
||||||
void UElevenLabsWebSocketProxy::HandleTranscript(const TSharedPtr<FJsonObject>& Root)
|
void UElevenLabsWebSocketProxy::HandleTranscript(const TSharedPtr<FJsonObject>& Root)
|
||||||
{
|
{
|
||||||
// Expected structure:
|
// API structure:
|
||||||
// { "type": "transcript",
|
// { "type": "user_transcript",
|
||||||
// "transcript_event": { "speaker": "user"|"agent", "message": "...", "event_id": 1 }
|
// "user_transcription_event": { "user_transcript": "Hello" }
|
||||||
// }
|
// }
|
||||||
|
// This message only carries the user's speech-to-text — speaker is always "user".
|
||||||
const TSharedPtr<FJsonObject>* TranscriptEvent = nullptr;
|
const TSharedPtr<FJsonObject>* TranscriptEvent = nullptr;
|
||||||
if (!Root->TryGetObjectField(TEXT("transcript_event"), TranscriptEvent) || !TranscriptEvent)
|
if (!Root->TryGetObjectField(TEXT("user_transcription_event"), TranscriptEvent) || !TranscriptEvent)
|
||||||
{
|
{
|
||||||
|
UE_LOG(LogElevenLabsWS, Warning, TEXT("user_transcript message missing 'user_transcription_event' field."));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
FElevenLabsTranscriptSegment Segment;
|
FElevenLabsTranscriptSegment Segment;
|
||||||
(*TranscriptEvent)->TryGetStringField(TEXT("speaker"), Segment.Speaker);
|
Segment.Speaker = TEXT("user");
|
||||||
(*TranscriptEvent)->TryGetStringField(TEXT("message"), Segment.Text);
|
(*TranscriptEvent)->TryGetStringField(TEXT("user_transcript"), Segment.Text);
|
||||||
|
// user_transcript messages are always final (interim results are not sent for user speech)
|
||||||
// ElevenLabs marks final vs. interim via "is_final"
|
Segment.bIsFinal = true;
|
||||||
(*TranscriptEvent)->TryGetBoolField(TEXT("is_final"), Segment.bIsFinal);
|
|
||||||
|
|
||||||
OnTranscript.Broadcast(Segment);
|
OnTranscript.Broadcast(Segment);
|
||||||
}
|
}
|
||||||
@ -318,7 +327,8 @@ void UElevenLabsWebSocketProxy::HandleInterruption(const TSharedPtr<FJsonObject>
|
|||||||
void UElevenLabsWebSocketProxy::HandlePing(const TSharedPtr<FJsonObject>& Root)
|
void UElevenLabsWebSocketProxy::HandlePing(const TSharedPtr<FJsonObject>& Root)
|
||||||
{
|
{
|
||||||
// Reply with a pong to keep the connection alive.
|
// Reply with a pong to keep the connection alive.
|
||||||
// { "type": "ping", "ping_event": { "event_id": 1 } }
|
// Incoming: { "type": "ping", "ping_event": { "event_id": 1, "ping_ms": 150 } }
|
||||||
|
// Reply: { "type": "pong", "event_id": 1 } ← event_id is top-level, no wrapper object
|
||||||
int32 EventID = 0;
|
int32 EventID = 0;
|
||||||
const TSharedPtr<FJsonObject>* PingEvent = nullptr;
|
const TSharedPtr<FJsonObject>* PingEvent = nullptr;
|
||||||
if (Root->TryGetObjectField(TEXT("ping_event"), PingEvent) && PingEvent)
|
if (Root->TryGetObjectField(TEXT("ping_event"), PingEvent) && PingEvent)
|
||||||
@ -328,9 +338,7 @@ void UElevenLabsWebSocketProxy::HandlePing(const TSharedPtr<FJsonObject>& Root)
|
|||||||
|
|
||||||
TSharedPtr<FJsonObject> Pong = MakeShareable(new FJsonObject());
|
TSharedPtr<FJsonObject> Pong = MakeShareable(new FJsonObject());
|
||||||
Pong->SetStringField(TEXT("type"), TEXT("pong"));
|
Pong->SetStringField(TEXT("type"), TEXT("pong"));
|
||||||
TSharedPtr<FJsonObject> PongEvent = MakeShareable(new FJsonObject());
|
Pong->SetNumberField(TEXT("event_id"), EventID); // top-level, not nested
|
||||||
PongEvent->SetNumberField(TEXT("event_id"), EventID);
|
|
||||||
Pong->SetObjectField(TEXT("pong_event"), PongEvent);
|
|
||||||
SendJsonMessage(Pong);
|
SendJsonMessage(Pong);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -36,16 +36,21 @@ namespace ElevenLabsMessageType
|
|||||||
{
|
{
|
||||||
// Client → Server
|
// Client → Server
|
||||||
static const FString AudioChunk = TEXT("user_audio_chunk");
|
static const FString AudioChunk = TEXT("user_audio_chunk");
|
||||||
static const FString UserTurnStart = TEXT("user_turn_start");
|
// Client turn mode: signal user is currently active/speaking
|
||||||
static const FString UserTurnEnd = TEXT("user_turn_end");
|
static const FString UserActivity = TEXT("user_activity");
|
||||||
static const FString Interrupt = TEXT("interrupt");
|
// Client turn mode: send a text message without audio
|
||||||
static const FString ClientToolResult = TEXT("client_tool_result");
|
static const FString UserMessage = TEXT("user_message");
|
||||||
|
static const FString Interrupt = TEXT("interrupt");
|
||||||
|
static const FString ClientToolResult = TEXT("client_tool_result");
|
||||||
|
static const FString ConversationClientData = TEXT("conversation_initiation_client_data");
|
||||||
|
|
||||||
// Server → Client
|
// Server → Client
|
||||||
static const FString ConversationInitiation = TEXT("conversation_initiation_metadata");
|
static const FString ConversationInitiation = TEXT("conversation_initiation_metadata");
|
||||||
static const FString AudioResponse = TEXT("audio");
|
static const FString AudioResponse = TEXT("audio");
|
||||||
static const FString Transcript = TEXT("transcript");
|
// User speech-to-text transcript (speaker is always the user)
|
||||||
|
static const FString UserTranscript = TEXT("user_transcript");
|
||||||
static const FString AgentResponse = TEXT("agent_response");
|
static const FString AgentResponse = TEXT("agent_response");
|
||||||
|
static const FString AgentResponseCorrection= TEXT("agent_response_correction");
|
||||||
static const FString InterruptionEvent = TEXT("interruption");
|
static const FString InterruptionEvent = TEXT("interruption");
|
||||||
static const FString PingEvent = TEXT("ping");
|
static const FString PingEvent = TEXT("ping");
|
||||||
static const FString ClientToolCall = TEXT("client_tool_call");
|
static const FString ClientToolCall = TEXT("client_tool_call");
|
||||||
|
|||||||
@ -120,11 +120,19 @@ public:
|
|||||||
|
|
||||||
// ── Turn control (only relevant in Client turn mode) ──────────────────────
|
// ── Turn control (only relevant in Client turn mode) ──────────────────────
|
||||||
|
|
||||||
/** Signal that the user has started speaking (Client turn mode). */
|
/**
|
||||||
|
* Signal that the user is actively speaking (Client turn mode).
|
||||||
|
* Sends a { "type": "user_activity" } message to the server.
|
||||||
|
* Call this periodically while the user is speaking (e.g. every audio chunk).
|
||||||
|
*/
|
||||||
UFUNCTION(BlueprintCallable, Category = "ElevenLabs")
|
UFUNCTION(BlueprintCallable, Category = "ElevenLabs")
|
||||||
void SendUserTurnStart();
|
void SendUserTurnStart();
|
||||||
|
|
||||||
/** Signal that the user has finished speaking (Client turn mode). */
|
/**
|
||||||
|
* Signal that the user has finished speaking (Client turn mode).
|
||||||
|
* No explicit API message — simply stop sending user_activity.
|
||||||
|
* The server detects silence and hands the turn to the agent.
|
||||||
|
*/
|
||||||
UFUNCTION(BlueprintCallable, Category = "ElevenLabs")
|
UFUNCTION(BlueprintCallable, Category = "ElevenLabs")
|
||||||
void SendUserTurnEnd();
|
void SendUserTurnEnd();
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user