Fix: distinguish binary audio frames from binary JSON frames

ElevenLabs sends two kinds of binary WebSocket frames:
  1. JSON control messages (start with '{') — decode as UTF-8, route to OnWsMessage
  2. Raw PCM audio (binary, does not start with '{') — broadcast directly as audio

Previously, all binary frames were decoded as UTF-8 JSON, causing a
"Failed to parse WebSocket message as JSON" warning for every audio frame.

Fix: peek at the first byte of the assembled frame buffer:
  - '{' → UTF-8 JSON path (null-terminated, routed to existing message handler)
  - anything else → raw PCM path (broadcast directly to OnAudioReceived)

Also: improved "Failed to parse JSON" log to show first 80 chars of message,
and added verbose hex dump of binary audio frame prefix for diagnostics.
Compiles cleanly on UE 5.5 Win64.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
j.foucher 2026-02-19 13:54:34 +01:00
parent 669c503d06
commit 483456728d

View File

@ -186,7 +186,7 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message)
TSharedRef<TJsonReader<>> Reader = TJsonReaderFactory<>::Create(Message);
if (!FJsonSerializer::Deserialize(Reader, Root) || !Root.IsValid())
{
UE_LOG(LogElevenLabsWS, Warning, TEXT("Failed to parse WebSocket message as JSON."));
UE_LOG(LogElevenLabsWS, Warning, TEXT("Failed to parse WebSocket message as JSON (first 80 chars): %.80s"), *Message);
return;
}
@ -237,9 +237,7 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message)
void UElevenLabsWebSocketProxy::OnWsBinaryMessage(const void* Data, SIZE_T Size, SIZE_T BytesRemaining)
{
// ElevenLabs sends its JSON messages as binary WebSocket frames (not text frames).
// Accumulate fragments until BytesRemaining == 0, then parse the complete message.
// Accumulate fragments until BytesRemaining == 0.
const uint8* Bytes = static_cast<const uint8*>(Data);
BinaryFrameBuffer.Append(Bytes, Size);
@ -249,14 +247,48 @@ void UElevenLabsWebSocketProxy::OnWsBinaryMessage(const void* Data, SIZE_T Size,
return;
}
// Full message received — interpret as UTF-8 JSON
const FString JsonString = FString(UTF8_TO_TCHAR(
reinterpret_cast<const char*>(BinaryFrameBuffer.GetData())));
const int32 TotalSize = BinaryFrameBuffer.Num();
BinaryFrameBuffer.Reset();
// Peek at first byte to distinguish JSON (starts with '{') from raw binary audio.
const bool bLooksLikeJson = (TotalSize > 0 && BinaryFrameBuffer[0] == '{');
// Route through the existing text message handler
OnWsMessage(JsonString);
if (bLooksLikeJson)
{
// Null-terminate safely then decode as UTF-8 JSON
BinaryFrameBuffer.Add(0);
const FString JsonString = FString(UTF8_TO_TCHAR(
reinterpret_cast<const char*>(BinaryFrameBuffer.GetData())));
BinaryFrameBuffer.Reset();
const UElevenLabsSettings* Settings = FPS_AI_Agent_ElevenLabsModule::Get().GetSettings();
if (Settings->bVerboseLogging)
{
UE_LOG(LogElevenLabsWS, Verbose, TEXT("Binary JSON frame (%d bytes): %.120s"), TotalSize, *JsonString);
}
OnWsMessage(JsonString);
}
else
{
// Raw binary audio frame — PCM bytes sent directly without Base64/JSON wrapper.
// Log first few bytes as hex to help diagnose the format.
const UElevenLabsSettings* Settings = FPS_AI_Agent_ElevenLabsModule::Get().GetSettings();
if (Settings->bVerboseLogging)
{
FString HexPreview;
const int32 PreviewBytes = FMath::Min(TotalSize, 8);
for (int32 i = 0; i < PreviewBytes; i++)
{
HexPreview += FString::Printf(TEXT("%02X "), BinaryFrameBuffer[i]);
}
UE_LOG(LogElevenLabsWS, Verbose, TEXT("Binary audio frame: %d bytes | first bytes: %s"), TotalSize, *HexPreview);
}
// Broadcast raw PCM bytes directly to the audio queue.
TArray<uint8> PCMData = MoveTemp(BinaryFrameBuffer);
BinaryFrameBuffer.Reset();
OnAudioReceived.Broadcast(PCMData);
}
}
// ─────────────────────────────────────────────────────────────────────────────