From 483456728dc469289cd84fb365408cb7803fb06a Mon Sep 17 00:00:00 2001 From: "j.foucher" Date: Thu, 19 Feb 2026 13:54:34 +0100 Subject: [PATCH] Fix: distinguish binary audio frames from binary JSON frames MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ElevenLabs sends two kinds of binary WebSocket frames: 1. JSON control messages (start with '{') — decode as UTF-8, route to OnWsMessage 2. Raw PCM audio (binary, does not start with '{') — broadcast directly as audio Previously all binary frames were decoded as UTF-8 JSON, causing "Failed to parse WebSocket message as JSON" for every audio frame. Fix: peek at the first byte of the assembled frame buffer: - '{' → UTF-8 JSON path (null-terminated, routed to existing message handler) - anything else → raw PCM path (broadcast directly to OnAudioReceived) Also: improved "Failed to parse JSON" log to show the first 80 chars of the message, and added verbose hex dump of binary audio frame prefix for diagnostics. Compiles cleanly on UE 5.5 Win64. 
Co-Authored-By: Claude Opus 4.6 --- .../Private/ElevenLabsWebSocketProxy.cpp | 52 +++++++++++++++---- 1 file changed, 42 insertions(+), 10 deletions(-) diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp index 356de64..6fa399c 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp @@ -186,7 +186,7 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message) TSharedRef> Reader = TJsonReaderFactory<>::Create(Message); if (!FJsonSerializer::Deserialize(Reader, Root) || !Root.IsValid()) { - UE_LOG(LogElevenLabsWS, Warning, TEXT("Failed to parse WebSocket message as JSON.")); + UE_LOG(LogElevenLabsWS, Warning, TEXT("Failed to parse WebSocket message as JSON (first 80 chars): %.80s"), *Message); return; } @@ -237,9 +237,7 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message) void UElevenLabsWebSocketProxy::OnWsBinaryMessage(const void* Data, SIZE_T Size, SIZE_T BytesRemaining) { - // ElevenLabs sends its JSON messages as binary WebSocket frames (not text frames). - // Accumulate fragments until BytesRemaining == 0, then parse the complete message. - + // Accumulate fragments until BytesRemaining == 0. 
const uint8* Bytes = static_cast(Data); BinaryFrameBuffer.Append(Bytes, Size); @@ -249,14 +247,48 @@ void UElevenLabsWebSocketProxy::OnWsBinaryMessage(const void* Data, SIZE_T Size, return; } - // Full message received — interpret as UTF-8 JSON - const FString JsonString = FString(UTF8_TO_TCHAR( - reinterpret_cast(BinaryFrameBuffer.GetData()))); + const int32 TotalSize = BinaryFrameBuffer.Num(); - BinaryFrameBuffer.Reset(); + // Peek at first byte to distinguish JSON (starts with '{') from raw binary audio. + const bool bLooksLikeJson = (TotalSize > 0 && BinaryFrameBuffer[0] == '{'); - // Route through the existing text message handler - OnWsMessage(JsonString); + if (bLooksLikeJson) + { + // Null-terminate safely then decode as UTF-8 JSON + BinaryFrameBuffer.Add(0); + const FString JsonString = FString(UTF8_TO_TCHAR( + reinterpret_cast(BinaryFrameBuffer.GetData()))); + BinaryFrameBuffer.Reset(); + + const UElevenLabsSettings* Settings = FPS_AI_Agent_ElevenLabsModule::Get().GetSettings(); + if (Settings->bVerboseLogging) + { + UE_LOG(LogElevenLabsWS, Verbose, TEXT("Binary JSON frame (%d bytes): %.120s"), TotalSize, *JsonString); + } + + OnWsMessage(JsonString); + } + else + { + // Raw binary audio frame — PCM bytes sent directly without Base64/JSON wrapper. + // Log first few bytes as hex to help diagnose the format. + const UElevenLabsSettings* Settings = FPS_AI_Agent_ElevenLabsModule::Get().GetSettings(); + if (Settings->bVerboseLogging) + { + FString HexPreview; + const int32 PreviewBytes = FMath::Min(TotalSize, 8); + for (int32 i = 0; i < PreviewBytes; i++) + { + HexPreview += FString::Printf(TEXT("%02X "), BinaryFrameBuffer[i]); + } + UE_LOG(LogElevenLabsWS, Verbose, TEXT("Binary audio frame: %d bytes | first bytes: %s"), TotalSize, *HexPreview); + } + + // Broadcast raw PCM bytes directly to the audio queue. 
+ TArray PCMData = MoveTemp(BinaryFrameBuffer); + BinaryFrameBuffer.Reset(); + OnAudioReceived.Broadcast(PCMData); + } } // ─────────────────────────────────────────────────────────────────────────────