Compare commits

..

5 Commits

Author SHA1 Message Date
4ee09f4e58 actor 2026-02-22 09:31:13 +01:00
a26361a7b2 Remove Units meta from MicChunkDurationMs to fix slider display
UE was converting the raw ms value to seconds in the Details panel,
showing "0.1 s" instead of "100". Removing Units="ms" lets the slider
display the integer value directly.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-22 09:30:50 +01:00
cb78e3249c Add detailed English tooltips to all ElevenLabs UPROPERTY parameters
All configuration and event properties in ConversationalAgentComponent and
MicrophoneCaptureComponent now have explicit ToolTip meta for clear descriptions
in the Unreal Editor Details panel.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-22 09:18:50 +01:00
20a6e30377 v1.8.0: Server VAD interruption, partial response fix, configurable chunk size
- Server VAD + interruption: mic stays open while agent speaks, server
  detects user voice and triggers interruption automatically. Echo
  suppression disabled in this mode so audio reaches the server.
- Fix agent_chat_response_part parsing: ElevenLabs API now uses
  text_response_part.text instead of agent_chat_response_part_event.
  Added fallback for legacy format.
- Expose MicChunkDurationMs as UPROPERTY (20-500ms, default 100ms)
  instead of compile-time constant.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-22 09:13:02 +01:00
152fc6196d v1.7.1: Fix mic not heard + latency optimizations + thread safety
Fix regression from v1.7.0 where agent couldn't hear user speech:
- Restore AsyncTask game-thread dispatch for delegate broadcast (AddUObject
  weak pointer checks are not thread-safe from WASAPI thread)
- Keep early echo suppression in WASAPI callback (before resampling)
- Keep MicChunkMinBytes at 3200 (100ms) for lower latency
- Add thread safety: std::atomic<bool> for bIsListening/bAgentSpeaking/bCapturing,
  FCriticalSection for MicSendLock and WebSocketSendLock
- Add EchoSuppressFlag pointer from agent to mic component

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-22 08:46:15 +01:00
7 changed files with 213 additions and 150 deletions

View File

@ -247,6 +247,19 @@ void UElevenLabsConversationalAgentComponent::StartListening()
Mic->OnAudioCaptured.RemoveAll(this); Mic->OnAudioCaptured.RemoveAll(this);
Mic->OnAudioCaptured.AddUObject(this, Mic->OnAudioCaptured.AddUObject(this,
&UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured); &UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured);
// Echo suppression: point the mic at our atomic bAgentSpeaking flag so it skips
// capture entirely (before resampling) while the agent is speaking.
// In Server VAD + interruption mode, disable echo suppression so the server
// receives the user's voice even during agent playback — the server's own VAD
// handles echo filtering and interruption detection.
if (TurnMode == EElevenLabsTurnMode::Server && bAllowInterruption)
{
Mic->EchoSuppressFlag = nullptr;
}
else
{
Mic->EchoSuppressFlag = &bAgentSpeaking;
}
Mic->StartCapture(); Mic->StartCapture();
const double T = TurnStartTime - SessionStartTime; const double T = TurnStartTime - SessionStartTime;
@ -275,6 +288,8 @@ void UElevenLabsConversationalAgentComponent::StopListening()
// "user speaking" state and stall waiting for more audio that never arrives, // "user speaking" state and stall waiting for more audio that never arrives,
// leaving both sides stuck — no audio for the collision response and no response // leaving both sides stuck — no audio for the collision response and no response
// for subsequent turns. // for subsequent turns.
{
FScopeLock Lock(&MicSendLock);
if (bAgentGenerating) if (bAgentGenerating)
{ {
if (MicAccumulationBuffer.Num() > 0) if (MicAccumulationBuffer.Num() > 0)
@ -289,6 +304,7 @@ void UElevenLabsConversationalAgentComponent::StopListening()
WebSocketProxy->SendAudioChunk(MicAccumulationBuffer); WebSocketProxy->SendAudioChunk(MicAccumulationBuffer);
} }
MicAccumulationBuffer.Reset(); MicAccumulationBuffer.Reset();
}
if (WebSocketProxy && TurnMode == EElevenLabsTurnMode::Client) if (WebSocketProxy && TurnMode == EElevenLabsTurnMode::Client)
{ {
@ -394,7 +410,10 @@ void UElevenLabsConversationalAgentComponent::HandleDisconnected(int32 StatusCod
GeneratingTickCount = 0; GeneratingTickCount = 0;
TurnIndex = 0; TurnIndex = 0;
LastClosedTurnIndex = 0; LastClosedTurnIndex = 0;
{
FScopeLock Lock(&MicSendLock);
MicAccumulationBuffer.Reset(); MicAccumulationBuffer.Reset();
}
OnAgentDisconnected.Broadcast(StatusCode, Reason); OnAgentDisconnected.Broadcast(StatusCode, Reason);
} }
@ -450,24 +469,28 @@ void UElevenLabsConversationalAgentComponent::HandleAgentResponseStarted()
const double T = Now - SessionStartTime; const double T = Now - SessionStartTime;
const double LatencyFromTurnEnd = TurnEndTime > 0.0 ? Now - TurnEndTime : 0.0; const double LatencyFromTurnEnd = TurnEndTime > 0.0 ? Now - TurnEndTime : 0.0;
if (bIsListening) if (bIsListening)
{
// In Server VAD + interruption mode, keep the mic open so the server can
// detect if the user speaks over the agent and send an interruption event.
// The server handles echo filtering and VAD — we just keep streaming audio.
if (TurnMode == EElevenLabsTurnMode::Server && bAllowInterruption)
{
UE_LOG(LogElevenLabsAgent, Log,
TEXT("[T+%.2fs] [Turn %d] Agent generating — mic stays open (Server VAD + interruption). (%.2fs after turn end)"),
T, LastClosedTurnIndex, LatencyFromTurnEnd);
}
else
{ {
// Collision: server started generating Turn N's response while Turn M (M>N) mic was open. // Collision: server started generating Turn N's response while Turn M (M>N) mic was open.
// The server's VAD detected a pause in the user's speech and started generating
// prematurely — the user hasn't finished speaking yet.
//
// Stop the mic WITHOUT flushing the accumulated audio buffer (see StopListening's // Stop the mic WITHOUT flushing the accumulated audio buffer (see StopListening's
// bAgentGenerating guard). Flushing would send audio to a server that is mid-generation, // bAgentGenerating guard). Flushing would send audio to a server that is mid-generation,
// causing it to re-enter "user speaking" state and stall — both sides stuck. // causing it to re-enter "user speaking" state and stall — both sides stuck.
//
// Do NOT send an interrupt here — just let the server's response play out:
// - If audio arrives → EnqueueAgentAudio sets bAgentSpeaking, response plays normally.
// - If audio never arrives → generating timeout (10s) clears bAgentGenerating.
// Either way the state machine recovers and Blueprint can reopen the mic.
UE_LOG(LogElevenLabsAgent, Log, UE_LOG(LogElevenLabsAgent, Log,
TEXT("[T+%.2fs] [Turn %d → Turn %d collision] Agent generating Turn %d response — mic (Turn %d) was open, stopping. (%.2fs after turn end)"), TEXT("[T+%.2fs] [Turn %d → Turn %d collision] Agent generating Turn %d response — mic (Turn %d) was open, stopping. (%.2fs after turn end)"),
T, LastClosedTurnIndex, TurnIndex, LastClosedTurnIndex, TurnIndex, LatencyFromTurnEnd); T, LastClosedTurnIndex, TurnIndex, LastClosedTurnIndex, TurnIndex, LatencyFromTurnEnd);
StopListening(); StopListening();
} }
}
UE_LOG(LogElevenLabsAgent, Log, UE_LOG(LogElevenLabsAgent, Log,
TEXT("[T+%.2fs] [Turn %d] Agent generating. (%.2fs after turn end)"), TEXT("[T+%.2fs] [Turn %d] Agent generating. (%.2fs after turn end)"),
@ -606,18 +629,26 @@ void UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured(const TAr
// Echo suppression: skip sending mic audio while the agent is speaking. // Echo suppression: skip sending mic audio while the agent is speaking.
// This prevents the agent from hearing its own voice through the speakers, // This prevents the agent from hearing its own voice through the speakers,
// which would confuse the server's VAD and STT. Matches the approach used // which would confuse the server's VAD and STT.
// in the official ElevenLabs C++ SDK (outputPlaying_ flag). // In Server VAD + interruption mode, keep sending audio so the server can
if (bAgentSpeaking) return; // detect the user speaking over the agent and trigger an interruption.
if (bAgentSpeaking && !(TurnMode == EElevenLabsTurnMode::Server && bAllowInterruption))
{
return;
}
// Convert this callback's samples to int16 bytes and accumulate. // Convert this callback's samples to int16 bytes and accumulate.
// WASAPI fires every ~5ms (158 bytes at 16kHz). ElevenLabs needs ≥250ms // WASAPI fires every ~5ms (158 bytes at 16kHz). ElevenLabs needs ≥100ms
// (8000 bytes) per chunk for reliable VAD and STT. We hold bytes here // (3200 bytes) per chunk for reliable VAD and STT. We hold bytes here
// until we have enough, then send the whole batch in one WebSocket frame. // until we have enough, then send the whole batch in one WebSocket frame.
TArray<uint8> PCMBytes = FloatPCMToInt16Bytes(FloatPCM); TArray<uint8> PCMBytes = FloatPCMToInt16Bytes(FloatPCM);
// Lock: MicAccumulationBuffer is accessed from WASAPI thread (here) and
// game thread (StopListening flush). WebSocket send is also serialized.
FScopeLock Lock(&MicSendLock);
MicAccumulationBuffer.Append(PCMBytes); MicAccumulationBuffer.Append(PCMBytes);
if (MicAccumulationBuffer.Num() >= MicChunkMinBytes) if (MicAccumulationBuffer.Num() >= GetMicChunkMinBytes())
{ {
WebSocketProxy->SendAudioChunk(MicAccumulationBuffer); WebSocketProxy->SendAudioChunk(MicAccumulationBuffer);
MicAccumulationBuffer.Reset(); MicAccumulationBuffer.Reset();

View File

@ -89,6 +89,12 @@ void UElevenLabsMicrophoneCaptureComponent::OnAudioGenerate(
UE_LOG(LogElevenLabsMic, Verbose, TEXT("Audio capture buffer overflow.")); UE_LOG(LogElevenLabsMic, Verbose, TEXT("Audio capture buffer overflow."));
} }
// Echo suppression: skip resampling + broadcasting entirely when agent is speaking.
if (EchoSuppressFlag && EchoSuppressFlag->load(std::memory_order_relaxed))
{
return;
}
// Device sends float32 interleaved samples; cast from the void* API. // Device sends float32 interleaved samples; cast from the void* API.
const float* FloatAudio = static_cast<const float*>(InAudio); const float* FloatAudio = static_cast<const float*>(InAudio);
@ -104,16 +110,21 @@ void UElevenLabsMicrophoneCaptureComponent::OnAudioGenerate(
} }
} }
// Fire the delegate on the game thread so subscribers don't need to be // Dispatch to game thread for delegate broadcast.
// thread-safe (WebSocket Send is not thread-safe in UE's implementation). // UE's FMulticastDelegate with AddUObject uses weak object pointer checks that
AsyncTask(ENamedThreads::GameThread, [this, Data = MoveTemp(Resampled)]() // are not thread-safe — broadcasting from the WASAPI thread causes the invocation
// to be silently skipped. The game thread dispatch adds ~8ms latency but is required.
if (bCapturing)
{
AsyncTask(ENamedThreads::GameThread, [this, Captured = MoveTemp(Resampled)]()
{ {
if (bCapturing) if (bCapturing)
{ {
OnAudioCaptured.Broadcast(Data); OnAudioCaptured.Broadcast(Captured);
} }
}); });
} }
}
// ───────────────────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────────────────
// Resampling // Resampling

View File

@ -120,11 +120,14 @@ void UElevenLabsWebSocketProxy::SendAudioChunk(const TArray<uint8>& PCMData)
// Per-chunk log at Verbose only — Log level is too spammy (10+ lines per second). // Per-chunk log at Verbose only — Log level is too spammy (10+ lines per second).
UE_LOG(LogElevenLabsWS, Verbose, TEXT("SendAudioChunk: %d bytes"), PCMData.Num()); UE_LOG(LogElevenLabsWS, Verbose, TEXT("SendAudioChunk: %d bytes"), PCMData.Num());
{
FScopeLock Lock(&WebSocketSendLock);
if (WebSocket.IsValid() && WebSocket->IsConnected()) if (WebSocket.IsValid() && WebSocket->IsConnected())
{ {
WebSocket->Send(AudioJson); WebSocket->Send(AudioJson);
} }
} }
}
void UElevenLabsWebSocketProxy::SendUserTurnStart() void UElevenLabsWebSocketProxy::SendUserTurnStart()
{ {
@ -585,20 +588,41 @@ void UElevenLabsWebSocketProxy::HandleAgentChatResponsePart(const TSharedPtr<FJs
} }
// Extract the streaming text fragment and broadcast it. // Extract the streaming text fragment and broadcast it.
// API structure: // Current API structure (2026):
// { "type": "agent_chat_response_part",
// "text_response_part": { "text": "partial text", "type": "part"|"stop", "event_id": N }
// }
// Legacy structure (pre-2026):
// { "type": "agent_chat_response_part", // { "type": "agent_chat_response_part",
// "agent_chat_response_part_event": { "agent_response_part": "partial text" } // "agent_chat_response_part_event": { "agent_response_part": "partial text" }
// } // }
FString PartText;
bool bFound = false;
// Try current format: text_response_part.text
const TSharedPtr<FJsonObject>* TextPart = nullptr;
if (Root->TryGetObjectField(TEXT("text_response_part"), TextPart) && TextPart)
{
(*TextPart)->TryGetStringField(TEXT("text"), PartText);
bFound = true;
}
// Fallback: legacy format agent_chat_response_part_event.agent_response_part
if (!bFound)
{
const TSharedPtr<FJsonObject>* PartEvent = nullptr; const TSharedPtr<FJsonObject>* PartEvent = nullptr;
if (Root->TryGetObjectField(TEXT("agent_chat_response_part_event"), PartEvent) && PartEvent) if (Root->TryGetObjectField(TEXT("agent_chat_response_part_event"), PartEvent) && PartEvent)
{ {
FString PartText; (*PartEvent)->TryGetStringField(TEXT("agent_response_part"), PartText);
if ((*PartEvent)->TryGetStringField(TEXT("agent_response_part"), PartText) && !PartText.IsEmpty()) bFound = true;
}
}
if (bFound && !PartText.IsEmpty())
{ {
OnAgentResponsePart.Broadcast(PartText); OnAgentResponsePart.Broadcast(PartText);
} }
} }
}
void UElevenLabsWebSocketProxy::HandleInterruption(const TSharedPtr<FJsonObject>& Root) void UElevenLabsWebSocketProxy::HandleInterruption(const TSharedPtr<FJsonObject>& Root)
{ {
@ -658,8 +682,11 @@ void UElevenLabsWebSocketProxy::SendJsonMessage(const TSharedPtr<FJsonObject>& J
UE_LOG(LogElevenLabsWS, Verbose, TEXT("<< %s"), *Out); UE_LOG(LogElevenLabsWS, Verbose, TEXT("<< %s"), *Out);
} }
{
FScopeLock Lock(&WebSocketSendLock);
WebSocket->Send(Out); WebSocket->Send(Out);
} }
}
FString UElevenLabsWebSocketProxy::BuildWebSocketURL(const FString& AgentIDOverride, const FString& APIKeyOverride) const FString UElevenLabsWebSocketProxy::BuildWebSocketURL(const FString& AgentIDOverride, const FString& APIKeyOverride) const
{ {

View File

@ -7,6 +7,7 @@
#include "ElevenLabsDefinitions.h" #include "ElevenLabsDefinitions.h"
#include "ElevenLabsWebSocketProxy.h" #include "ElevenLabsWebSocketProxy.h"
#include "Sound/SoundWaveProcedural.h" #include "Sound/SoundWaveProcedural.h"
#include <atomic>
#include "ElevenLabsConversationalAgentComponent.generated.h" #include "ElevenLabsConversationalAgentComponent.generated.h"
class UAudioComponent; class UAudioComponent;
@ -85,139 +86,113 @@ public:
// ── Configuration ───────────────────────────────────────────────────────── // ── Configuration ─────────────────────────────────────────────────────────
/** /** ElevenLabs Agent ID used for this conversation. Leave empty to use the default from Project Settings > ElevenLabs. */
* ElevenLabs Agent ID. Overrides the project-level default in Project Settings. UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs",
* Leave empty to use the project default. meta = (ToolTip = "ElevenLabs Agent ID. Leave empty to use the project default from Project Settings."))
*/
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs")
FString AgentID; FString AgentID;
/** /** How turn-taking is managed between the user and the agent.\n- Server VAD (recommended): ElevenLabs automatically detects when the user stops speaking.\n- Client Controlled: You manually call StartListening/StopListening (push-to-talk with a key). */
* Turn mode: UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs",
* - Server VAD: ElevenLabs detects end-of-speech automatically (recommended). meta = (ToolTip = "Turn-taking mode.\n- Server VAD: ElevenLabs detects end-of-speech automatically (hands-free).\n- Client Controlled: You call StartListening/StopListening manually (push-to-talk)."))
* - Client Controlled: you call StartListening/StopListening manually (push-to-talk).
*/
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs")
EElevenLabsTurnMode TurnMode = EElevenLabsTurnMode::Server; EElevenLabsTurnMode TurnMode = EElevenLabsTurnMode::Server;
/** /** Automatically open the microphone as soon as the WebSocket connection is established. Only applies in Server VAD mode. In Client (push-to-talk) mode, you must call StartListening manually. */
* Automatically start listening (microphone capture) once the WebSocket is UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs",
* connected and the conversation is initiated. meta = (ToolTip = "Auto-open the microphone when the conversation starts.\nOnly applies in Server VAD mode. In push-to-talk mode, call StartListening() manually."))
*/
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs")
bool bAutoStartListening = true; bool bAutoStartListening = true;
/** /** Let the LLM start generating a response during silence, before the VAD is fully confident the user has finished speaking. Saves 200-500ms of latency but may be unstable in long multi-turn sessions. Disabled by default. */
* Enable speculative turn: the LLM starts generating a response during UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency",
* silence before the VAD is fully confident the user has finished speaking. meta = (ToolTip = "Speculative turn: the LLM begins generating during silence before full turn-end confidence.\nReduces latency by 200-500ms. May be unstable in long sessions — test before enabling in production."))
* Reduces latency by 200-500ms but caused the server to silently stop
* processing user audio after 2 turns when combined with a short turn_timeout.
* Disabled by default until ElevenLabs confirms stability in multi-turn sessions.
*/
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency")
bool bSpeculativeTurn = false; bool bSpeculativeTurn = false;
/** /** How many milliseconds of microphone audio to accumulate before sending a chunk to ElevenLabs. Lower values reduce latency but may degrade voice detection accuracy. Higher values are more reliable but add delay. */
* Allow the user to interrupt the agent while it is playing audio (speaking). UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency",
* When true, calling StartListening() while the agent is audibly speaking automatically meta = (ClampMin = "20", ClampMax = "500",
* sends an interruption signal to the server and opens the mic — no Blueprint nodes needed. ToolTip = "Mic audio chunk duration sent to ElevenLabs.\n- 50-80ms: lower latency, less reliable voice detection.\n- 100ms (default): good balance.\n- 150-250ms: more reliable, higher latency."))
* When false, StartListening() is silently ignored until the agent finishes speaking. int32 MicChunkDurationMs = 100;
*
* NOTE: interruption only applies during the audio-playback phase (bAgentSpeaking). /** Allow the user to interrupt the agent while it is speaking.\n- In Server VAD mode: the microphone stays open during agent speech and the server detects interruptions automatically.\n- In Client (push-to-talk) mode: pressing the talk key while the agent speaks sends an interrupt signal.\n- When disabled: the user must wait for the agent to finish speaking before talking. */
* While the agent is generating but has not yet started speaking, StartListening() is UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs",
* always blocked regardless of this flag — this prevents Blueprint's OnAgentStartedGenerating meta = (ToolTip = "Allow the user to interrupt the agent while it speaks.\n- Server VAD: mic stays open, server detects user voice automatically.\n- Push-to-talk: pressing the talk key interrupts the agent.\n- Disabled: user must wait for the agent to finish."))
* handler (which often calls StartListening for bookkeeping) from accidentally cancelling
* the response before any audio plays.
*/
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs")
bool bAllowInterruption = true; bool bAllowInterruption = true;
/** /** Enable the OnAgentTranscript event, which provides real-time speech-to-text of what the user is saying. Disable if you don't need to display user speech to reduce processing overhead. */
* Forward user speech transcripts (user_transcript events) to the UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events",
* OnAgentTranscript delegate. Disable to reduce overhead if you don't meta = (ToolTip = "Fire OnAgentTranscript with real-time speech-to-text of user speech.\nDisable if you don't need to display what the user said."))
* need to display what the user said.
*/
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events")
bool bEnableUserTranscript = true; bool bEnableUserTranscript = true;
/** /** Enable the OnAgentTextResponse event, which provides the agent's complete text response once fully generated. Disable if you only need the audio output. */
* Forward agent text responses (agent_response events) to the UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events",
* OnAgentTextResponse delegate. Disable if you only need audio output. meta = (ToolTip = "Fire OnAgentTextResponse with the agent's complete text once fully generated.\nDisable if you only need the audio output."))
*/
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events")
bool bEnableAgentTextResponse = true; bool bEnableAgentTextResponse = true;
/** /** Enable the OnAgentPartialResponse event, which streams the agent's text word-by-word as the LLM generates it. Use this for real-time subtitles that appear while the agent speaks, rather than waiting for the full response. */
* Forward streaming text parts (agent_chat_response_part events) to the UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events",
* OnAgentPartialResponse delegate. Each part is a text fragment as the LLM meta = (ToolTip = "Fire OnAgentPartialResponse with streaming text fragments as the LLM generates them.\nIdeal for real-time subtitles. Each event gives one text chunk, not the accumulated text."))
* generates it — use this for real-time subtitles that appear while the agent
* speaks, instead of waiting for the full text (OnAgentTextResponse).
*/
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events")
bool bEnableAgentPartialResponse = false; bool bEnableAgentPartialResponse = false;
/** /** Safety timeout: if the server does not start generating a response within this many seconds after the user stops speaking, fire OnAgentResponseTimeout. Set to 0 to disable. A normal response starts within 0.1-0.8s. */
* How many seconds to wait for the server to start generating a response UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs",
* after the user stops speaking (StopListening) before firing OnAgentResponseTimeout. meta = (ClampMin = "0.0",
* Set to 0 to disable. Default: 10 seconds. ToolTip = "Seconds to wait for a server response after the user stops speaking.\nFires OnAgentResponseTimeout if exceeded. Normal latency is 0.1-0.8s.\nSet to 0 to disable. Default: 10s."))
*
* A typical healthy round-trip is 0.1–0.8s to first agent_chat_response_part.
* Values above 10s are extremely unusual and almost always indicate a server issue.
*/
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs", meta = (ClampMin = "0.0"))
float ResponseTimeoutSeconds = 10.0f; float ResponseTimeoutSeconds = 10.0f;
// ── Events ──────────────────────────────────────────────────────────────── // ── Events ────────────────────────────────────────────────────────────────
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events") /** Fired when the WebSocket connection is established and the conversation session is ready. Provides the ConversationID and AgentID. */
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
meta = (ToolTip = "Fires when the connection to ElevenLabs is established and the conversation is ready to begin."))
FOnAgentConnected OnAgentConnected; FOnAgentConnected OnAgentConnected;
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events") /** Fired when the WebSocket connection is closed (gracefully or due to an error). Provides the status code and reason. */
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
meta = (ToolTip = "Fires when the connection to ElevenLabs is closed. Check StatusCode and Reason for details."))
FOnAgentDisconnected OnAgentDisconnected; FOnAgentDisconnected OnAgentDisconnected;
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events") /** Fired on any connection or protocol error. The error message describes what went wrong. */
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
meta = (ToolTip = "Fires on connection or protocol errors. The ErrorMessage describes the issue."))
FOnAgentError OnAgentError; FOnAgentError OnAgentError;
/** Fired for every transcript segment (user speech or agent speech, tentative and final). */ /** Fired with real-time speech-to-text of the user's voice. Includes both tentative (in-progress) and final transcripts. Requires bEnableUserTranscript to be true. */
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events") UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
meta = (ToolTip = "Real-time speech-to-text of the user's voice.\nIncludes tentative and final transcripts. Enable with bEnableUserTranscript."))
FOnAgentTranscript OnAgentTranscript; FOnAgentTranscript OnAgentTranscript;
/** Final text response produced by the agent (mirrors the audio). */ /** Fired once when the agent's complete text response is available. This is the full text that corresponds to the audio the agent speaks. Requires bEnableAgentTextResponse to be true. */
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events") UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
meta = (ToolTip = "The agent's complete text response (matches the spoken audio).\nFires once when the full text is ready. Enable with bEnableAgentTextResponse."))
FOnAgentTextResponse OnAgentTextResponse; FOnAgentTextResponse OnAgentTextResponse;
/** /** Fired repeatedly as the LLM generates text, providing one word/fragment at a time. Use for real-time subtitles. Each call gives a new fragment, NOT the accumulated text. Requires bEnableAgentPartialResponse to be true. */
* Streaming text fragments as the LLM generates them. UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
* Fires for every agent_chat_response_part — each call gives one text chunk.
* Enable with bEnableAgentPartialResponse.
*/
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
FOnAgentPartialResponse OnAgentPartialResponse; FOnAgentPartialResponse OnAgentPartialResponse;
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events") /** Fired when the agent begins playing audio (first audio chunk received). Use this to trigger speech animations or UI indicators. */
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
meta = (ToolTip = "Fires when the agent starts speaking (first audio chunk). Use for lip-sync or UI feedback."))
FOnAgentStartedSpeaking OnAgentStartedSpeaking; FOnAgentStartedSpeaking OnAgentStartedSpeaking;
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events") /** Fired when the agent finishes playing all audio. Use this to re-open the microphone (in Server VAD mode without interruption) or update UI. */
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
meta = (ToolTip = "Fires when the agent finishes speaking. Use to re-open the mic or update UI."))
FOnAgentStoppedSpeaking OnAgentStoppedSpeaking; FOnAgentStoppedSpeaking OnAgentStoppedSpeaking;
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events") /** Fired when the agent's speech is interrupted (either by the user speaking over it, or by a manual InterruptAgent call). The audio playback is automatically stopped. */
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
meta = (ToolTip = "Fires when the agent is interrupted mid-speech. Audio is automatically stopped."))
FOnAgentInterrupted OnAgentInterrupted; FOnAgentInterrupted OnAgentInterrupted;
/** /** Fired when the server starts generating a response (before any audio arrives). Use this for "thinking..." UI feedback. In push-to-talk mode, the microphone is automatically closed when this fires. */
* Fired when the server starts generating a response (before audio). UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
* The component automatically stops the microphone when this fires while listening, meta = (ToolTip = "Fires when the server starts generating (before audio arrives).\nUse for 'thinking...' UI. Mic is auto-closed in push-to-talk mode."))
* so the Blueprint doesn't need to handle this manually for push-to-talk.
* Bind here if you need UI feedback ("agent is thinking...").
*/
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
FOnAgentStartedGenerating OnAgentStartedGenerating; FOnAgentStartedGenerating OnAgentStartedGenerating;
/** /** Fired if the server does not start generating a response within ResponseTimeoutSeconds after the user stops speaking. Use this to show a "try again" message or automatically re-open the microphone. */
* Fired when the server has not started generating within ResponseTimeoutSeconds UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
* after StopListening was called. Bind here to give the user feedback such as meta = (ToolTip = "Fires if the server doesn't respond within ResponseTimeoutSeconds.\nUse to show 'try again' or re-open the mic automatically."))
* "I didn't get a response, please try again" or to automatically re-open the mic.
*/
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
FOnAgentResponseTimeout OnAgentResponseTimeout; FOnAgentResponseTimeout OnAgentResponseTimeout;
// ── Control ─────────────────────────────────────────────────────────────── // ── Control ───────────────────────────────────────────────────────────────
@ -337,8 +312,9 @@ private:
USoundWaveProcedural* ProceduralSoundWave = nullptr; USoundWaveProcedural* ProceduralSoundWave = nullptr;
// ── State ───────────────────────────────────────────────────────────────── // ── State ─────────────────────────────────────────────────────────────────
bool bIsListening = false; // Atomic: read from WASAPI background thread (OnMicrophoneDataCaptured), written from game thread.
bool bAgentSpeaking = false; std::atomic<bool> bIsListening{false};
std::atomic<bool> bAgentSpeaking{false};
// True from the first agent_chat_response_part until the first audio chunk arrives. // True from the first agent_chat_response_part until the first audio chunk arrives.
// Used to block StartListening() while the server is processing the previous turn. // Used to block StartListening() while the server is processing the previous turn.
bool bAgentGenerating = false; bool bAgentGenerating = false;
@ -399,6 +375,12 @@ private:
// WASAPI fires callbacks every ~5ms (158 bytes at 16kHz 16-bit mono). // WASAPI fires callbacks every ~5ms (158 bytes at 16kHz 16-bit mono).
// ElevenLabs needs at least ~100ms (3200 bytes) per chunk for reliable VAD/STT. // ElevenLabs needs at least ~100ms (3200 bytes) per chunk for reliable VAD/STT.
// We accumulate here and only call SendAudioChunk once enough bytes are ready. // We accumulate here and only call SendAudioChunk once enough bytes are ready.
// MicSendLock protects MicAccumulationBuffer + WebSocket send (accessed from WASAPI thread
// in OnMicrophoneDataCaptured and from game thread in StopListening flush).
TArray<uint8> MicAccumulationBuffer; TArray<uint8> MicAccumulationBuffer;
static constexpr int32 MicChunkMinBytes = 8000; // 250ms @ 16kHz 16-bit mono (4000 samples, matches ElevenLabs SDK recommendation) FCriticalSection MicSendLock;
/** Compute the minimum bytes from the user-facing MicChunkDurationMs.
* Formula: bytes = SampleRate * (ms / 1000) * BytesPerSample = 16000 * ms / 1000 * 2 = 32 * ms */
int32 GetMicChunkMinBytes() const { return MicChunkDurationMs * 32; }
}; };

View File

@ -5,6 +5,7 @@
#include "CoreMinimal.h" #include "CoreMinimal.h"
#include "Components/ActorComponent.h" #include "Components/ActorComponent.h"
#include "AudioCapture.h" #include "AudioCapture.h"
#include <atomic>
#include "ElevenLabsMicrophoneCaptureComponent.generated.h" #include "ElevenLabsMicrophoneCaptureComponent.generated.h"
// Delivers captured float PCM samples (16000 Hz mono, resampled from device rate). // Delivers captured float PCM samples (16000 Hz mono, resampled from device rate).
@ -27,17 +28,24 @@ class PS_AI_AGENT_ELEVENLABS_API UElevenLabsMicrophoneCaptureComponent : public
public: public:
UElevenLabsMicrophoneCaptureComponent(); UElevenLabsMicrophoneCaptureComponent();
/** Volume multiplier applied to captured samples before forwarding. */ /** Multiplier applied to the microphone input volume before sending to ElevenLabs. Increase if the agent has trouble hearing you, decrease if your audio is clipping. Default: 1.0 (no change). */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Microphone", UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Microphone",
meta = (ClampMin = "0.0", ClampMax = "4.0")) meta = (ClampMin = "0.0", ClampMax = "4.0",
ToolTip = "Microphone volume multiplier.\n1.0 = no change. Increase if the agent can't hear you, decrease if audio clips."))
float VolumeMultiplier = 1.0f; float VolumeMultiplier = 1.0f;
/** /**
* Delegate fired on the game thread each time a new chunk of PCM audio * Delegate fired on the game thread each time a new chunk of PCM audio is
* is captured. Samples are float32, resampled to 16000 Hz mono. * captured. Samples are float32, resampled to 16000 Hz mono.
* Audio is captured on a WASAPI background thread, resampled there (with
* echo suppression), then dispatched to the game thread for this broadcast.
*/ */
FOnElevenLabsAudioCaptured OnAudioCaptured; FOnElevenLabsAudioCaptured OnAudioCaptured;
/** Optional pointer to an atomic bool that suppresses capture when true.
* Set by the agent component for echo suppression (skip mic while agent speaks). */
std::atomic<bool>* EchoSuppressFlag = nullptr;
/** Open the default capture device and begin streaming audio. */ /** Open the default capture device and begin streaming audio. */
UFUNCTION(BlueprintCallable, Category = "ElevenLabs") UFUNCTION(BlueprintCallable, Category = "ElevenLabs")
void StartCapture(); void StartCapture();
@ -65,7 +73,7 @@ private:
Audio::FAudioCapture AudioCapture; Audio::FAudioCapture AudioCapture;
Audio::FAudioCaptureDeviceParams DeviceParams; Audio::FAudioCaptureDeviceParams DeviceParams;
bool bCapturing = false; std::atomic<bool> bCapturing{false};
// Device sample rate discovered on StartCapture // Device sample rate discovered on StartCapture
int32 DeviceSampleRate = 44100; int32 DeviceSampleRate = 44100;

View File

@ -205,6 +205,10 @@ private:
EElevenLabsConnectionState ConnectionState = EElevenLabsConnectionState::Disconnected; EElevenLabsConnectionState ConnectionState = EElevenLabsConnectionState::Disconnected;
FElevenLabsConversationInfo ConversationInfo; FElevenLabsConversationInfo ConversationInfo;
// Serializes WebSocket->Send() calls — needed because SendAudioChunk can now be
// called from the WASAPI background thread while SendJsonMessage runs on game thread.
FCriticalSection WebSocketSendLock;
// Accumulation buffer for multi-fragment binary WebSocket frames. // Accumulation buffer for multi-fragment binary WebSocket frames.
// ElevenLabs sends JSON as binary frames; large messages arrive in fragments. // ElevenLabs sends JSON as binary frames; large messages arrive in fragments.
TArray<uint8> BinaryFrameBuffer; TArray<uint8> BinaryFrameBuffer;