Compare commits

...

5 Commits

Author SHA1 Message Date
4ee09f4e58 actor 2026-02-22 09:31:13 +01:00
a26361a7b2 Remove Units meta from MicChunkDurationMs to fix slider display
UE was converting the raw ms value to seconds in the Details panel,
showing "0.1 s" instead of "100". Removing Units="ms" lets the slider
display the integer value directly.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-22 09:30:50 +01:00
cb78e3249c Add detailed English tooltips to all ElevenLabs UPROPERTY parameters
All configuration and event properties in ConversationalAgentComponent and
MicrophoneCaptureComponent now have explicit ToolTip meta for clear descriptions
in the Unreal Editor Details panel.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-22 09:18:50 +01:00
20a6e30377 v1.8.0: Server VAD interruption, partial response fix, configurable chunk size
- Server VAD + interruption: mic stays open while agent speaks, server
  detects user voice and triggers interruption automatically. Echo
  suppression disabled in this mode so audio reaches the server.
- Fix agent_chat_response_part parsing: ElevenLabs API now uses
  text_response_part.text instead of agent_chat_response_part_event.
  Added fallback for legacy format.
- Expose MicChunkDurationMs as UPROPERTY (20-500ms, default 100ms)
  instead of compile-time constant.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-22 09:13:02 +01:00
152fc6196d v1.7.1: Fix mic not heard + latency optimizations + thread safety
Fix regression from v1.7.0 where agent couldn't hear user speech:
- Restore AsyncTask game-thread dispatch for delegate broadcast (AddUObject
  weak pointer checks are not thread-safe from WASAPI thread)
- Keep early echo suppression in WASAPI callback (before resampling)
- Keep MicChunkMinBytes at 3200 (100ms) for lower latency
- Add thread safety: std::atomic<bool> for bIsListening/bAgentSpeaking/bCapturing,
  FCriticalSection for MicSendLock and WebSocketSendLock
- Add EchoSuppressFlag pointer from agent to mic component

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-22 08:46:15 +01:00
7 changed files with 213 additions and 150 deletions

View File

@ -247,6 +247,19 @@ void UElevenLabsConversationalAgentComponent::StartListening()
Mic->OnAudioCaptured.RemoveAll(this);
Mic->OnAudioCaptured.AddUObject(this,
&UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured);
// Echo suppression: point the mic at our atomic bAgentSpeaking flag so it skips
// capture entirely (before resampling) while the agent is speaking.
// In Server VAD + interruption mode, disable echo suppression so the server
// receives the user's voice even during agent playback — the server's own VAD
// handles echo filtering and interruption detection.
if (TurnMode == EElevenLabsTurnMode::Server && bAllowInterruption)
{
Mic->EchoSuppressFlag = nullptr;
}
else
{
Mic->EchoSuppressFlag = &bAgentSpeaking;
}
Mic->StartCapture();
const double T = TurnStartTime - SessionStartTime;
@ -275,20 +288,23 @@ void UElevenLabsConversationalAgentComponent::StopListening()
// "user speaking" state and stall waiting for more audio that never arrives,
// leaving both sides stuck — no audio for the collision response and no response
// for subsequent turns.
if (bAgentGenerating)
{
if (MicAccumulationBuffer.Num() > 0)
FScopeLock Lock(&MicSendLock);
if (bAgentGenerating)
{
UE_LOG(LogElevenLabsAgent, Log,
TEXT("StopListening: discarding %d bytes of accumulated mic audio (collision — server is mid-generation)."),
MicAccumulationBuffer.Num());
if (MicAccumulationBuffer.Num() > 0)
{
UE_LOG(LogElevenLabsAgent, Log,
TEXT("StopListening: discarding %d bytes of accumulated mic audio (collision — server is mid-generation)."),
MicAccumulationBuffer.Num());
}
}
else if (MicAccumulationBuffer.Num() > 0 && WebSocketProxy && IsConnected())
{
WebSocketProxy->SendAudioChunk(MicAccumulationBuffer);
}
MicAccumulationBuffer.Reset();
}
else if (MicAccumulationBuffer.Num() > 0 && WebSocketProxy && IsConnected())
{
WebSocketProxy->SendAudioChunk(MicAccumulationBuffer);
}
MicAccumulationBuffer.Reset();
if (WebSocketProxy && TurnMode == EElevenLabsTurnMode::Client)
{
@ -394,7 +410,10 @@ void UElevenLabsConversationalAgentComponent::HandleDisconnected(int32 StatusCod
GeneratingTickCount = 0;
TurnIndex = 0;
LastClosedTurnIndex = 0;
MicAccumulationBuffer.Reset();
{
FScopeLock Lock(&MicSendLock);
MicAccumulationBuffer.Reset();
}
OnAgentDisconnected.Broadcast(StatusCode, Reason);
}
@ -451,22 +470,26 @@ void UElevenLabsConversationalAgentComponent::HandleAgentResponseStarted()
const double LatencyFromTurnEnd = TurnEndTime > 0.0 ? Now - TurnEndTime : 0.0;
if (bIsListening)
{
// Collision: server started generating Turn N's response while Turn M (M>N) mic was open.
// The server's VAD detected a pause in the user's speech and started generating
// prematurely — the user hasn't finished speaking yet.
//
// Stop the mic WITHOUT flushing the accumulated audio buffer (see StopListening's
// bAgentGenerating guard). Flushing would send audio to a server that is mid-generation,
// causing it to re-enter "user speaking" state and stall — both sides stuck.
//
// Do NOT send an interrupt here — just let the server's response play out:
// - If audio arrives → EnqueueAgentAudio sets bAgentSpeaking, response plays normally.
// - If audio never arrives → generating timeout (10s) clears bAgentGenerating.
// Either way the state machine recovers and Blueprint can reopen the mic.
UE_LOG(LogElevenLabsAgent, Log,
TEXT("[T+%.2fs] [Turn %d → Turn %d collision] Agent generating Turn %d response — mic (Turn %d) was open, stopping. (%.2fs after turn end)"),
T, LastClosedTurnIndex, TurnIndex, LastClosedTurnIndex, TurnIndex, LatencyFromTurnEnd);
StopListening();
// In Server VAD + interruption mode, keep the mic open so the server can
// detect if the user speaks over the agent and send an interruption event.
// The server handles echo filtering and VAD — we just keep streaming audio.
if (TurnMode == EElevenLabsTurnMode::Server && bAllowInterruption)
{
UE_LOG(LogElevenLabsAgent, Log,
TEXT("[T+%.2fs] [Turn %d] Agent generating — mic stays open (Server VAD + interruption). (%.2fs after turn end)"),
T, LastClosedTurnIndex, LatencyFromTurnEnd);
}
else
{
// Collision: server started generating Turn N's response while Turn M (M>N) mic was open.
// Stop the mic WITHOUT flushing the accumulated audio buffer (see StopListening's
// bAgentGenerating guard). Flushing would send audio to a server that is mid-generation,
// causing it to re-enter "user speaking" state and stall — both sides stuck.
UE_LOG(LogElevenLabsAgent, Log,
TEXT("[T+%.2fs] [Turn %d → Turn %d collision] Agent generating Turn %d response — mic (Turn %d) was open, stopping. (%.2fs after turn end)"),
T, LastClosedTurnIndex, TurnIndex, LastClosedTurnIndex, TurnIndex, LatencyFromTurnEnd);
StopListening();
}
}
UE_LOG(LogElevenLabsAgent, Log,
@ -606,18 +629,26 @@ void UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured(const TAr
// Echo suppression: skip sending mic audio while the agent is speaking.
// This prevents the agent from hearing its own voice through the speakers,
// which would confuse the server's VAD and STT. Matches the approach used
// in the official ElevenLabs C++ SDK (outputPlaying_ flag).
if (bAgentSpeaking) return;
// which would confuse the server's VAD and STT.
// In Server VAD + interruption mode, keep sending audio so the server can
// detect the user speaking over the agent and trigger an interruption.
if (bAgentSpeaking && !(TurnMode == EElevenLabsTurnMode::Server && bAllowInterruption))
{
return;
}
// Convert this callback's samples to int16 bytes and accumulate.
// WASAPI fires every ~5ms (158 bytes at 16kHz). ElevenLabs needs ≥250ms
// (8000 bytes) per chunk for reliable VAD and STT. We hold bytes here
// WASAPI fires every ~5ms (158 bytes at 16kHz). ElevenLabs needs ≥100ms
// (3200 bytes) per chunk for reliable VAD and STT. We hold bytes here
// until we have enough, then send the whole batch in one WebSocket frame.
TArray<uint8> PCMBytes = FloatPCMToInt16Bytes(FloatPCM);
// Lock: MicAccumulationBuffer is accessed from WASAPI thread (here) and
// game thread (StopListening flush). WebSocket send is also serialized.
FScopeLock Lock(&MicSendLock);
MicAccumulationBuffer.Append(PCMBytes);
if (MicAccumulationBuffer.Num() >= MicChunkMinBytes)
if (MicAccumulationBuffer.Num() >= GetMicChunkMinBytes())
{
WebSocketProxy->SendAudioChunk(MicAccumulationBuffer);
MicAccumulationBuffer.Reset();

View File

@ -89,6 +89,12 @@ void UElevenLabsMicrophoneCaptureComponent::OnAudioGenerate(
UE_LOG(LogElevenLabsMic, Verbose, TEXT("Audio capture buffer overflow."));
}
// Echo suppression: skip resampling + broadcasting entirely when agent is speaking.
if (EchoSuppressFlag && EchoSuppressFlag->load(std::memory_order_relaxed))
{
return;
}
// Device sends float32 interleaved samples; cast from the void* API.
const float* FloatAudio = static_cast<const float*>(InAudio);
@ -104,15 +110,20 @@ void UElevenLabsMicrophoneCaptureComponent::OnAudioGenerate(
}
}
// Fire the delegate on the game thread so subscribers don't need to be
// thread-safe (WebSocket Send is not thread-safe in UE's implementation).
AsyncTask(ENamedThreads::GameThread, [this, Data = MoveTemp(Resampled)]()
// Dispatch to game thread for delegate broadcast.
// UE's FMulticastDelegate with AddUObject uses weak object pointer checks that
// are not thread-safe — broadcasting from the WASAPI thread causes the invocation
// to be silently skipped. The game thread dispatch adds ~8ms latency but is required.
if (bCapturing)
{
if (bCapturing)
AsyncTask(ENamedThreads::GameThread, [this, Captured = MoveTemp(Resampled)]()
{
OnAudioCaptured.Broadcast(Data);
}
});
if (bCapturing)
{
OnAudioCaptured.Broadcast(Captured);
}
});
}
}
// ─────────────────────────────────────────────────────────────────────────────

View File

@ -120,9 +120,12 @@ void UElevenLabsWebSocketProxy::SendAudioChunk(const TArray<uint8>& PCMData)
// Per-chunk log at Verbose only — Log level is too spammy (10+ lines per second).
UE_LOG(LogElevenLabsWS, Verbose, TEXT("SendAudioChunk: %d bytes"), PCMData.Num());
if (WebSocket.IsValid() && WebSocket->IsConnected())
{
WebSocket->Send(AudioJson);
FScopeLock Lock(&WebSocketSendLock);
if (WebSocket.IsValid() && WebSocket->IsConnected())
{
WebSocket->Send(AudioJson);
}
}
}
@ -585,19 +588,40 @@ void UElevenLabsWebSocketProxy::HandleAgentChatResponsePart(const TSharedPtr<FJs
}
// Extract the streaming text fragment and broadcast it.
// API structure:
// Current API structure (2026):
// { "type": "agent_chat_response_part",
// "text_response_part": { "text": "partial text", "type": "part"|"stop", "event_id": N }
// }
// Legacy structure (pre-2026):
// { "type": "agent_chat_response_part",
// "agent_chat_response_part_event": { "agent_response_part": "partial text" }
// }
const TSharedPtr<FJsonObject>* PartEvent = nullptr;
if (Root->TryGetObjectField(TEXT("agent_chat_response_part_event"), PartEvent) && PartEvent)
FString PartText;
bool bFound = false;
// Try current format: text_response_part.text
const TSharedPtr<FJsonObject>* TextPart = nullptr;
if (Root->TryGetObjectField(TEXT("text_response_part"), TextPart) && TextPart)
{
FString PartText;
if ((*PartEvent)->TryGetStringField(TEXT("agent_response_part"), PartText) && !PartText.IsEmpty())
(*TextPart)->TryGetStringField(TEXT("text"), PartText);
bFound = true;
}
// Fallback: legacy format agent_chat_response_part_event.agent_response_part
if (!bFound)
{
const TSharedPtr<FJsonObject>* PartEvent = nullptr;
if (Root->TryGetObjectField(TEXT("agent_chat_response_part_event"), PartEvent) && PartEvent)
{
OnAgentResponsePart.Broadcast(PartText);
(*PartEvent)->TryGetStringField(TEXT("agent_response_part"), PartText);
bFound = true;
}
}
if (bFound && !PartText.IsEmpty())
{
OnAgentResponsePart.Broadcast(PartText);
}
}
void UElevenLabsWebSocketProxy::HandleInterruption(const TSharedPtr<FJsonObject>& Root)
@ -658,7 +682,10 @@ void UElevenLabsWebSocketProxy::SendJsonMessage(const TSharedPtr<FJsonObject>& J
UE_LOG(LogElevenLabsWS, Verbose, TEXT("<< %s"), *Out);
}
WebSocket->Send(Out);
{
FScopeLock Lock(&WebSocketSendLock);
WebSocket->Send(Out);
}
}
FString UElevenLabsWebSocketProxy::BuildWebSocketURL(const FString& AgentIDOverride, const FString& APIKeyOverride) const

View File

@ -7,6 +7,7 @@
#include "ElevenLabsDefinitions.h"
#include "ElevenLabsWebSocketProxy.h"
#include "Sound/SoundWaveProcedural.h"
#include <atomic>
#include "ElevenLabsConversationalAgentComponent.generated.h"
class UAudioComponent;
@ -85,139 +86,113 @@ public:
// ── Configuration ─────────────────────────────────────────────────────────
/**
* ElevenLabs Agent ID. Overrides the project-level default in Project Settings.
* Leave empty to use the project default.
*/
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs")
/** ElevenLabs Agent ID used for this conversation. Leave empty to use the default from Project Settings > ElevenLabs. */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs",
meta = (ToolTip = "ElevenLabs Agent ID. Leave empty to use the project default from Project Settings."))
FString AgentID;
/**
* Turn mode:
* - Server VAD: ElevenLabs detects end-of-speech automatically (recommended).
* - Client Controlled: you call StartListening/StopListening manually (push-to-talk).
*/
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs")
/** How turn-taking is managed between the user and the agent.\n- Server VAD (recommended): ElevenLabs automatically detects when the user stops speaking.\n- Client Controlled: You manually call StartListening/StopListening (push-to-talk with a key). */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs",
meta = (ToolTip = "Turn-taking mode.\n- Server VAD: ElevenLabs detects end-of-speech automatically (hands-free).\n- Client Controlled: You call StartListening/StopListening manually (push-to-talk)."))
EElevenLabsTurnMode TurnMode = EElevenLabsTurnMode::Server;
/**
* Automatically start listening (microphone capture) once the WebSocket is
* connected and the conversation is initiated.
*/
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs")
/** Automatically open the microphone as soon as the WebSocket connection is established. Only applies in Server VAD mode. In Client (push-to-talk) mode, you must call StartListening manually. */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs",
meta = (ToolTip = "Auto-open the microphone when the conversation starts.\nOnly applies in Server VAD mode. In push-to-talk mode, call StartListening() manually."))
bool bAutoStartListening = true;
/**
* Enable speculative turn: the LLM starts generating a response during
* silence before the VAD is fully confident the user has finished speaking.
* Reduces latency by 200-500ms but caused the server to silently stop
* processing user audio after 2 turns when combined with a short turn_timeout.
* Disabled by default until ElevenLabs confirms stability in multi-turn sessions.
*/
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency")
/** Let the LLM start generating a response during silence, before the VAD is fully confident the user has finished speaking. Saves 200-500ms of latency but may be unstable in long multi-turn sessions. Disabled by default. */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency",
meta = (ToolTip = "Speculative turn: the LLM begins generating during silence before full turn-end confidence.\nReduces latency by 200-500ms. May be unstable in long sessions — test before enabling in production."))
bool bSpeculativeTurn = false;
/**
* Allow the user to interrupt the agent while it is playing audio (speaking).
* When true, calling StartListening() while the agent is audibly speaking automatically
* sends an interruption signal to the server and opens the mic — no Blueprint nodes needed.
* When false, StartListening() is silently ignored until the agent finishes speaking.
*
* NOTE: interruption only applies during the audio-playback phase (bAgentSpeaking).
* While the agent is generating but has not yet started speaking, StartListening() is
* always blocked regardless of this flag — this prevents Blueprint's OnAgentStartedGenerating
* handler (which often calls StartListening for bookkeeping) from accidentally cancelling
* the response before any audio plays.
*/
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs")
/** How many milliseconds of microphone audio to accumulate before sending a chunk to ElevenLabs. Lower values reduce latency but may degrade voice detection accuracy. Higher values are more reliable but add delay. */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency",
meta = (ClampMin = "20", ClampMax = "500",
ToolTip = "Mic audio chunk duration sent to ElevenLabs.\n- 50-80ms: lower latency, less reliable voice detection.\n- 100ms (default): good balance.\n- 150-250ms: more reliable, higher latency."))
int32 MicChunkDurationMs = 100;
/** Allow the user to interrupt the agent while it is speaking.\n- In Server VAD mode: the microphone stays open during agent speech and the server detects interruptions automatically.\n- In Client (push-to-talk) mode: pressing the talk key while the agent speaks sends an interrupt signal.\n- When disabled: the user must wait for the agent to finish speaking before talking. */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs",
meta = (ToolTip = "Allow the user to interrupt the agent while it speaks.\n- Server VAD: mic stays open, server detects user voice automatically.\n- Push-to-talk: pressing the talk key interrupts the agent.\n- Disabled: user must wait for the agent to finish."))
bool bAllowInterruption = true;
/**
* Forward user speech transcripts (user_transcript events) to the
* OnAgentTranscript delegate. Disable to reduce overhead if you don't
* need to display what the user said.
*/
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events")
/** Enable the OnAgentTranscript event, which provides real-time speech-to-text of what the user is saying. Disable if you don't need to display user speech to reduce processing overhead. */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events",
meta = (ToolTip = "Fire OnAgentTranscript with real-time speech-to-text of user speech.\nDisable if you don't need to display what the user said."))
bool bEnableUserTranscript = true;
/**
* Forward agent text responses (agent_response events) to the
* OnAgentTextResponse delegate. Disable if you only need audio output.
*/
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events")
/** Enable the OnAgentTextResponse event, which provides the agent's complete text response once fully generated. Disable if you only need the audio output. */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events",
meta = (ToolTip = "Fire OnAgentTextResponse with the agent's complete text once fully generated.\nDisable if you only need the audio output."))
bool bEnableAgentTextResponse = true;
/**
* Forward streaming text parts (agent_chat_response_part events) to the
* OnAgentPartialResponse delegate. Each part is a text fragment as the LLM
* generates it — use this for real-time subtitles that appear while the agent
* speaks, instead of waiting for the full text (OnAgentTextResponse).
*/
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events")
/** Enable the OnAgentPartialResponse event, which streams the agent's text word-by-word as the LLM generates it. Use this for real-time subtitles that appear while the agent speaks, rather than waiting for the full response. */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events",
meta = (ToolTip = "Fire OnAgentPartialResponse with streaming text fragments as the LLM generates them.\nIdeal for real-time subtitles. Each event gives one text chunk, not the accumulated text."))
bool bEnableAgentPartialResponse = false;
/**
* How many seconds to wait for the server to start generating a response
* after the user stops speaking (StopListening) before firing OnAgentResponseTimeout.
* Set to 0 to disable. Default: 10 seconds.
*
* A typical healthy round-trip is 0.1–0.8s to first agent_chat_response_part.
* Values above 10s are extremely unusual and almost always indicate a server issue.
*/
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs", meta = (ClampMin = "0.0"))
/** Safety timeout: if the server does not start generating a response within this many seconds after the user stops speaking, fire OnAgentResponseTimeout. Set to 0 to disable. A normal response starts within 0.1-0.8s. */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs",
meta = (ClampMin = "0.0",
ToolTip = "Seconds to wait for a server response after the user stops speaking.\nFires OnAgentResponseTimeout if exceeded. Normal latency is 0.1-0.8s.\nSet to 0 to disable. Default: 10s."))
float ResponseTimeoutSeconds = 10.0f;
// ── Events ────────────────────────────────────────────────────────────────
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
/** Fired when the WebSocket connection is established and the conversation session is ready. Provides the ConversationID and AgentID. */
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
meta = (ToolTip = "Fires when the connection to ElevenLabs is established and the conversation is ready to begin."))
FOnAgentConnected OnAgentConnected;
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
/** Fired when the WebSocket connection is closed (gracefully or due to an error). Provides the status code and reason. */
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
meta = (ToolTip = "Fires when the connection to ElevenLabs is closed. Check StatusCode and Reason for details."))
FOnAgentDisconnected OnAgentDisconnected;
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
/** Fired on any connection or protocol error. The error message describes what went wrong. */
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
meta = (ToolTip = "Fires on connection or protocol errors. The ErrorMessage describes the issue."))
FOnAgentError OnAgentError;
/** Fired for every transcript segment (user speech or agent speech, tentative and final). */
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
/** Fired with real-time speech-to-text of the user's voice. Includes both tentative (in-progress) and final transcripts. Requires bEnableUserTranscript to be true. */
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
meta = (ToolTip = "Real-time speech-to-text of the user's voice.\nIncludes tentative and final transcripts. Enable with bEnableUserTranscript."))
FOnAgentTranscript OnAgentTranscript;
/** Final text response produced by the agent (mirrors the audio). */
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
/** Fired once when the agent's complete text response is available. This is the full text that corresponds to the audio the agent speaks. Requires bEnableAgentTextResponse to be true. */
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
meta = (ToolTip = "The agent's complete text response (matches the spoken audio).\nFires once when the full text is ready. Enable with bEnableAgentTextResponse."))
FOnAgentTextResponse OnAgentTextResponse;
/**
* Streaming text fragments as the LLM generates them.
* Fires for every agent_chat_response_part — each call gives one text chunk.
* Enable with bEnableAgentPartialResponse.
*/
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
/** Fired repeatedly as the LLM generates text, providing one word/fragment at a time. Use for real-time subtitles. Each call gives a new fragment, NOT the accumulated text. Requires bEnableAgentPartialResponse to be true. */
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
meta = (ToolTip = "Streaming text fragments as the LLM generates them (word by word).\nIdeal for real-time subtitles. Enable with bEnableAgentPartialResponse."))
FOnAgentPartialResponse OnAgentPartialResponse;
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
/** Fired when the agent begins playing audio (first audio chunk received). Use this to trigger speech animations or UI indicators. */
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
meta = (ToolTip = "Fires when the agent starts speaking (first audio chunk). Use for lip-sync or UI feedback."))
FOnAgentStartedSpeaking OnAgentStartedSpeaking;
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
/** Fired when the agent finishes playing all audio. Use this to re-open the microphone (in Server VAD mode without interruption) or update UI. */
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
meta = (ToolTip = "Fires when the agent finishes speaking. Use to re-open the mic or update UI."))
FOnAgentStoppedSpeaking OnAgentStoppedSpeaking;
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
/** Fired when the agent's speech is interrupted (either by the user speaking over it, or by a manual InterruptAgent call). The audio playback is automatically stopped. */
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
meta = (ToolTip = "Fires when the agent is interrupted mid-speech. Audio is automatically stopped."))
FOnAgentInterrupted OnAgentInterrupted;
/**
* Fired when the server starts generating a response (before audio).
* The component automatically stops the microphone when this fires while listening,
* so the Blueprint doesn't need to handle this manually for push-to-talk.
* Bind here if you need UI feedback ("agent is thinking...").
*/
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
/** Fired when the server starts generating a response (before any audio arrives). Use this for "thinking..." UI feedback. In push-to-talk mode, the microphone is automatically closed when this fires. */
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
meta = (ToolTip = "Fires when the server starts generating (before audio arrives).\nUse for 'thinking...' UI. Mic is auto-closed in push-to-talk mode."))
FOnAgentStartedGenerating OnAgentStartedGenerating;
/**
* Fired when the server has not started generating within ResponseTimeoutSeconds
* after StopListening was called. Bind here to give the user feedback such as
* "I didn't get a response, please try again" or to automatically re-open the mic.
*/
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
/** Fired if the server does not start generating a response within ResponseTimeoutSeconds after the user stops speaking. Use this to show a "try again" message or automatically re-open the microphone. */
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
meta = (ToolTip = "Fires if the server doesn't respond within ResponseTimeoutSeconds.\nUse to show 'try again' or re-open the mic automatically."))
FOnAgentResponseTimeout OnAgentResponseTimeout;
// ── Control ───────────────────────────────────────────────────────────────
@ -337,8 +312,9 @@ private:
USoundWaveProcedural* ProceduralSoundWave = nullptr;
// ── State ─────────────────────────────────────────────────────────────────
bool bIsListening = false;
bool bAgentSpeaking = false;
// Atomic: read from WASAPI background thread (OnMicrophoneDataCaptured), written from game thread.
std::atomic<bool> bIsListening{false};
std::atomic<bool> bAgentSpeaking{false};
// True from the first agent_chat_response_part until the first audio chunk arrives.
// Used to block StartListening() while the server is processing the previous turn.
bool bAgentGenerating = false;
@ -399,6 +375,12 @@ private:
// WASAPI fires callbacks every ~5ms (158 bytes at 16kHz 16-bit mono).
// ElevenLabs needs at least ~100ms (3200 bytes) per chunk for reliable VAD/STT.
// We accumulate here and only call SendAudioChunk once enough bytes are ready.
// MicSendLock protects MicAccumulationBuffer + WebSocket send (accessed from WASAPI thread
// in OnMicrophoneDataCaptured and from game thread in StopListening flush).
TArray<uint8> MicAccumulationBuffer;
static constexpr int32 MicChunkMinBytes = 8000; // 250ms @ 16kHz 16-bit mono (4000 samples, matches ElevenLabs SDK recommendation)
FCriticalSection MicSendLock;
/** Compute the minimum bytes from the user-facing MicChunkDurationMs.
* Formula: bytes = SampleRate * (ms / 1000) * BytesPerSample = 16000 * ms / 1000 * 2 = 32 * ms */
int32 GetMicChunkMinBytes() const { return MicChunkDurationMs * 32; }
};

View File

@ -5,6 +5,7 @@
#include "CoreMinimal.h"
#include "Components/ActorComponent.h"
#include "AudioCapture.h"
#include <atomic>
#include "ElevenLabsMicrophoneCaptureComponent.generated.h"
// Delivers captured float PCM samples (16000 Hz mono, resampled from device rate).
@ -27,17 +28,24 @@ class PS_AI_AGENT_ELEVENLABS_API UElevenLabsMicrophoneCaptureComponent : public
public:
UElevenLabsMicrophoneCaptureComponent();
/** Volume multiplier applied to captured samples before forwarding. */
/** Multiplier applied to the microphone input volume before sending to ElevenLabs. Increase if the agent has trouble hearing you, decrease if your audio is clipping. Default: 1.0 (no change). */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Microphone",
meta = (ClampMin = "0.0", ClampMax = "4.0"))
meta = (ClampMin = "0.0", ClampMax = "4.0",
ToolTip = "Microphone volume multiplier.\n1.0 = no change. Increase if the agent can't hear you, decrease if audio clips."))
float VolumeMultiplier = 1.0f;
/**
* Delegate fired on the game thread each time a new chunk of PCM audio
* is captured. Samples are float32, resampled to 16000 Hz mono.
* Delegate fired on the game thread each time a new chunk of PCM audio is
* captured. Samples are float32, resampled to 16000 Hz mono.
* Audio is captured on a WASAPI background thread, resampled there (with
* echo suppression), then dispatched to the game thread for this broadcast.
*/
FOnElevenLabsAudioCaptured OnAudioCaptured;
/** Optional pointer to an atomic bool that suppresses capture when true.
* Set by the agent component for echo suppression (skip mic while agent speaks). */
std::atomic<bool>* EchoSuppressFlag = nullptr;
/** Open the default capture device and begin streaming audio. */
UFUNCTION(BlueprintCallable, Category = "ElevenLabs")
void StartCapture();
@ -65,7 +73,7 @@ private:
Audio::FAudioCapture AudioCapture;
Audio::FAudioCaptureDeviceParams DeviceParams;
bool bCapturing = false;
std::atomic<bool> bCapturing{false};
// Device sample rate discovered on StartCapture
int32 DeviceSampleRate = 44100;

View File

@ -205,6 +205,10 @@ private:
EElevenLabsConnectionState ConnectionState = EElevenLabsConnectionState::Disconnected;
FElevenLabsConversationInfo ConversationInfo;
// Serializes WebSocket->Send() calls — needed because SendAudioChunk can now be
// called from the WASAPI background thread while SendJsonMessage runs on game thread.
FCriticalSection WebSocketSendLock;
// Accumulation buffer for multi-fragment binary WebSocket frames.
// ElevenLabs sends JSON as binary frames; large messages arrive in fragments.
TArray<uint8> BinaryFrameBuffer;