Compare commits

..

No commits in common. "4ee09f4e5800e8fa194ea49aaa94d500e2ea4dcb" and "f23acc8c1ce02d84b4984453782bc9b7b7ac730f" have entirely different histories.

7 changed files with 150 additions and 213 deletions

View File

@ -247,19 +247,6 @@ void UElevenLabsConversationalAgentComponent::StartListening()
Mic->OnAudioCaptured.RemoveAll(this); Mic->OnAudioCaptured.RemoveAll(this);
Mic->OnAudioCaptured.AddUObject(this, Mic->OnAudioCaptured.AddUObject(this,
&UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured); &UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured);
// Echo suppression: point the mic at our atomic bAgentSpeaking flag so it skips
// capture entirely (before resampling) while the agent is speaking.
// In Server VAD + interruption mode, disable echo suppression so the server
// receives the user's voice even during agent playback — the server's own VAD
// handles echo filtering and interruption detection.
if (TurnMode == EElevenLabsTurnMode::Server && bAllowInterruption)
{
Mic->EchoSuppressFlag = nullptr;
}
else
{
Mic->EchoSuppressFlag = &bAgentSpeaking;
}
Mic->StartCapture(); Mic->StartCapture();
const double T = TurnStartTime - SessionStartTime; const double T = TurnStartTime - SessionStartTime;
@ -288,8 +275,6 @@ void UElevenLabsConversationalAgentComponent::StopListening()
// "user speaking" state and stall waiting for more audio that never arrives, // "user speaking" state and stall waiting for more audio that never arrives,
// leaving both sides stuck — no audio for the collision response and no response // leaving both sides stuck — no audio for the collision response and no response
// for subsequent turns. // for subsequent turns.
{
FScopeLock Lock(&MicSendLock);
if (bAgentGenerating) if (bAgentGenerating)
{ {
if (MicAccumulationBuffer.Num() > 0) if (MicAccumulationBuffer.Num() > 0)
@ -304,7 +289,6 @@ void UElevenLabsConversationalAgentComponent::StopListening()
WebSocketProxy->SendAudioChunk(MicAccumulationBuffer); WebSocketProxy->SendAudioChunk(MicAccumulationBuffer);
} }
MicAccumulationBuffer.Reset(); MicAccumulationBuffer.Reset();
}
if (WebSocketProxy && TurnMode == EElevenLabsTurnMode::Client) if (WebSocketProxy && TurnMode == EElevenLabsTurnMode::Client)
{ {
@ -410,10 +394,7 @@ void UElevenLabsConversationalAgentComponent::HandleDisconnected(int32 StatusCod
GeneratingTickCount = 0; GeneratingTickCount = 0;
TurnIndex = 0; TurnIndex = 0;
LastClosedTurnIndex = 0; LastClosedTurnIndex = 0;
{
FScopeLock Lock(&MicSendLock);
MicAccumulationBuffer.Reset(); MicAccumulationBuffer.Reset();
}
OnAgentDisconnected.Broadcast(StatusCode, Reason); OnAgentDisconnected.Broadcast(StatusCode, Reason);
} }
@ -469,28 +450,24 @@ void UElevenLabsConversationalAgentComponent::HandleAgentResponseStarted()
const double T = Now - SessionStartTime; const double T = Now - SessionStartTime;
const double LatencyFromTurnEnd = TurnEndTime > 0.0 ? Now - TurnEndTime : 0.0; const double LatencyFromTurnEnd = TurnEndTime > 0.0 ? Now - TurnEndTime : 0.0;
if (bIsListening) if (bIsListening)
{
// In Server VAD + interruption mode, keep the mic open so the server can
// detect if the user speaks over the agent and send an interruption event.
// The server handles echo filtering and VAD — we just keep streaming audio.
if (TurnMode == EElevenLabsTurnMode::Server && bAllowInterruption)
{
UE_LOG(LogElevenLabsAgent, Log,
TEXT("[T+%.2fs] [Turn %d] Agent generating — mic stays open (Server VAD + interruption). (%.2fs after turn end)"),
T, LastClosedTurnIndex, LatencyFromTurnEnd);
}
else
{ {
// Collision: server started generating Turn N's response while Turn M (M>N) mic was open. // Collision: server started generating Turn N's response while Turn M (M>N) mic was open.
// The server's VAD detected a pause in the user's speech and started generating
// prematurely — the user hasn't finished speaking yet.
//
// Stop the mic WITHOUT flushing the accumulated audio buffer (see StopListening's // Stop the mic WITHOUT flushing the accumulated audio buffer (see StopListening's
// bAgentGenerating guard). Flushing would send audio to a server that is mid-generation, // bAgentGenerating guard). Flushing would send audio to a server that is mid-generation,
// causing it to re-enter "user speaking" state and stall — both sides stuck. // causing it to re-enter "user speaking" state and stall — both sides stuck.
//
// Do NOT send an interrupt here — just let the server's response play out:
// - If audio arrives → EnqueueAgentAudio sets bAgentSpeaking, response plays normally.
// - If audio never arrives → generating timeout (10s) clears bAgentGenerating.
// Either way the state machine recovers and Blueprint can reopen the mic.
UE_LOG(LogElevenLabsAgent, Log, UE_LOG(LogElevenLabsAgent, Log,
TEXT("[T+%.2fs] [Turn %d → Turn %d collision] Agent generating Turn %d response — mic (Turn %d) was open, stopping. (%.2fs after turn end)"), TEXT("[T+%.2fs] [Turn %d → Turn %d collision] Agent generating Turn %d response — mic (Turn %d) was open, stopping. (%.2fs after turn end)"),
T, LastClosedTurnIndex, TurnIndex, LastClosedTurnIndex, TurnIndex, LatencyFromTurnEnd); T, LastClosedTurnIndex, TurnIndex, LastClosedTurnIndex, TurnIndex, LatencyFromTurnEnd);
StopListening(); StopListening();
} }
}
UE_LOG(LogElevenLabsAgent, Log, UE_LOG(LogElevenLabsAgent, Log,
TEXT("[T+%.2fs] [Turn %d] Agent generating. (%.2fs after turn end)"), TEXT("[T+%.2fs] [Turn %d] Agent generating. (%.2fs after turn end)"),
@ -629,26 +606,18 @@ void UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured(const TAr
// Echo suppression: skip sending mic audio while the agent is speaking. // Echo suppression: skip sending mic audio while the agent is speaking.
// This prevents the agent from hearing its own voice through the speakers, // This prevents the agent from hearing its own voice through the speakers,
// which would confuse the server's VAD and STT. // which would confuse the server's VAD and STT. Matches the approach used
// In Server VAD + interruption mode, keep sending audio so the server can // in the official ElevenLabs C++ SDK (outputPlaying_ flag).
// detect the user speaking over the agent and trigger an interruption. if (bAgentSpeaking) return;
if (bAgentSpeaking && !(TurnMode == EElevenLabsTurnMode::Server && bAllowInterruption))
{
return;
}
// Convert this callback's samples to int16 bytes and accumulate. // Convert this callback's samples to int16 bytes and accumulate.
// WASAPI fires every ~5ms (158 bytes at 16kHz). ElevenLabs needs ≥100ms // WASAPI fires every ~5ms (158 bytes at 16kHz). ElevenLabs needs ≥250ms
// (3200 bytes) per chunk for reliable VAD and STT. We hold bytes here // (8000 bytes) per chunk for reliable VAD and STT. We hold bytes here
// until we have enough, then send the whole batch in one WebSocket frame. // until we have enough, then send the whole batch in one WebSocket frame.
TArray<uint8> PCMBytes = FloatPCMToInt16Bytes(FloatPCM); TArray<uint8> PCMBytes = FloatPCMToInt16Bytes(FloatPCM);
// Lock: MicAccumulationBuffer is accessed from WASAPI thread (here) and
// game thread (StopListening flush). WebSocket send is also serialized.
FScopeLock Lock(&MicSendLock);
MicAccumulationBuffer.Append(PCMBytes); MicAccumulationBuffer.Append(PCMBytes);
if (MicAccumulationBuffer.Num() >= GetMicChunkMinBytes()) if (MicAccumulationBuffer.Num() >= MicChunkMinBytes)
{ {
WebSocketProxy->SendAudioChunk(MicAccumulationBuffer); WebSocketProxy->SendAudioChunk(MicAccumulationBuffer);
MicAccumulationBuffer.Reset(); MicAccumulationBuffer.Reset();

View File

@ -89,12 +89,6 @@ void UElevenLabsMicrophoneCaptureComponent::OnAudioGenerate(
UE_LOG(LogElevenLabsMic, Verbose, TEXT("Audio capture buffer overflow.")); UE_LOG(LogElevenLabsMic, Verbose, TEXT("Audio capture buffer overflow."));
} }
// Echo suppression: skip resampling + broadcasting entirely when agent is speaking.
if (EchoSuppressFlag && EchoSuppressFlag->load(std::memory_order_relaxed))
{
return;
}
// Device sends float32 interleaved samples; cast from the void* API. // Device sends float32 interleaved samples; cast from the void* API.
const float* FloatAudio = static_cast<const float*>(InAudio); const float* FloatAudio = static_cast<const float*>(InAudio);
@ -110,21 +104,16 @@ void UElevenLabsMicrophoneCaptureComponent::OnAudioGenerate(
} }
} }
// Dispatch to game thread for delegate broadcast. // Fire the delegate on the game thread so subscribers don't need to be
// UE's FMulticastDelegate with AddUObject uses weak object pointer checks that // thread-safe (WebSocket Send is not thread-safe in UE's implementation).
// are not thread-safe — broadcasting from the WASAPI thread causes the invocation AsyncTask(ENamedThreads::GameThread, [this, Data = MoveTemp(Resampled)]()
// to be silently skipped. The game thread dispatch adds ~8ms latency but is required.
if (bCapturing)
{
AsyncTask(ENamedThreads::GameThread, [this, Captured = MoveTemp(Resampled)]()
{ {
if (bCapturing) if (bCapturing)
{ {
OnAudioCaptured.Broadcast(Captured); OnAudioCaptured.Broadcast(Data);
} }
}); });
} }
}
// ───────────────────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────────────────
// Resampling // Resampling

View File

@ -120,14 +120,11 @@ void UElevenLabsWebSocketProxy::SendAudioChunk(const TArray<uint8>& PCMData)
// Per-chunk log at Verbose only — Log level is too spammy (10+ lines per second). // Per-chunk log at Verbose only — Log level is too spammy (10+ lines per second).
UE_LOG(LogElevenLabsWS, Verbose, TEXT("SendAudioChunk: %d bytes"), PCMData.Num()); UE_LOG(LogElevenLabsWS, Verbose, TEXT("SendAudioChunk: %d bytes"), PCMData.Num());
{
FScopeLock Lock(&WebSocketSendLock);
if (WebSocket.IsValid() && WebSocket->IsConnected()) if (WebSocket.IsValid() && WebSocket->IsConnected())
{ {
WebSocket->Send(AudioJson); WebSocket->Send(AudioJson);
} }
} }
}
void UElevenLabsWebSocketProxy::SendUserTurnStart() void UElevenLabsWebSocketProxy::SendUserTurnStart()
{ {
@ -588,41 +585,20 @@ void UElevenLabsWebSocketProxy::HandleAgentChatResponsePart(const TSharedPtr<FJs
} }
// Extract the streaming text fragment and broadcast it. // Extract the streaming text fragment and broadcast it.
// Current API structure (2026): // API structure:
// { "type": "agent_chat_response_part",
// "text_response_part": { "text": "partial text", "type": "part"|"stop", "event_id": N }
// }
// Legacy structure (pre-2026):
// { "type": "agent_chat_response_part", // { "type": "agent_chat_response_part",
// "agent_chat_response_part_event": { "agent_response_part": "partial text" } // "agent_chat_response_part_event": { "agent_response_part": "partial text" }
// } // }
FString PartText;
bool bFound = false;
// Try current format: text_response_part.text
const TSharedPtr<FJsonObject>* TextPart = nullptr;
if (Root->TryGetObjectField(TEXT("text_response_part"), TextPart) && TextPart)
{
(*TextPart)->TryGetStringField(TEXT("text"), PartText);
bFound = true;
}
// Fallback: legacy format agent_chat_response_part_event.agent_response_part
if (!bFound)
{
const TSharedPtr<FJsonObject>* PartEvent = nullptr; const TSharedPtr<FJsonObject>* PartEvent = nullptr;
if (Root->TryGetObjectField(TEXT("agent_chat_response_part_event"), PartEvent) && PartEvent) if (Root->TryGetObjectField(TEXT("agent_chat_response_part_event"), PartEvent) && PartEvent)
{ {
(*PartEvent)->TryGetStringField(TEXT("agent_response_part"), PartText); FString PartText;
bFound = true; if ((*PartEvent)->TryGetStringField(TEXT("agent_response_part"), PartText) && !PartText.IsEmpty())
}
}
if (bFound && !PartText.IsEmpty())
{ {
OnAgentResponsePart.Broadcast(PartText); OnAgentResponsePart.Broadcast(PartText);
} }
} }
}
void UElevenLabsWebSocketProxy::HandleInterruption(const TSharedPtr<FJsonObject>& Root) void UElevenLabsWebSocketProxy::HandleInterruption(const TSharedPtr<FJsonObject>& Root)
{ {
@ -682,11 +658,8 @@ void UElevenLabsWebSocketProxy::SendJsonMessage(const TSharedPtr<FJsonObject>& J
UE_LOG(LogElevenLabsWS, Verbose, TEXT("<< %s"), *Out); UE_LOG(LogElevenLabsWS, Verbose, TEXT("<< %s"), *Out);
} }
{
FScopeLock Lock(&WebSocketSendLock);
WebSocket->Send(Out); WebSocket->Send(Out);
} }
}
FString UElevenLabsWebSocketProxy::BuildWebSocketURL(const FString& AgentIDOverride, const FString& APIKeyOverride) const FString UElevenLabsWebSocketProxy::BuildWebSocketURL(const FString& AgentIDOverride, const FString& APIKeyOverride) const
{ {

View File

@ -7,7 +7,6 @@
#include "ElevenLabsDefinitions.h" #include "ElevenLabsDefinitions.h"
#include "ElevenLabsWebSocketProxy.h" #include "ElevenLabsWebSocketProxy.h"
#include "Sound/SoundWaveProcedural.h" #include "Sound/SoundWaveProcedural.h"
#include <atomic>
#include "ElevenLabsConversationalAgentComponent.generated.h" #include "ElevenLabsConversationalAgentComponent.generated.h"
class UAudioComponent; class UAudioComponent;
@ -86,113 +85,139 @@ public:
// ── Configuration ───────────────────────────────────────────────────────── // ── Configuration ─────────────────────────────────────────────────────────
/** ElevenLabs Agent ID used for this conversation. Leave empty to use the default from Project Settings > ElevenLabs. */ /**
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs", * ElevenLabs Agent ID. Overrides the project-level default in Project Settings.
meta = (ToolTip = "ElevenLabs Agent ID. Leave empty to use the project default from Project Settings.")) * Leave empty to use the project default.
*/
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs")
FString AgentID; FString AgentID;
/** How turn-taking is managed between the user and the agent.\n- Server VAD (recommended): ElevenLabs automatically detects when the user stops speaking.\n- Client Controlled: You manually call StartListening/StopListening (push-to-talk with a key). */ /**
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs", * Turn mode:
meta = (ToolTip = "Turn-taking mode.\n- Server VAD: ElevenLabs detects end-of-speech automatically (hands-free).\n- Client Controlled: You call StartListening/StopListening manually (push-to-talk).")) * - Server VAD: ElevenLabs detects end-of-speech automatically (recommended).
* - Client Controlled: you call StartListening/StopListening manually (push-to-talk).
*/
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs")
EElevenLabsTurnMode TurnMode = EElevenLabsTurnMode::Server; EElevenLabsTurnMode TurnMode = EElevenLabsTurnMode::Server;
/** Automatically open the microphone as soon as the WebSocket connection is established. Only applies in Server VAD mode. In Client (push-to-talk) mode, you must call StartListening manually. */ /**
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs", * Automatically start listening (microphone capture) once the WebSocket is
meta = (ToolTip = "Auto-open the microphone when the conversation starts.\nOnly applies in Server VAD mode. In push-to-talk mode, call StartListening() manually.")) * connected and the conversation is initiated.
*/
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs")
bool bAutoStartListening = true; bool bAutoStartListening = true;
/** Let the LLM start generating a response during silence, before the VAD is fully confident the user has finished speaking. Saves 200-500ms of latency but may be unstable in long multi-turn sessions. Disabled by default. */ /**
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency", * Enable speculative turn: the LLM starts generating a response during
meta = (ToolTip = "Speculative turn: the LLM begins generating during silence before full turn-end confidence.\nReduces latency by 200-500ms. May be unstable in long sessions — test before enabling in production.")) * silence before the VAD is fully confident the user has finished speaking.
* Reduces latency by 200-500ms but caused the server to silently stop
* processing user audio after 2 turns when combined with a short turn_timeout.
* Disabled by default until ElevenLabs confirms stability in multi-turn sessions.
*/
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency")
bool bSpeculativeTurn = false; bool bSpeculativeTurn = false;
/** How many milliseconds of microphone audio to accumulate before sending a chunk to ElevenLabs. Lower values reduce latency but may degrade voice detection accuracy. Higher values are more reliable but add delay. */ /**
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency", * Allow the user to interrupt the agent while it is playing audio (speaking).
meta = (ClampMin = "20", ClampMax = "500", * When true, calling StartListening() while the agent is audibly speaking automatically
ToolTip = "Mic audio chunk duration sent to ElevenLabs.\n- 50-80ms: lower latency, less reliable voice detection.\n- 100ms (default): good balance.\n- 150-250ms: more reliable, higher latency.")) * sends an interruption signal to the server and opens the mic — no Blueprint nodes needed.
int32 MicChunkDurationMs = 100; * When false, StartListening() is silently ignored until the agent finishes speaking.
*
/** Allow the user to interrupt the agent while it is speaking.\n- In Server VAD mode: the microphone stays open during agent speech and the server detects interruptions automatically.\n- In Client (push-to-talk) mode: pressing the talk key while the agent speaks sends an interrupt signal.\n- When disabled: the user must wait for the agent to finish speaking before talking. */ * NOTE: interruption only applies during the audio-playback phase (bAgentSpeaking).
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs", * While the agent is generating but has not yet started speaking, StartListening() is
meta = (ToolTip = "Allow the user to interrupt the agent while it speaks.\n- Server VAD: mic stays open, server detects user voice automatically.\n- Push-to-talk: pressing the talk key interrupts the agent.\n- Disabled: user must wait for the agent to finish.")) * always blocked regardless of this flag — this prevents Blueprint's OnAgentStartedGenerating
* handler (which often calls StartListening for bookkeeping) from accidentally cancelling
* the response before any audio plays.
*/
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs")
bool bAllowInterruption = true; bool bAllowInterruption = true;
/** Enable the OnAgentTranscript event, which provides real-time speech-to-text of what the user is saying. Disable if you don't need to display user speech to reduce processing overhead. */ /**
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events", * Forward user speech transcripts (user_transcript events) to the
meta = (ToolTip = "Fire OnAgentTranscript with real-time speech-to-text of user speech.\nDisable if you don't need to display what the user said.")) * OnAgentTranscript delegate. Disable to reduce overhead if you don't
* need to display what the user said.
*/
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events")
bool bEnableUserTranscript = true; bool bEnableUserTranscript = true;
/** Enable the OnAgentTextResponse event, which provides the agent's complete text response once fully generated. Disable if you only need the audio output. */ /**
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events", * Forward agent text responses (agent_response events) to the
meta = (ToolTip = "Fire OnAgentTextResponse with the agent's complete text once fully generated.\nDisable if you only need the audio output.")) * OnAgentTextResponse delegate. Disable if you only need audio output.
*/
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events")
bool bEnableAgentTextResponse = true; bool bEnableAgentTextResponse = true;
/** Enable the OnAgentPartialResponse event, which streams the agent's text word-by-word as the LLM generates it. Use this for real-time subtitles that appear while the agent speaks, rather than waiting for the full response. */ /**
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events", * Forward streaming text parts (agent_chat_response_part events) to the
meta = (ToolTip = "Fire OnAgentPartialResponse with streaming text fragments as the LLM generates them.\nIdeal for real-time subtitles. Each event gives one text chunk, not the accumulated text.")) * OnAgentPartialResponse delegate. Each part is a text fragment as the LLM
* generates it — use this for real-time subtitles that appear while the agent
* speaks, instead of waiting for the full text (OnAgentTextResponse).
*/
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events")
bool bEnableAgentPartialResponse = false; bool bEnableAgentPartialResponse = false;
/** Safety timeout: if the server does not start generating a response within this many seconds after the user stops speaking, fire OnAgentResponseTimeout. Set to 0 to disable. A normal response starts within 0.1-0.8s. */ /**
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs", * How many seconds to wait for the server to start generating a response
meta = (ClampMin = "0.0", * after the user stops speaking (StopListening) before firing OnAgentResponseTimeout.
ToolTip = "Seconds to wait for a server response after the user stops speaking.\nFires OnAgentResponseTimeout if exceeded. Normal latency is 0.1-0.8s.\nSet to 0 to disable. Default: 10s.")) * Set to 0 to disable. Default: 10 seconds.
*
* A typical healthy round-trip is 0.1-0.8s to first agent_chat_response_part.
* Values above 10s are extremely unusual and almost always indicate a server issue.
*/
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs", meta = (ClampMin = "0.0"))
float ResponseTimeoutSeconds = 10.0f; float ResponseTimeoutSeconds = 10.0f;
// ── Events ──────────────────────────────────────────────────────────────── // ── Events ────────────────────────────────────────────────────────────────
/** Fired when the WebSocket connection is established and the conversation session is ready. Provides the ConversationID and AgentID. */ UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
meta = (ToolTip = "Fires when the connection to ElevenLabs is established and the conversation is ready to begin."))
FOnAgentConnected OnAgentConnected; FOnAgentConnected OnAgentConnected;
/** Fired when the WebSocket connection is closed (gracefully or due to an error). Provides the status code and reason. */ UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
meta = (ToolTip = "Fires when the connection to ElevenLabs is closed. Check StatusCode and Reason for details."))
FOnAgentDisconnected OnAgentDisconnected; FOnAgentDisconnected OnAgentDisconnected;
/** Fired on any connection or protocol error. The error message describes what went wrong. */ UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
meta = (ToolTip = "Fires on connection or protocol errors. The ErrorMessage describes the issue."))
FOnAgentError OnAgentError; FOnAgentError OnAgentError;
/** Fired with real-time speech-to-text of the user's voice. Includes both tentative (in-progress) and final transcripts. Requires bEnableUserTranscript to be true. */ /** Fired for every transcript segment (user speech or agent speech, tentative and final). */
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events", UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
meta = (ToolTip = "Real-time speech-to-text of the user's voice.\nIncludes tentative and final transcripts. Enable with bEnableUserTranscript."))
FOnAgentTranscript OnAgentTranscript; FOnAgentTranscript OnAgentTranscript;
/** Fired once when the agent's complete text response is available. This is the full text that corresponds to the audio the agent speaks. Requires bEnableAgentTextResponse to be true. */ /** Final text response produced by the agent (mirrors the audio). */
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events", UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
meta = (ToolTip = "The agent's complete text response (matches the spoken audio).\nFires once when the full text is ready. Enable with bEnableAgentTextResponse."))
FOnAgentTextResponse OnAgentTextResponse; FOnAgentTextResponse OnAgentTextResponse;
/** Fired repeatedly as the LLM generates text, providing one word/fragment at a time. Use for real-time subtitles. Each call gives a new fragment, NOT the accumulated text. Requires bEnableAgentPartialResponse to be true. */ /**
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events", * Streaming text fragments as the LLM generates them.
meta = (ToolTip = "Streaming text fragments as the LLM generates them (word by word).\nIdeal for real-time subtitles. Enable with bEnableAgentPartialResponse.")) * Fires for every agent_chat_response_part — each call gives one text chunk.
* Enable with bEnableAgentPartialResponse.
*/
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
FOnAgentPartialResponse OnAgentPartialResponse; FOnAgentPartialResponse OnAgentPartialResponse;
/** Fired when the agent begins playing audio (first audio chunk received). Use this to trigger speech animations or UI indicators. */ UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
meta = (ToolTip = "Fires when the agent starts speaking (first audio chunk). Use for lip-sync or UI feedback."))
FOnAgentStartedSpeaking OnAgentStartedSpeaking; FOnAgentStartedSpeaking OnAgentStartedSpeaking;
/** Fired when the agent finishes playing all audio. Use this to re-open the microphone (in Server VAD mode without interruption) or update UI. */ UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
meta = (ToolTip = "Fires when the agent finishes speaking. Use to re-open the mic or update UI."))
FOnAgentStoppedSpeaking OnAgentStoppedSpeaking; FOnAgentStoppedSpeaking OnAgentStoppedSpeaking;
/** Fired when the agent's speech is interrupted (either by the user speaking over it, or by a manual InterruptAgent call). The audio playback is automatically stopped. */ UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
meta = (ToolTip = "Fires when the agent is interrupted mid-speech. Audio is automatically stopped."))
FOnAgentInterrupted OnAgentInterrupted; FOnAgentInterrupted OnAgentInterrupted;
/** Fired when the server starts generating a response (before any audio arrives). Use this for "thinking..." UI feedback. In push-to-talk mode, the microphone is automatically closed when this fires. */ /**
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events", * Fired when the server starts generating a response (before audio).
meta = (ToolTip = "Fires when the server starts generating (before audio arrives).\nUse for 'thinking...' UI. Mic is auto-closed in push-to-talk mode.")) * The component automatically stops the microphone when this fires while listening,
* so the Blueprint doesn't need to handle this manually for push-to-talk.
* Bind here if you need UI feedback ("agent is thinking...").
*/
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
FOnAgentStartedGenerating OnAgentStartedGenerating; FOnAgentStartedGenerating OnAgentStartedGenerating;
/** Fired if the server does not start generating a response within ResponseTimeoutSeconds after the user stops speaking. Use this to show a "try again" message or automatically re-open the microphone. */ /**
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events", * Fired when the server has not started generating within ResponseTimeoutSeconds
meta = (ToolTip = "Fires if the server doesn't respond within ResponseTimeoutSeconds.\nUse to show 'try again' or re-open the mic automatically.")) * after StopListening was called. Bind here to give the user feedback such as
* "I didn't get a response, please try again" or to automatically re-open the mic.
*/
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
FOnAgentResponseTimeout OnAgentResponseTimeout; FOnAgentResponseTimeout OnAgentResponseTimeout;
// ── Control ─────────────────────────────────────────────────────────────── // ── Control ───────────────────────────────────────────────────────────────
@ -312,9 +337,8 @@ private:
USoundWaveProcedural* ProceduralSoundWave = nullptr; USoundWaveProcedural* ProceduralSoundWave = nullptr;
// ── State ───────────────────────────────────────────────────────────────── // ── State ─────────────────────────────────────────────────────────────────
// Atomic: read from WASAPI background thread (OnMicrophoneDataCaptured), written from game thread. bool bIsListening = false;
std::atomic<bool> bIsListening{false}; bool bAgentSpeaking = false;
std::atomic<bool> bAgentSpeaking{false};
// True from the first agent_chat_response_part until the first audio chunk arrives. // True from the first agent_chat_response_part until the first audio chunk arrives.
// Used to block StartListening() while the server is processing the previous turn. // Used to block StartListening() while the server is processing the previous turn.
bool bAgentGenerating = false; bool bAgentGenerating = false;
@ -375,12 +399,6 @@ private:
// WASAPI fires callbacks every ~5ms (158 bytes at 16kHz 16-bit mono). // WASAPI fires callbacks every ~5ms (158 bytes at 16kHz 16-bit mono).
// ElevenLabs needs at least ~100ms (3200 bytes) per chunk for reliable VAD/STT. // ElevenLabs needs at least ~100ms (3200 bytes) per chunk for reliable VAD/STT.
// We accumulate here and only call SendAudioChunk once enough bytes are ready. // We accumulate here and only call SendAudioChunk once enough bytes are ready.
// MicSendLock protects MicAccumulationBuffer + WebSocket send (accessed from WASAPI thread
// in OnMicrophoneDataCaptured and from game thread in StopListening flush).
TArray<uint8> MicAccumulationBuffer; TArray<uint8> MicAccumulationBuffer;
FCriticalSection MicSendLock; static constexpr int32 MicChunkMinBytes = 8000; // 250ms @ 16kHz 16-bit mono (4000 samples, matches ElevenLabs SDK recommendation)
/**
 * Minimum number of accumulated PCM bytes corresponding to the user-facing
 * MicChunkDurationMs setting.
 *
 * Per the documented formula: 16000 samples/s * (ms / 1000) * 2 bytes per
 * sample, which reduces to 32 bytes per millisecond.
 *
 * @return MicChunkDurationMs converted to a byte count.
 */
int32 GetMicChunkMinBytes() const
{
    // 16 samples per millisecond at 16 kHz, 2 bytes per 16-bit sample.
    constexpr int32 BytesPerMillisecond = 32;
    return MicChunkDurationMs * BytesPerMillisecond;
}
}; };

View File

@ -5,7 +5,6 @@
#include "CoreMinimal.h" #include "CoreMinimal.h"
#include "Components/ActorComponent.h" #include "Components/ActorComponent.h"
#include "AudioCapture.h" #include "AudioCapture.h"
#include <atomic>
#include "ElevenLabsMicrophoneCaptureComponent.generated.h" #include "ElevenLabsMicrophoneCaptureComponent.generated.h"
// Delivers captured float PCM samples (16000 Hz mono, resampled from device rate). // Delivers captured float PCM samples (16000 Hz mono, resampled from device rate).
@ -28,24 +27,17 @@ class PS_AI_AGENT_ELEVENLABS_API UElevenLabsMicrophoneCaptureComponent : public
public: public:
UElevenLabsMicrophoneCaptureComponent(); UElevenLabsMicrophoneCaptureComponent();
/** Multiplier applied to the microphone input volume before sending to ElevenLabs. Increase if the agent has trouble hearing you, decrease if your audio is clipping. Default: 1.0 (no change). */ /** Volume multiplier applied to captured samples before forwarding. */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Microphone", UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Microphone",
meta = (ClampMin = "0.0", ClampMax = "4.0", meta = (ClampMin = "0.0", ClampMax = "4.0"))
ToolTip = "Microphone volume multiplier.\n1.0 = no change. Increase if the agent can't hear you, decrease if audio clips."))
float VolumeMultiplier = 1.0f; float VolumeMultiplier = 1.0f;
/** /**
* Delegate fired on the game thread each time a new chunk of PCM audio is * Delegate fired on the game thread each time a new chunk of PCM audio
* captured. Samples are float32, resampled to 16000 Hz mono. * is captured. Samples are float32, resampled to 16000 Hz mono.
* Audio is captured on a WASAPI background thread, resampled there (with
* echo suppression), then dispatched to the game thread for this broadcast.
*/ */
FOnElevenLabsAudioCaptured OnAudioCaptured; FOnElevenLabsAudioCaptured OnAudioCaptured;
/** Optional pointer to an atomic bool that suppresses capture when true.
* Set by the agent component for echo suppression (skip mic while agent speaks). */
std::atomic<bool>* EchoSuppressFlag = nullptr;
/** Open the default capture device and begin streaming audio. */ /** Open the default capture device and begin streaming audio. */
UFUNCTION(BlueprintCallable, Category = "ElevenLabs") UFUNCTION(BlueprintCallable, Category = "ElevenLabs")
void StartCapture(); void StartCapture();
@ -73,7 +65,7 @@ private:
Audio::FAudioCapture AudioCapture; Audio::FAudioCapture AudioCapture;
Audio::FAudioCaptureDeviceParams DeviceParams; Audio::FAudioCaptureDeviceParams DeviceParams;
std::atomic<bool> bCapturing{false}; bool bCapturing = false;
// Device sample rate discovered on StartCapture // Device sample rate discovered on StartCapture
int32 DeviceSampleRate = 44100; int32 DeviceSampleRate = 44100;

View File

@ -205,10 +205,6 @@ private:
EElevenLabsConnectionState ConnectionState = EElevenLabsConnectionState::Disconnected; EElevenLabsConnectionState ConnectionState = EElevenLabsConnectionState::Disconnected;
FElevenLabsConversationInfo ConversationInfo; FElevenLabsConversationInfo ConversationInfo;
// Serializes WebSocket->Send() calls — needed because SendAudioChunk can now be
// called from the WASAPI background thread while SendJsonMessage runs on game thread.
FCriticalSection WebSocketSendLock;
// Accumulation buffer for multi-fragment binary WebSocket frames. // Accumulation buffer for multi-fragment binary WebSocket frames.
// ElevenLabs sends JSON as binary frames; large messages arrive in fragments. // ElevenLabs sends JSON as binary frames; large messages arrive in fragments.
TArray<uint8> BinaryFrameBuffer; TArray<uint8> BinaryFrameBuffer;