Compare commits
5 Commits
f23acc8c1c
...
4ee09f4e58
| Author | SHA1 | Date | |
|---|---|---|---|
| 4ee09f4e58 | |||
| a26361a7b2 | |||
| cb78e3249c | |||
| 20a6e30377 | |||
| 152fc6196d |
Binary file not shown.
@ -247,6 +247,19 @@ void UElevenLabsConversationalAgentComponent::StartListening()
|
||||
Mic->OnAudioCaptured.RemoveAll(this);
|
||||
Mic->OnAudioCaptured.AddUObject(this,
|
||||
&UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured);
|
||||
// Echo suppression: point the mic at our atomic bAgentSpeaking flag so it skips
|
||||
// capture entirely (before resampling) while the agent is speaking.
|
||||
// In Server VAD + interruption mode, disable echo suppression so the server
|
||||
// receives the user's voice even during agent playback — the server's own VAD
|
||||
// handles echo filtering and interruption detection.
|
||||
if (TurnMode == EElevenLabsTurnMode::Server && bAllowInterruption)
|
||||
{
|
||||
Mic->EchoSuppressFlag = nullptr;
|
||||
}
|
||||
else
|
||||
{
|
||||
Mic->EchoSuppressFlag = &bAgentSpeaking;
|
||||
}
|
||||
Mic->StartCapture();
|
||||
|
||||
const double T = TurnStartTime - SessionStartTime;
|
||||
@ -275,20 +288,23 @@ void UElevenLabsConversationalAgentComponent::StopListening()
|
||||
// "user speaking" state and stall waiting for more audio that never arrives,
|
||||
// leaving both sides stuck — no audio for the collision response and no response
|
||||
// for subsequent turns.
|
||||
if (bAgentGenerating)
|
||||
{
|
||||
if (MicAccumulationBuffer.Num() > 0)
|
||||
FScopeLock Lock(&MicSendLock);
|
||||
if (bAgentGenerating)
|
||||
{
|
||||
UE_LOG(LogElevenLabsAgent, Log,
|
||||
TEXT("StopListening: discarding %d bytes of accumulated mic audio (collision — server is mid-generation)."),
|
||||
MicAccumulationBuffer.Num());
|
||||
if (MicAccumulationBuffer.Num() > 0)
|
||||
{
|
||||
UE_LOG(LogElevenLabsAgent, Log,
|
||||
TEXT("StopListening: discarding %d bytes of accumulated mic audio (collision — server is mid-generation)."),
|
||||
MicAccumulationBuffer.Num());
|
||||
}
|
||||
}
|
||||
else if (MicAccumulationBuffer.Num() > 0 && WebSocketProxy && IsConnected())
|
||||
{
|
||||
WebSocketProxy->SendAudioChunk(MicAccumulationBuffer);
|
||||
}
|
||||
MicAccumulationBuffer.Reset();
|
||||
}
|
||||
else if (MicAccumulationBuffer.Num() > 0 && WebSocketProxy && IsConnected())
|
||||
{
|
||||
WebSocketProxy->SendAudioChunk(MicAccumulationBuffer);
|
||||
}
|
||||
MicAccumulationBuffer.Reset();
|
||||
|
||||
if (WebSocketProxy && TurnMode == EElevenLabsTurnMode::Client)
|
||||
{
|
||||
@ -394,7 +410,10 @@ void UElevenLabsConversationalAgentComponent::HandleDisconnected(int32 StatusCod
|
||||
GeneratingTickCount = 0;
|
||||
TurnIndex = 0;
|
||||
LastClosedTurnIndex = 0;
|
||||
MicAccumulationBuffer.Reset();
|
||||
{
|
||||
FScopeLock Lock(&MicSendLock);
|
||||
MicAccumulationBuffer.Reset();
|
||||
}
|
||||
OnAgentDisconnected.Broadcast(StatusCode, Reason);
|
||||
}
|
||||
|
||||
@ -451,22 +470,26 @@ void UElevenLabsConversationalAgentComponent::HandleAgentResponseStarted()
|
||||
const double LatencyFromTurnEnd = TurnEndTime > 0.0 ? Now - TurnEndTime : 0.0;
|
||||
if (bIsListening)
|
||||
{
|
||||
// Collision: server started generating Turn N's response while Turn M (M>N) mic was open.
|
||||
// The server's VAD detected a pause in the user's speech and started generating
|
||||
// prematurely — the user hasn't finished speaking yet.
|
||||
//
|
||||
// Stop the mic WITHOUT flushing the accumulated audio buffer (see StopListening's
|
||||
// bAgentGenerating guard). Flushing would send audio to a server that is mid-generation,
|
||||
// causing it to re-enter "user speaking" state and stall — both sides stuck.
|
||||
//
|
||||
// Do NOT send an interrupt here — just let the server's response play out:
|
||||
// - If audio arrives → EnqueueAgentAudio sets bAgentSpeaking, response plays normally.
|
||||
// - If audio never arrives → generating timeout (10s) clears bAgentGenerating.
|
||||
// Either way the state machine recovers and Blueprint can reopen the mic.
|
||||
UE_LOG(LogElevenLabsAgent, Log,
|
||||
TEXT("[T+%.2fs] [Turn %d → Turn %d collision] Agent generating Turn %d response — mic (Turn %d) was open, stopping. (%.2fs after turn end)"),
|
||||
T, LastClosedTurnIndex, TurnIndex, LastClosedTurnIndex, TurnIndex, LatencyFromTurnEnd);
|
||||
StopListening();
|
||||
// In Server VAD + interruption mode, keep the mic open so the server can
|
||||
// detect if the user speaks over the agent and send an interruption event.
|
||||
// The server handles echo filtering and VAD — we just keep streaming audio.
|
||||
if (TurnMode == EElevenLabsTurnMode::Server && bAllowInterruption)
|
||||
{
|
||||
UE_LOG(LogElevenLabsAgent, Log,
|
||||
TEXT("[T+%.2fs] [Turn %d] Agent generating — mic stays open (Server VAD + interruption). (%.2fs after turn end)"),
|
||||
T, LastClosedTurnIndex, LatencyFromTurnEnd);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Collision: server started generating Turn N's response while Turn M (M>N) mic was open.
|
||||
// Stop the mic WITHOUT flushing the accumulated audio buffer (see StopListening's
|
||||
// bAgentGenerating guard). Flushing would send audio to a server that is mid-generation,
|
||||
// causing it to re-enter "user speaking" state and stall — both sides stuck.
|
||||
UE_LOG(LogElevenLabsAgent, Log,
|
||||
TEXT("[T+%.2fs] [Turn %d → Turn %d collision] Agent generating Turn %d response — mic (Turn %d) was open, stopping. (%.2fs after turn end)"),
|
||||
T, LastClosedTurnIndex, TurnIndex, LastClosedTurnIndex, TurnIndex, LatencyFromTurnEnd);
|
||||
StopListening();
|
||||
}
|
||||
}
|
||||
|
||||
UE_LOG(LogElevenLabsAgent, Log,
|
||||
@ -606,18 +629,26 @@ void UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured(const TAr
|
||||
|
||||
// Echo suppression: skip sending mic audio while the agent is speaking.
|
||||
// This prevents the agent from hearing its own voice through the speakers,
|
||||
// which would confuse the server's VAD and STT. Matches the approach used
|
||||
// in the official ElevenLabs C++ SDK (outputPlaying_ flag).
|
||||
if (bAgentSpeaking) return;
|
||||
// which would confuse the server's VAD and STT.
|
||||
// In Server VAD + interruption mode, keep sending audio so the server can
|
||||
// detect the user speaking over the agent and trigger an interruption.
|
||||
if (bAgentSpeaking && !(TurnMode == EElevenLabsTurnMode::Server && bAllowInterruption))
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// Convert this callback's samples to int16 bytes and accumulate.
|
||||
// WASAPI fires every ~5ms (158 bytes at 16kHz). ElevenLabs needs ≥250ms
|
||||
// (8000 bytes) per chunk for reliable VAD and STT. We hold bytes here
|
||||
// WASAPI fires every ~5ms (158 bytes at 16kHz). ElevenLabs needs ≥100ms
|
||||
// (3200 bytes) per chunk for reliable VAD and STT. We hold bytes here
|
||||
// until we have enough, then send the whole batch in one WebSocket frame.
|
||||
TArray<uint8> PCMBytes = FloatPCMToInt16Bytes(FloatPCM);
|
||||
|
||||
// Lock: MicAccumulationBuffer is accessed from WASAPI thread (here) and
|
||||
// game thread (StopListening flush). WebSocket send is also serialized.
|
||||
FScopeLock Lock(&MicSendLock);
|
||||
MicAccumulationBuffer.Append(PCMBytes);
|
||||
|
||||
if (MicAccumulationBuffer.Num() >= MicChunkMinBytes)
|
||||
if (MicAccumulationBuffer.Num() >= GetMicChunkMinBytes())
|
||||
{
|
||||
WebSocketProxy->SendAudioChunk(MicAccumulationBuffer);
|
||||
MicAccumulationBuffer.Reset();
|
||||
|
||||
@ -89,6 +89,12 @@ void UElevenLabsMicrophoneCaptureComponent::OnAudioGenerate(
|
||||
UE_LOG(LogElevenLabsMic, Verbose, TEXT("Audio capture buffer overflow."));
|
||||
}
|
||||
|
||||
// Echo suppression: skip resampling + broadcasting entirely when agent is speaking.
|
||||
if (EchoSuppressFlag && EchoSuppressFlag->load(std::memory_order_relaxed))
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// Device sends float32 interleaved samples; cast from the void* API.
|
||||
const float* FloatAudio = static_cast<const float*>(InAudio);
|
||||
|
||||
@ -104,15 +110,20 @@ void UElevenLabsMicrophoneCaptureComponent::OnAudioGenerate(
|
||||
}
|
||||
}
|
||||
|
||||
// Fire the delegate on the game thread so subscribers don't need to be
|
||||
// thread-safe (WebSocket Send is not thread-safe in UE's implementation).
|
||||
AsyncTask(ENamedThreads::GameThread, [this, Data = MoveTemp(Resampled)]()
|
||||
// Dispatch to game thread for delegate broadcast.
|
||||
// UE's FMulticastDelegate with AddUObject uses weak object pointer checks that
|
||||
// are not thread-safe — broadcasting from the WASAPI thread causes the invocation
|
||||
// to be silently skipped. The game thread dispatch adds ~8ms latency but is required.
|
||||
if (bCapturing)
|
||||
{
|
||||
if (bCapturing)
|
||||
AsyncTask(ENamedThreads::GameThread, [this, Captured = MoveTemp(Resampled)]()
|
||||
{
|
||||
OnAudioCaptured.Broadcast(Data);
|
||||
}
|
||||
});
|
||||
if (bCapturing)
|
||||
{
|
||||
OnAudioCaptured.Broadcast(Captured);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
@ -120,9 +120,12 @@ void UElevenLabsWebSocketProxy::SendAudioChunk(const TArray<uint8>& PCMData)
|
||||
// Per-chunk log at Verbose only — Log level is too spammy (10+ lines per second).
|
||||
UE_LOG(LogElevenLabsWS, Verbose, TEXT("SendAudioChunk: %d bytes"), PCMData.Num());
|
||||
|
||||
if (WebSocket.IsValid() && WebSocket->IsConnected())
|
||||
{
|
||||
WebSocket->Send(AudioJson);
|
||||
FScopeLock Lock(&WebSocketSendLock);
|
||||
if (WebSocket.IsValid() && WebSocket->IsConnected())
|
||||
{
|
||||
WebSocket->Send(AudioJson);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -585,19 +588,40 @@ void UElevenLabsWebSocketProxy::HandleAgentChatResponsePart(const TSharedPtr<FJs
|
||||
}
|
||||
|
||||
// Extract the streaming text fragment and broadcast it.
|
||||
// API structure:
|
||||
// Current API structure (2026):
|
||||
// { "type": "agent_chat_response_part",
|
||||
// "text_response_part": { "text": "partial text", "type": "part"|"stop", "event_id": N }
|
||||
// }
|
||||
// Legacy structure (pre-2026):
|
||||
// { "type": "agent_chat_response_part",
|
||||
// "agent_chat_response_part_event": { "agent_response_part": "partial text" }
|
||||
// }
|
||||
const TSharedPtr<FJsonObject>* PartEvent = nullptr;
|
||||
if (Root->TryGetObjectField(TEXT("agent_chat_response_part_event"), PartEvent) && PartEvent)
|
||||
FString PartText;
|
||||
bool bFound = false;
|
||||
|
||||
// Try current format: text_response_part.text
|
||||
const TSharedPtr<FJsonObject>* TextPart = nullptr;
|
||||
if (Root->TryGetObjectField(TEXT("text_response_part"), TextPart) && TextPart)
|
||||
{
|
||||
FString PartText;
|
||||
if ((*PartEvent)->TryGetStringField(TEXT("agent_response_part"), PartText) && !PartText.IsEmpty())
|
||||
(*TextPart)->TryGetStringField(TEXT("text"), PartText);
|
||||
bFound = true;
|
||||
}
|
||||
|
||||
// Fallback: legacy format agent_chat_response_part_event.agent_response_part
|
||||
if (!bFound)
|
||||
{
|
||||
const TSharedPtr<FJsonObject>* PartEvent = nullptr;
|
||||
if (Root->TryGetObjectField(TEXT("agent_chat_response_part_event"), PartEvent) && PartEvent)
|
||||
{
|
||||
OnAgentResponsePart.Broadcast(PartText);
|
||||
(*PartEvent)->TryGetStringField(TEXT("agent_response_part"), PartText);
|
||||
bFound = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (bFound && !PartText.IsEmpty())
|
||||
{
|
||||
OnAgentResponsePart.Broadcast(PartText);
|
||||
}
|
||||
}
|
||||
|
||||
void UElevenLabsWebSocketProxy::HandleInterruption(const TSharedPtr<FJsonObject>& Root)
|
||||
@ -658,7 +682,10 @@ void UElevenLabsWebSocketProxy::SendJsonMessage(const TSharedPtr<FJsonObject>& J
|
||||
UE_LOG(LogElevenLabsWS, Verbose, TEXT("<< %s"), *Out);
|
||||
}
|
||||
|
||||
WebSocket->Send(Out);
|
||||
{
|
||||
FScopeLock Lock(&WebSocketSendLock);
|
||||
WebSocket->Send(Out);
|
||||
}
|
||||
}
|
||||
|
||||
FString UElevenLabsWebSocketProxy::BuildWebSocketURL(const FString& AgentIDOverride, const FString& APIKeyOverride) const
|
||||
|
||||
@ -7,6 +7,7 @@
|
||||
#include "ElevenLabsDefinitions.h"
|
||||
#include "ElevenLabsWebSocketProxy.h"
|
||||
#include "Sound/SoundWaveProcedural.h"
|
||||
#include <atomic>
|
||||
#include "ElevenLabsConversationalAgentComponent.generated.h"
|
||||
|
||||
class UAudioComponent;
|
||||
@ -85,139 +86,113 @@ public:
|
||||
|
||||
// ── Configuration ─────────────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
* ElevenLabs Agent ID. Overrides the project-level default in Project Settings.
|
||||
* Leave empty to use the project default.
|
||||
*/
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs")
|
||||
/** ElevenLabs Agent ID used for this conversation. Leave empty to use the default from Project Settings > ElevenLabs. */
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs",
|
||||
meta = (ToolTip = "ElevenLabs Agent ID. Leave empty to use the project default from Project Settings."))
|
||||
FString AgentID;
|
||||
|
||||
/**
|
||||
* Turn mode:
|
||||
* - Server VAD: ElevenLabs detects end-of-speech automatically (recommended).
|
||||
* - Client Controlled: you call StartListening/StopListening manually (push-to-talk).
|
||||
*/
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs")
|
||||
/** How turn-taking is managed between the user and the agent.\n- Server VAD (recommended): ElevenLabs automatically detects when the user stops speaking.\n- Client Controlled: You manually call StartListening/StopListening (push-to-talk with a key). */
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs",
|
||||
meta = (ToolTip = "Turn-taking mode.\n- Server VAD: ElevenLabs detects end-of-speech automatically (hands-free).\n- Client Controlled: You call StartListening/StopListening manually (push-to-talk)."))
|
||||
EElevenLabsTurnMode TurnMode = EElevenLabsTurnMode::Server;
|
||||
|
||||
/**
|
||||
* Automatically start listening (microphone capture) once the WebSocket is
|
||||
* connected and the conversation is initiated.
|
||||
*/
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs")
|
||||
/** Automatically open the microphone as soon as the WebSocket connection is established. Only applies in Server VAD mode. In Client (push-to-talk) mode, you must call StartListening manually. */
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs",
|
||||
meta = (ToolTip = "Auto-open the microphone when the conversation starts.\nOnly applies in Server VAD mode. In push-to-talk mode, call StartListening() manually."))
|
||||
bool bAutoStartListening = true;
|
||||
|
||||
/**
|
||||
* Enable speculative turn: the LLM starts generating a response during
|
||||
* silence before the VAD is fully confident the user has finished speaking.
|
||||
* Reduces latency by 200-500ms but caused the server to silently stop
|
||||
* processing user audio after 2 turns when combined with a short turn_timeout.
|
||||
* Disabled by default until ElevenLabs confirms stability in multi-turn sessions.
|
||||
*/
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency")
|
||||
/** Let the LLM start generating a response during silence, before the VAD is fully confident the user has finished speaking. Saves 200-500ms of latency but may be unstable in long multi-turn sessions. Disabled by default. */
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency",
|
||||
meta = (ToolTip = "Speculative turn: the LLM begins generating during silence before full turn-end confidence.\nReduces latency by 200-500ms. May be unstable in long sessions — test before enabling in production."))
|
||||
bool bSpeculativeTurn = false;
|
||||
|
||||
/**
|
||||
* Allow the user to interrupt the agent while it is playing audio (speaking).
|
||||
* When true, calling StartListening() while the agent is audibly speaking automatically
|
||||
* sends an interruption signal to the server and opens the mic — no Blueprint nodes needed.
|
||||
* When false, StartListening() is silently ignored until the agent finishes speaking.
|
||||
*
|
||||
* NOTE: interruption only applies during the audio-playback phase (bAgentSpeaking).
|
||||
* While the agent is generating but has not yet started speaking, StartListening() is
|
||||
* always blocked regardless of this flag — this prevents Blueprint's OnAgentStartedGenerating
|
||||
* handler (which often calls StartListening for bookkeeping) from accidentally cancelling
|
||||
* the response before any audio plays.
|
||||
*/
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs")
|
||||
/** How many milliseconds of microphone audio to accumulate before sending a chunk to ElevenLabs. Lower values reduce latency but may degrade voice detection accuracy. Higher values are more reliable but add delay. */
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency",
|
||||
meta = (ClampMin = "20", ClampMax = "500",
|
||||
ToolTip = "Mic audio chunk duration sent to ElevenLabs.\n- 50-80ms: lower latency, less reliable voice detection.\n- 100ms (default): good balance.\n- 150-250ms: more reliable, higher latency."))
|
||||
int32 MicChunkDurationMs = 100;
|
||||
|
||||
/** Allow the user to interrupt the agent while it is speaking.\n- In Server VAD mode: the microphone stays open during agent speech and the server detects interruptions automatically.\n- In Client (push-to-talk) mode: pressing the talk key while the agent speaks sends an interrupt signal.\n- When disabled: the user must wait for the agent to finish speaking before talking. */
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs",
|
||||
meta = (ToolTip = "Allow the user to interrupt the agent while it speaks.\n- Server VAD: mic stays open, server detects user voice automatically.\n- Push-to-talk: pressing the talk key interrupts the agent.\n- Disabled: user must wait for the agent to finish."))
|
||||
bool bAllowInterruption = true;
|
||||
|
||||
/**
|
||||
* Forward user speech transcripts (user_transcript events) to the
|
||||
* OnAgentTranscript delegate. Disable to reduce overhead if you don't
|
||||
* need to display what the user said.
|
||||
*/
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events")
|
||||
/** Enable the OnAgentTranscript event, which provides real-time speech-to-text of what the user is saying. Disable if you don't need to display user speech to reduce processing overhead. */
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events",
|
||||
meta = (ToolTip = "Fire OnAgentTranscript with real-time speech-to-text of user speech.\nDisable if you don't need to display what the user said."))
|
||||
bool bEnableUserTranscript = true;
|
||||
|
||||
/**
|
||||
* Forward agent text responses (agent_response events) to the
|
||||
* OnAgentTextResponse delegate. Disable if you only need audio output.
|
||||
*/
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events")
|
||||
/** Enable the OnAgentTextResponse event, which provides the agent's complete text response once fully generated. Disable if you only need the audio output. */
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events",
|
||||
meta = (ToolTip = "Fire OnAgentTextResponse with the agent's complete text once fully generated.\nDisable if you only need the audio output."))
|
||||
bool bEnableAgentTextResponse = true;
|
||||
|
||||
/**
|
||||
* Forward streaming text parts (agent_chat_response_part events) to the
|
||||
* OnAgentPartialResponse delegate. Each part is a text fragment as the LLM
|
||||
* generates it — use this for real-time subtitles that appear while the agent
|
||||
* speaks, instead of waiting for the full text (OnAgentTextResponse).
|
||||
*/
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events")
|
||||
/** Enable the OnAgentPartialResponse event, which streams the agent's text word-by-word as the LLM generates it. Use this for real-time subtitles that appear while the agent speaks, rather than waiting for the full response. */
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events",
|
||||
meta = (ToolTip = "Fire OnAgentPartialResponse with streaming text fragments as the LLM generates them.\nIdeal for real-time subtitles. Each event gives one text chunk, not the accumulated text."))
|
||||
bool bEnableAgentPartialResponse = false;
|
||||
|
||||
/**
|
||||
* How many seconds to wait for the server to start generating a response
|
||||
* after the user stops speaking (StopListening) before firing OnAgentResponseTimeout.
|
||||
* Set to 0 to disable. Default: 10 seconds.
|
||||
*
|
||||
* A typical healthy round-trip is 0.1–0.8s to first agent_chat_response_part.
|
||||
* Values above 10s are extremely unusual and almost always indicate a server issue.
|
||||
*/
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs", meta = (ClampMin = "0.0"))
|
||||
/** Safety timeout: if the server does not start generating a response within this many seconds after the user stops speaking, fire OnAgentResponseTimeout. Set to 0 to disable. A normal response starts within 0.1-0.8s. */
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs",
|
||||
meta = (ClampMin = "0.0",
|
||||
ToolTip = "Seconds to wait for a server response after the user stops speaking.\nFires OnAgentResponseTimeout if exceeded. Normal latency is 0.1-0.8s.\nSet to 0 to disable. Default: 10s."))
|
||||
float ResponseTimeoutSeconds = 10.0f;
|
||||
|
||||
// ── Events ────────────────────────────────────────────────────────────────
|
||||
|
||||
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
|
||||
/** Fired when the WebSocket connection is established and the conversation session is ready. Provides the ConversationID and AgentID. */
|
||||
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
|
||||
meta = (ToolTip = "Fires when the connection to ElevenLabs is established and the conversation is ready to begin."))
|
||||
FOnAgentConnected OnAgentConnected;
|
||||
|
||||
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
|
||||
/** Fired when the WebSocket connection is closed (gracefully or due to an error). Provides the status code and reason. */
|
||||
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
|
||||
meta = (ToolTip = "Fires when the connection to ElevenLabs is closed. Check StatusCode and Reason for details."))
|
||||
FOnAgentDisconnected OnAgentDisconnected;
|
||||
|
||||
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
|
||||
/** Fired on any connection or protocol error. The error message describes what went wrong. */
|
||||
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
|
||||
meta = (ToolTip = "Fires on connection or protocol errors. The ErrorMessage describes the issue."))
|
||||
FOnAgentError OnAgentError;
|
||||
|
||||
/** Fired for every transcript segment (user speech or agent speech, tentative and final). */
|
||||
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
|
||||
/** Fired with real-time speech-to-text of the user's voice. Includes both tentative (in-progress) and final transcripts. Requires bEnableUserTranscript to be true. */
|
||||
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
|
||||
meta = (ToolTip = "Real-time speech-to-text of the user's voice.\nIncludes tentative and final transcripts. Enable with bEnableUserTranscript."))
|
||||
FOnAgentTranscript OnAgentTranscript;
|
||||
|
||||
/** Final text response produced by the agent (mirrors the audio). */
|
||||
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
|
||||
/** Fired once when the agent's complete text response is available. This is the full text that corresponds to the audio the agent speaks. Requires bEnableAgentTextResponse to be true. */
|
||||
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
|
||||
meta = (ToolTip = "The agent's complete text response (matches the spoken audio).\nFires once when the full text is ready. Enable with bEnableAgentTextResponse."))
|
||||
FOnAgentTextResponse OnAgentTextResponse;
|
||||
|
||||
/**
|
||||
* Streaming text fragments as the LLM generates them.
|
||||
* Fires for every agent_chat_response_part — each call gives one text chunk.
|
||||
* Enable with bEnableAgentPartialResponse.
|
||||
*/
|
||||
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
|
||||
/** Fired repeatedly as the LLM generates text, providing one word/fragment at a time. Use for real-time subtitles. Each call gives a new fragment, NOT the accumulated text. Requires bEnableAgentPartialResponse to be true. */
|
||||
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
|
||||
meta = (ToolTip = "Streaming text fragments as the LLM generates them (word by word).\nIdeal for real-time subtitles. Enable with bEnableAgentPartialResponse."))
|
||||
FOnAgentPartialResponse OnAgentPartialResponse;
|
||||
|
||||
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
|
||||
/** Fired when the agent begins playing audio (first audio chunk received). Use this to trigger speech animations or UI indicators. */
|
||||
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
|
||||
meta = (ToolTip = "Fires when the agent starts speaking (first audio chunk). Use for lip-sync or UI feedback."))
|
||||
FOnAgentStartedSpeaking OnAgentStartedSpeaking;
|
||||
|
||||
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
|
||||
/** Fired when the agent finishes playing all audio. Use this to re-open the microphone (in Server VAD mode without interruption) or update UI. */
|
||||
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
|
||||
meta = (ToolTip = "Fires when the agent finishes speaking. Use to re-open the mic or update UI."))
|
||||
FOnAgentStoppedSpeaking OnAgentStoppedSpeaking;
|
||||
|
||||
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
|
||||
/** Fired when the agent's speech is interrupted (either by the user speaking over it, or by a manual InterruptAgent call). The audio playback is automatically stopped. */
|
||||
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
|
||||
meta = (ToolTip = "Fires when the agent is interrupted mid-speech. Audio is automatically stopped."))
|
||||
FOnAgentInterrupted OnAgentInterrupted;
|
||||
|
||||
/**
|
||||
* Fired when the server starts generating a response (before audio).
|
||||
* The component automatically stops the microphone when this fires while listening,
|
||||
* so the Blueprint doesn't need to handle this manually for push-to-talk.
|
||||
* Bind here if you need UI feedback ("agent is thinking...").
|
||||
*/
|
||||
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
|
||||
/** Fired when the server starts generating a response (before any audio arrives). Use this for "thinking..." UI feedback. In push-to-talk mode, the microphone is automatically closed when this fires. */
|
||||
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
|
||||
meta = (ToolTip = "Fires when the server starts generating (before audio arrives).\nUse for 'thinking...' UI. Mic is auto-closed in push-to-talk mode."))
|
||||
FOnAgentStartedGenerating OnAgentStartedGenerating;
|
||||
|
||||
/**
|
||||
* Fired when the server has not started generating within ResponseTimeoutSeconds
|
||||
* after StopListening was called. Bind here to give the user feedback such as
|
||||
* "I didn't get a response, please try again" or to automatically re-open the mic.
|
||||
*/
|
||||
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
|
||||
/** Fired if the server does not start generating a response within ResponseTimeoutSeconds after the user stops speaking. Use this to show a "try again" message or automatically re-open the microphone. */
|
||||
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
|
||||
meta = (ToolTip = "Fires if the server doesn't respond within ResponseTimeoutSeconds.\nUse to show 'try again' or re-open the mic automatically."))
|
||||
FOnAgentResponseTimeout OnAgentResponseTimeout;
|
||||
|
||||
// ── Control ───────────────────────────────────────────────────────────────
|
||||
@ -337,8 +312,9 @@ private:
|
||||
USoundWaveProcedural* ProceduralSoundWave = nullptr;
|
||||
|
||||
// ── State ─────────────────────────────────────────────────────────────────
|
||||
bool bIsListening = false;
|
||||
bool bAgentSpeaking = false;
|
||||
// Atomic: bAgentSpeaking is read on the WASAPI capture thread (through the mic component's EchoSuppressFlag in OnAudioGenerate); both flags are written from the game thread.
|
||||
std::atomic<bool> bIsListening{false};
|
||||
std::atomic<bool> bAgentSpeaking{false};
|
||||
// True from the first agent_chat_response_part until the first audio chunk arrives.
|
||||
// Used to block StartListening() while the server is processing the previous turn.
|
||||
bool bAgentGenerating = false;
|
||||
@ -399,6 +375,12 @@ private:
|
||||
// WASAPI fires callbacks every ~5ms (158 bytes at 16kHz 16-bit mono).
|
||||
// ElevenLabs needs at least ~100ms (3200 bytes) per chunk for reliable VAD/STT.
|
||||
// We accumulate here and only call SendAudioChunk once enough bytes are ready.
|
||||
// MicSendLock protects MicAccumulationBuffer + WebSocket send (accessed from WASAPI thread
|
||||
// in OnMicrophoneDataCaptured and from game thread in StopListening flush).
|
||||
TArray<uint8> MicAccumulationBuffer;
|
||||
static constexpr int32 MicChunkMinBytes = 8000; // 250ms @ 16kHz 16-bit mono (4000 samples, matches ElevenLabs SDK recommendation)
|
||||
FCriticalSection MicSendLock;
|
||||
|
||||
/** Compute the minimum bytes from the user-facing MicChunkDurationMs.
|
||||
* Formula: bytes = SampleRate * (ms / 1000) * BytesPerSample = 16000 * ms / 1000 * 2 = 32 * ms */
|
||||
int32 GetMicChunkMinBytes() const
{
	// The ClampMin/ClampMax metadata on MicChunkDurationMs only constrains the editor UI;
	// a value assigned at runtime (Blueprint or C++) can fall outside it. Re-apply the
	// documented 20-500 ms range here so a zero or negative duration cannot collapse the
	// threshold and make every small capture callback go out as its own chunk.
	constexpr int32 MinChunkMs = 20;
	constexpr int32 MaxChunkMs = 500;
	// 16000 samples/s * 2 bytes per sample / 1000 ms = 32 bytes per millisecond.
	constexpr int32 BytesPerMs = 32;

	int32 ChunkMs = MicChunkDurationMs;
	if (ChunkMs < MinChunkMs)
	{
		ChunkMs = MinChunkMs;
	}
	else if (ChunkMs > MaxChunkMs)
	{
		ChunkMs = MaxChunkMs;
	}
	return ChunkMs * BytesPerMs;
}
|
||||
};
|
||||
|
||||
@ -5,6 +5,7 @@
|
||||
#include "CoreMinimal.h"
|
||||
#include "Components/ActorComponent.h"
|
||||
#include "AudioCapture.h"
|
||||
#include <atomic>
|
||||
#include "ElevenLabsMicrophoneCaptureComponent.generated.h"
|
||||
|
||||
// Delivers captured float PCM samples (16000 Hz mono, resampled from device rate).
|
||||
@ -27,17 +28,24 @@ class PS_AI_AGENT_ELEVENLABS_API UElevenLabsMicrophoneCaptureComponent : public
|
||||
public:
|
||||
UElevenLabsMicrophoneCaptureComponent();
|
||||
|
||||
/** Volume multiplier applied to captured samples before forwarding. */
|
||||
/** Multiplier applied to the microphone input volume before sending to ElevenLabs. Increase if the agent has trouble hearing you, decrease if your audio is clipping. Default: 1.0 (no change). */
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Microphone",
|
||||
meta = (ClampMin = "0.0", ClampMax = "4.0"))
|
||||
meta = (ClampMin = "0.0", ClampMax = "4.0",
|
||||
ToolTip = "Microphone volume multiplier.\n1.0 = no change. Increase if the agent can't hear you, decrease if audio clips."))
|
||||
float VolumeMultiplier = 1.0f;
|
||||
|
||||
/**
|
||||
* Delegate fired on the game thread each time a new chunk of PCM audio
|
||||
* is captured. Samples are float32, resampled to 16000 Hz mono.
|
||||
* Delegate fired on the game thread each time a new chunk of PCM audio is
|
||||
* captured. Samples are float32, resampled to 16000 Hz mono.
|
||||
* Audio is captured on a WASAPI background thread, resampled there (with
|
||||
* echo suppression), then dispatched to the game thread for this broadcast.
|
||||
*/
|
||||
FOnElevenLabsAudioCaptured OnAudioCaptured;
|
||||
|
||||
/** Optional pointer to an atomic bool that suppresses capture when true.
|
||||
* Set by the agent component for echo suppression (skip mic while agent speaks). */
|
||||
std::atomic<bool>* EchoSuppressFlag = nullptr;
|
||||
|
||||
/** Open the default capture device and begin streaming audio. */
|
||||
UFUNCTION(BlueprintCallable, Category = "ElevenLabs")
|
||||
void StartCapture();
|
||||
@ -65,7 +73,7 @@ private:
|
||||
|
||||
Audio::FAudioCapture AudioCapture;
|
||||
Audio::FAudioCaptureDeviceParams DeviceParams;
|
||||
bool bCapturing = false;
|
||||
std::atomic<bool> bCapturing{false};
|
||||
|
||||
// Device sample rate discovered on StartCapture
|
||||
int32 DeviceSampleRate = 44100;
|
||||
|
||||
@ -205,6 +205,10 @@ private:
|
||||
EElevenLabsConnectionState ConnectionState = EElevenLabsConnectionState::Disconnected;
|
||||
FElevenLabsConversationInfo ConversationInfo;
|
||||
|
||||
// Serializes WebSocket->Send() calls — needed because SendAudioChunk can now be
|
||||
// called from the WASAPI background thread while SendJsonMessage runs on game thread.
|
||||
FCriticalSection WebSocketSendLock;
|
||||
|
||||
// Accumulation buffer for multi-fragment binary WebSocket frames.
|
||||
// ElevenLabs sends JSON as binary frames; large messages arrive in fragments.
|
||||
TArray<uint8> BinaryFrameBuffer;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user