Working !
This commit is contained in:
parent
f7f0b0c45b
commit
9f28ed7457
@ -0,0 +1,8 @@
|
||||
[FilterPlugin]
|
||||
; This section lists additional files which will be packaged along with your plugin. Paths should be listed relative to the root plugin directory, and
|
||||
; may include "...", "*", and "?" wildcards to match directories, files, and individual characters respectively.
|
||||
;
|
||||
; Examples:
|
||||
; /README.txt
|
||||
; /Extras/...
|
||||
; /Binaries/ThirdParty/*.dll
|
||||
@ -86,9 +86,9 @@ void UElevenLabsConversationalAgentComponent::StartConversation()
|
||||
&UElevenLabsConversationalAgentComponent::HandleInterrupted);
|
||||
}
|
||||
|
||||
// Pass our TurnMode to the proxy so it sends the correct mode in
|
||||
// conversation_initiation_client_data and sends user_activity with each audio chunk.
|
||||
// Pass configuration to the proxy before connecting.
|
||||
WebSocketProxy->TurnMode = TurnMode;
|
||||
WebSocketProxy->bSpeculativeTurn = bSpeculativeTurn;
|
||||
|
||||
WebSocketProxy->Connect(AgentID);
|
||||
}
|
||||
@ -242,12 +242,18 @@ void UElevenLabsConversationalAgentComponent::HandleAudioReceived(const TArray<u
|
||||
|
||||
void UElevenLabsConversationalAgentComponent::HandleTranscript(const FElevenLabsTranscriptSegment& Segment)
|
||||
{
|
||||
OnAgentTranscript.Broadcast(Segment);
|
||||
if (bEnableUserTranscript)
|
||||
{
|
||||
OnAgentTranscript.Broadcast(Segment);
|
||||
}
|
||||
}
|
||||
|
||||
void UElevenLabsConversationalAgentComponent::HandleAgentResponse(const FString& ResponseText)
|
||||
{
|
||||
OnAgentTextResponse.Broadcast(ResponseText);
|
||||
if (bEnableAgentTextResponse)
|
||||
{
|
||||
OnAgentTextResponse.Broadcast(ResponseText);
|
||||
}
|
||||
}
|
||||
|
||||
void UElevenLabsConversationalAgentComponent::HandleInterrupted()
|
||||
|
||||
@ -106,6 +106,9 @@ void UElevenLabsWebSocketProxy::SendAudioChunk(const TArray<uint8>& PCMData)
|
||||
|
||||
UE_LOG(LogElevenLabsWS, Log, TEXT("SendAudioChunk: %d bytes (PCM int16 LE @ 16kHz mono)"), PCMData.Num());
|
||||
|
||||
// Track when the last audio chunk was sent for latency measurement.
|
||||
LastAudioChunkSentTime = FPlatformTime::Seconds();
|
||||
|
||||
// ElevenLabs expects: { "user_audio_chunk": "<base64 PCM>" }
|
||||
// The server's VAD detects silence to determine end-of-turn.
|
||||
// Do NOT send user_activity here — it resets the turn timeout timer
|
||||
@ -143,6 +146,9 @@ void UElevenLabsWebSocketProxy::SendUserTurnEnd()
|
||||
{
|
||||
// No explicit "end turn" message exists in the ElevenLabs API.
|
||||
// The server detects end-of-speech via VAD when we stop sending audio chunks.
|
||||
UserTurnEndTime = FPlatformTime::Seconds();
|
||||
bWaitingForResponse = true;
|
||||
bFirstAudioResponseLogged = false;
|
||||
UE_LOG(LogElevenLabsWS, Log, TEXT("User turn ended — stopped sending audio chunks. Server VAD will detect silence."));
|
||||
}
|
||||
|
||||
@ -188,7 +194,7 @@ void UElevenLabsWebSocketProxy::OnWsConnected()
|
||||
// "type": "conversation_initiation_client_data",
|
||||
// "conversation_config_override": {
|
||||
// "agent": {
|
||||
// "turn": { "turn_timeout": 3 }
|
||||
// "turn": { "turn_timeout": 3, "speculative_turn": true }
|
||||
// },
|
||||
// "tts": {
|
||||
// "optimize_streaming_latency": 3
|
||||
@ -206,10 +212,17 @@ void UElevenLabsWebSocketProxy::OnWsConnected()
|
||||
// uses its VAD to detect the end of speech from the audio chunks it receives.
|
||||
TSharedPtr<FJsonObject> TurnObj = MakeShareable(new FJsonObject());
|
||||
// Lower turn_timeout so the agent responds faster after the user stops speaking.
|
||||
// Default is 7s which feels very slow for push-to-talk.
|
||||
// Default is 7s. In push-to-talk (Client mode), the user explicitly signals
|
||||
// end-of-turn by releasing the key, so we can use a very short timeout (1s).
|
||||
if (TurnMode == EElevenLabsTurnMode::Client)
|
||||
{
|
||||
TurnObj->SetNumberField(TEXT("turn_timeout"), 3);
|
||||
TurnObj->SetNumberField(TEXT("turn_timeout"), 1);
|
||||
}
|
||||
// Speculative turn: start LLM generation during silence before the VAD is
|
||||
// fully confident the user finished speaking. Reduces latency by 200-500ms.
|
||||
if (bSpeculativeTurn)
|
||||
{
|
||||
TurnObj->SetBoolField(TEXT("speculative_turn"), true);
|
||||
}
|
||||
|
||||
TSharedPtr<FJsonObject> AgentObj = MakeShareable(new FJsonObject());
|
||||
@ -293,14 +306,44 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message)
|
||||
}
|
||||
else if (MsgType == ElevenLabsMessageType::AudioResponse)
|
||||
{
|
||||
// Log time-to-first-audio: latency between end of user turn and first agent audio.
|
||||
if (bWaitingForResponse && !bFirstAudioResponseLogged)
|
||||
{
|
||||
const double Now = FPlatformTime::Seconds();
|
||||
const double LatencyFromTurnEnd = (Now - UserTurnEndTime) * 1000.0;
|
||||
const double LatencyFromLastChunk = (Now - LastAudioChunkSentTime) * 1000.0;
|
||||
UE_LOG(LogElevenLabsWS, Warning,
|
||||
TEXT("[LATENCY] Time-to-first-audio: %.0f ms (from turn end), %.0f ms (from last chunk sent)"),
|
||||
LatencyFromTurnEnd, LatencyFromLastChunk);
|
||||
bFirstAudioResponseLogged = true;
|
||||
}
|
||||
HandleAudioResponse(Root);
|
||||
}
|
||||
else if (MsgType == ElevenLabsMessageType::UserTranscript)
|
||||
{
|
||||
// Log transcription latency.
|
||||
if (bWaitingForResponse)
|
||||
{
|
||||
const double Now = FPlatformTime::Seconds();
|
||||
const double LatencyFromTurnEnd = (Now - UserTurnEndTime) * 1000.0;
|
||||
UE_LOG(LogElevenLabsWS, Warning,
|
||||
TEXT("[LATENCY] User transcript received: %.0f ms after turn end"),
|
||||
LatencyFromTurnEnd);
|
||||
bWaitingForResponse = false;
|
||||
}
|
||||
HandleTranscript(Root);
|
||||
}
|
||||
else if (MsgType == ElevenLabsMessageType::AgentResponse)
|
||||
{
|
||||
// Log agent text response latency.
|
||||
if (UserTurnEndTime > 0.0)
|
||||
{
|
||||
const double Now = FPlatformTime::Seconds();
|
||||
const double LatencyFromTurnEnd = (Now - UserTurnEndTime) * 1000.0;
|
||||
UE_LOG(LogElevenLabsWS, Warning,
|
||||
TEXT("[LATENCY] Agent text response: %.0f ms after turn end"),
|
||||
LatencyFromTurnEnd);
|
||||
}
|
||||
HandleAgentResponse(Root);
|
||||
}
|
||||
else if (MsgType == ElevenLabsMessageType::AgentResponseCorrection)
|
||||
|
||||
@ -80,6 +80,29 @@ public:
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs")
|
||||
bool bAutoStartListening = true;
|
||||
|
||||
/**
|
||||
* Enable speculative turn: the LLM starts generating a response during
|
||||
* silence before the VAD is fully confident the user has finished speaking.
|
||||
* Reduces latency by 200-500ms but may occasionally produce premature responses.
|
||||
*/
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency")
|
||||
bool bSpeculativeTurn = true;
|
||||
|
||||
/**
|
||||
* Forward user speech transcripts (user_transcript events) to the
|
||||
* OnAgentTranscript delegate. Disable to reduce overhead if you don't
|
||||
* need to display what the user said.
|
||||
*/
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events")
|
||||
bool bEnableUserTranscript = true;
|
||||
|
||||
/**
|
||||
* Forward agent text responses (agent_response events) to the
|
||||
* OnAgentTextResponse delegate. Disable if you only need audio output.
|
||||
*/
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events")
|
||||
bool bEnableAgentTextResponse = true;
|
||||
|
||||
// ── Events ────────────────────────────────────────────────────────────────
|
||||
|
||||
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
|
||||
|
||||
@ -184,9 +184,21 @@ private:
|
||||
// ElevenLabs sends JSON as binary frames; large messages arrive in fragments.
|
||||
TArray<uint8> BinaryFrameBuffer;
|
||||
|
||||
// ── Latency tracking ─────────────────────────────────────────────────────
|
||||
// Timestamp of the last audio chunk sent (user speech).
|
||||
double LastAudioChunkSentTime = 0.0;
|
||||
// Timestamp when the user turn ended (recorded in SendUserTurnEnd()).
|
||||
double UserTurnEndTime = 0.0;
|
||||
// Whether we are waiting for the first response after the user stopped speaking (set in SendUserTurnEnd(), cleared when the user transcript arrives).
|
||||
bool bWaitingForResponse = false;
|
||||
// Whether we already logged the first audio response latency for this turn.
|
||||
bool bFirstAudioResponseLogged = false;
|
||||
|
||||
public:
|
||||
// Set by UElevenLabsConversationalAgentComponent before calling Connect().
|
||||
// Controls the turn mode string sent in conversation_initiation_client_data
|
||||
// AND whether user_activity is sent automatically with each audio chunk.
|
||||
// Controls turn_timeout in conversation_initiation_client_data.
|
||||
EElevenLabsTurnMode TurnMode = EElevenLabsTurnMode::Server;
|
||||
|
||||
// Speculative turn: start LLM generation during silence before full turn confidence.
|
||||
bool bSpeculativeTurn = true;
|
||||
};
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user