Working !
This commit is contained in:
parent
f7f0b0c45b
commit
9f28ed7457
@ -0,0 +1,8 @@
|
|||||||
|
[FilterPlugin]
|
||||||
|
; This section lists additional files which will be packaged along with your plugin. Paths should be listed relative to the root plugin directory, and
|
||||||
|
; may include "...", "*", and "?" wildcards to match directories, files, and individual characters respectively.
|
||||||
|
;
|
||||||
|
; Examples:
|
||||||
|
; /README.txt
|
||||||
|
; /Extras/...
|
||||||
|
; /Binaries/ThirdParty/*.dll
|
||||||
@ -86,9 +86,9 @@ void UElevenLabsConversationalAgentComponent::StartConversation()
|
|||||||
&UElevenLabsConversationalAgentComponent::HandleInterrupted);
|
&UElevenLabsConversationalAgentComponent::HandleInterrupted);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Pass our TurnMode to the proxy so it sends the correct mode in
|
// Pass configuration to the proxy before connecting.
|
||||||
// conversation_initiation_client_data and sends user_activity with each audio chunk.
|
|
||||||
WebSocketProxy->TurnMode = TurnMode;
|
WebSocketProxy->TurnMode = TurnMode;
|
||||||
|
WebSocketProxy->bSpeculativeTurn = bSpeculativeTurn;
|
||||||
|
|
||||||
WebSocketProxy->Connect(AgentID);
|
WebSocketProxy->Connect(AgentID);
|
||||||
}
|
}
|
||||||
@ -241,14 +241,20 @@ void UElevenLabsConversationalAgentComponent::HandleAudioReceived(const TArray<u
|
|||||||
}
|
}
|
||||||
|
|
||||||
void UElevenLabsConversationalAgentComponent::HandleTranscript(const FElevenLabsTranscriptSegment& Segment)
|
void UElevenLabsConversationalAgentComponent::HandleTranscript(const FElevenLabsTranscriptSegment& Segment)
|
||||||
|
{
|
||||||
|
if (bEnableUserTranscript)
|
||||||
{
|
{
|
||||||
OnAgentTranscript.Broadcast(Segment);
|
OnAgentTranscript.Broadcast(Segment);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void UElevenLabsConversationalAgentComponent::HandleAgentResponse(const FString& ResponseText)
|
void UElevenLabsConversationalAgentComponent::HandleAgentResponse(const FString& ResponseText)
|
||||||
|
{
|
||||||
|
if (bEnableAgentTextResponse)
|
||||||
{
|
{
|
||||||
OnAgentTextResponse.Broadcast(ResponseText);
|
OnAgentTextResponse.Broadcast(ResponseText);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void UElevenLabsConversationalAgentComponent::HandleInterrupted()
|
void UElevenLabsConversationalAgentComponent::HandleInterrupted()
|
||||||
{
|
{
|
||||||
|
|||||||
@ -106,6 +106,9 @@ void UElevenLabsWebSocketProxy::SendAudioChunk(const TArray<uint8>& PCMData)
|
|||||||
|
|
||||||
UE_LOG(LogElevenLabsWS, Log, TEXT("SendAudioChunk: %d bytes (PCM int16 LE @ 16kHz mono)"), PCMData.Num());
|
UE_LOG(LogElevenLabsWS, Log, TEXT("SendAudioChunk: %d bytes (PCM int16 LE @ 16kHz mono)"), PCMData.Num());
|
||||||
|
|
||||||
|
// Track when the last audio chunk was sent for latency measurement.
|
||||||
|
LastAudioChunkSentTime = FPlatformTime::Seconds();
|
||||||
|
|
||||||
// ElevenLabs expects: { "user_audio_chunk": "<base64 PCM>" }
|
// ElevenLabs expects: { "user_audio_chunk": "<base64 PCM>" }
|
||||||
// The server's VAD detects silence to determine end-of-turn.
|
// The server's VAD detects silence to determine end-of-turn.
|
||||||
// Do NOT send user_activity here — it resets the turn timeout timer
|
// Do NOT send user_activity here — it resets the turn timeout timer
|
||||||
@ -143,6 +146,9 @@ void UElevenLabsWebSocketProxy::SendUserTurnEnd()
|
|||||||
{
|
{
|
||||||
// No explicit "end turn" message exists in the ElevenLabs API.
|
// No explicit "end turn" message exists in the ElevenLabs API.
|
||||||
// The server detects end-of-speech via VAD when we stop sending audio chunks.
|
// The server detects end-of-speech via VAD when we stop sending audio chunks.
|
||||||
|
UserTurnEndTime = FPlatformTime::Seconds();
|
||||||
|
bWaitingForResponse = true;
|
||||||
|
bFirstAudioResponseLogged = false;
|
||||||
UE_LOG(LogElevenLabsWS, Log, TEXT("User turn ended — stopped sending audio chunks. Server VAD will detect silence."));
|
UE_LOG(LogElevenLabsWS, Log, TEXT("User turn ended — stopped sending audio chunks. Server VAD will detect silence."));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -188,7 +194,7 @@ void UElevenLabsWebSocketProxy::OnWsConnected()
|
|||||||
// "type": "conversation_initiation_client_data",
|
// "type": "conversation_initiation_client_data",
|
||||||
// "conversation_config_override": {
|
// "conversation_config_override": {
|
||||||
// "agent": {
|
// "agent": {
|
||||||
// "turn": { "turn_timeout": 3 }
|
// "turn": { "turn_timeout": 1, "speculative_turn": true }
|
||||||
// },
|
// },
|
||||||
// "tts": {
|
// "tts": {
|
||||||
// "optimize_streaming_latency": 3
|
// "optimize_streaming_latency": 3
|
||||||
@ -206,10 +212,17 @@ void UElevenLabsWebSocketProxy::OnWsConnected()
|
|||||||
// uses its VAD to detect the end of speech from the audio chunks it receives.
|
// uses its VAD to detect the end of speech from the audio chunks it receives.
|
||||||
TSharedPtr<FJsonObject> TurnObj = MakeShareable(new FJsonObject());
|
TSharedPtr<FJsonObject> TurnObj = MakeShareable(new FJsonObject());
|
||||||
// Lower turn_timeout so the agent responds faster after the user stops speaking.
|
// Lower turn_timeout so the agent responds faster after the user stops speaking.
|
||||||
// Default is 7s which feels very slow for push-to-talk.
|
// Default is 7s. In push-to-talk (Client mode), the user explicitly signals
|
||||||
|
// end-of-turn by releasing the key, so we can use a very short timeout (1s).
|
||||||
if (TurnMode == EElevenLabsTurnMode::Client)
|
if (TurnMode == EElevenLabsTurnMode::Client)
|
||||||
{
|
{
|
||||||
TurnObj->SetNumberField(TEXT("turn_timeout"), 3);
|
TurnObj->SetNumberField(TEXT("turn_timeout"), 1);
|
||||||
|
}
|
||||||
|
// Speculative turn: start LLM generation during silence before the VAD is
|
||||||
|
// fully confident the user finished speaking. Reduces latency by 200-500ms.
|
||||||
|
if (bSpeculativeTurn)
|
||||||
|
{
|
||||||
|
TurnObj->SetBoolField(TEXT("speculative_turn"), true);
|
||||||
}
|
}
|
||||||
|
|
||||||
TSharedPtr<FJsonObject> AgentObj = MakeShareable(new FJsonObject());
|
TSharedPtr<FJsonObject> AgentObj = MakeShareable(new FJsonObject());
|
||||||
@ -293,14 +306,44 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message)
|
|||||||
}
|
}
|
||||||
else if (MsgType == ElevenLabsMessageType::AudioResponse)
|
else if (MsgType == ElevenLabsMessageType::AudioResponse)
|
||||||
{
|
{
|
||||||
|
// Log time-to-first-audio: latency between end of user turn and first agent audio.
|
||||||
|
if (bWaitingForResponse && !bFirstAudioResponseLogged)
|
||||||
|
{
|
||||||
|
const double Now = FPlatformTime::Seconds();
|
||||||
|
const double LatencyFromTurnEnd = (Now - UserTurnEndTime) * 1000.0;
|
||||||
|
const double LatencyFromLastChunk = (Now - LastAudioChunkSentTime) * 1000.0;
|
||||||
|
UE_LOG(LogElevenLabsWS, Warning,
|
||||||
|
TEXT("[LATENCY] Time-to-first-audio: %.0f ms (from turn end), %.0f ms (from last chunk sent)"),
|
||||||
|
LatencyFromTurnEnd, LatencyFromLastChunk);
|
||||||
|
bFirstAudioResponseLogged = true;
|
||||||
|
}
|
||||||
HandleAudioResponse(Root);
|
HandleAudioResponse(Root);
|
||||||
}
|
}
|
||||||
else if (MsgType == ElevenLabsMessageType::UserTranscript)
|
else if (MsgType == ElevenLabsMessageType::UserTranscript)
|
||||||
{
|
{
|
||||||
|
// Log transcription latency.
|
||||||
|
if (bWaitingForResponse)
|
||||||
|
{
|
||||||
|
const double Now = FPlatformTime::Seconds();
|
||||||
|
const double LatencyFromTurnEnd = (Now - UserTurnEndTime) * 1000.0;
|
||||||
|
UE_LOG(LogElevenLabsWS, Warning,
|
||||||
|
TEXT("[LATENCY] User transcript received: %.0f ms after turn end"),
|
||||||
|
LatencyFromTurnEnd);
|
||||||
|
bWaitingForResponse = false;
|
||||||
|
}
|
||||||
HandleTranscript(Root);
|
HandleTranscript(Root);
|
||||||
}
|
}
|
||||||
else if (MsgType == ElevenLabsMessageType::AgentResponse)
|
else if (MsgType == ElevenLabsMessageType::AgentResponse)
|
||||||
{
|
{
|
||||||
|
// Log agent text response latency.
|
||||||
|
if (UserTurnEndTime > 0.0)
|
||||||
|
{
|
||||||
|
const double Now = FPlatformTime::Seconds();
|
||||||
|
const double LatencyFromTurnEnd = (Now - UserTurnEndTime) * 1000.0;
|
||||||
|
UE_LOG(LogElevenLabsWS, Warning,
|
||||||
|
TEXT("[LATENCY] Agent text response: %.0f ms after turn end"),
|
||||||
|
LatencyFromTurnEnd);
|
||||||
|
}
|
||||||
HandleAgentResponse(Root);
|
HandleAgentResponse(Root);
|
||||||
}
|
}
|
||||||
else if (MsgType == ElevenLabsMessageType::AgentResponseCorrection)
|
else if (MsgType == ElevenLabsMessageType::AgentResponseCorrection)
|
||||||
|
|||||||
@ -80,6 +80,29 @@ public:
|
|||||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs")
|
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs")
|
||||||
bool bAutoStartListening = true;
|
bool bAutoStartListening = true;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Enable speculative turn: the LLM starts generating a response during
|
||||||
|
* silence before the VAD is fully confident the user has finished speaking.
|
||||||
|
* Reduces latency by 200-500ms but may occasionally produce premature responses.
|
||||||
|
*/
|
||||||
|
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency")
|
||||||
|
bool bSpeculativeTurn = true;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Forward user speech transcripts (user_transcript events) to the
|
||||||
|
* OnAgentTranscript delegate. Disable to reduce overhead if you don't
|
||||||
|
* need to display what the user said.
|
||||||
|
*/
|
||||||
|
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events")
|
||||||
|
bool bEnableUserTranscript = true;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Forward agent text responses (agent_response events) to the
|
||||||
|
* OnAgentTextResponse delegate. Disable if you only need audio output.
|
||||||
|
*/
|
||||||
|
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events")
|
||||||
|
bool bEnableAgentTextResponse = true;
|
||||||
|
|
||||||
// ── Events ────────────────────────────────────────────────────────────────
|
// ── Events ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
|
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
|
||||||
|
|||||||
@ -184,9 +184,21 @@ private:
|
|||||||
// ElevenLabs sends JSON as binary frames; large messages arrive in fragments.
|
// ElevenLabs sends JSON as binary frames; large messages arrive in fragments.
|
||||||
TArray<uint8> BinaryFrameBuffer;
|
TArray<uint8> BinaryFrameBuffer;
|
||||||
|
|
||||||
|
// ── Latency tracking ─────────────────────────────────────────────────────
|
||||||
|
// Timestamp of the last audio chunk sent (user speech).
|
||||||
|
double LastAudioChunkSentTime = 0.0;
|
||||||
|
// Timestamp when the user turn ended (recorded in SendUserTurnEnd(), e.g. on StopListening).
|
||||||
|
double UserTurnEndTime = 0.0;
|
||||||
|
// Whether we are waiting for the first response after user stopped speaking.
|
||||||
|
bool bWaitingForResponse = false;
|
||||||
|
// Whether we already logged the first audio response latency for this turn.
|
||||||
|
bool bFirstAudioResponseLogged = false;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
// Set by UElevenLabsConversationalAgentComponent before calling Connect().
|
// Set by UElevenLabsConversationalAgentComponent before calling Connect().
|
||||||
// Controls the turn mode string sent in conversation_initiation_client_data
|
// Controls turn_timeout in conversation_initiation_client_data.
|
||||||
// AND whether user_activity is sent automatically with each audio chunk.
|
|
||||||
EElevenLabsTurnMode TurnMode = EElevenLabsTurnMode::Server;
|
EElevenLabsTurnMode TurnMode = EElevenLabsTurnMode::Server;
|
||||||
|
|
||||||
|
// Speculative turn: start LLM generation during silence before full turn confidence.
|
||||||
|
bool bSpeculativeTurn = true;
|
||||||
};
|
};
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user