Compare commits

...

3 Commits

Author SHA1 Message Date
f23acc8c1c Ajout sample CPP 2026-02-21 20:48:10 +01:00
d8957625f8 rollback to a more functional version but not perfect 2026-02-21 19:49:26 +01:00
1b883f532f Fix collision stuck state + add streaming partial text event
- Skip mic buffer flush during collision avoidance (bAgentGenerating guard
  in StopListening) to prevent sending audio to a mid-generation server
  which caused both sides to stall permanently
- Add OnAgentPartialResponse event: streams LLM text fragments from
  agent_chat_response_part in real-time (opt-in via bEnableAgentPartialResponse),
  separate from the existing OnAgentTextResponse (full text at end)
- French agent server drop after 3 turns is a server-side issue, not client

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 20:24:50 +01:00
15 changed files with 1086 additions and 83 deletions

View File

@ -0,0 +1,152 @@
# Build directories
build/
cmake-build-*/
out/
# Compiled Object files
*.slo
*.lo
*.o
*.obj
# Precompiled Headers
*.gch
*.pch
# Compiled Dynamic libraries
*.so
*.dylib
*.dll
# Fortran module files
*.mod
*.smod
# Compiled Static libraries
*.lai
*.la
*.a
*.lib
# Executables
*.exe
*.out
*.app
convai_cpp
# CMake
CMakeCache.txt
CMakeFiles/
CMakeScripts/
Testing/
Makefile
cmake_install.cmake
install_manifest.txt
compile_commands.json
CTestTestfile.cmake
_deps/
# IDE files
.vscode/
.idea/
*.swp
*.swo
*~
# macOS
.DS_Store
.AppleDouble
.LSOverride
# Thumbnails
._*
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
# Windows
Thumbs.db
ehthumbs.db
Desktop.ini
$RECYCLE.BIN/
*.cab
*.msi
*.msm
*.msp
*.lnk
# Linux
*~
.fuse_hidden*
.directory
.Trash-*
.nfs*
# Logs
*.log
# Runtime data
pids
*.pid
*.seed
*.pid.lock
# Coverage directory used by tools like istanbul
coverage/
# nyc test coverage
.nyc_output
# Dependency directories
node_modules/
# Optional npm cache directory
.npm
# Optional REPL history
.node_repl_history
# Output of 'npm pack'
*.tgz
# Yarn Integrity file
.yarn-integrity
# dotenv environment variables file
.env
.env.test
# parcel-bundler cache (https://parceljs.org/)
.cache
.parcel-cache
# next.js build output
.next
# nuxt.js build output
.nuxt
# vuepress build output
.vuepress/dist
# Serverless directories
.serverless
# FuseBox cache
.fusebox/
# DynamoDB Local files
.dynamodb/

View File

@ -0,0 +1,42 @@
cmake_minimum_required(VERSION 3.14)
project(elevenlabs_convai_cpp LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# ── Dependencies ─────────────────────────────────────────────────────────────
find_package(Boost REQUIRED COMPONENTS system thread)
find_package(OpenSSL REQUIRED)
# PortAudio via vcpkg/Homebrew CMake config
find_package(portaudio CONFIG REQUIRED)

# nlohmann_json: prefer an installed package; otherwise download the
# single-header release into the build tree.
find_package(nlohmann_json 3.11 QUIET)
if(NOT nlohmann_json_FOUND)
  # NOTE: FetchContent with a raw-file URL copies json.hpp to the source-dir
  # root, so an include path of ".../single_include" would never resolve
  # <nlohmann/json.hpp>. Download the header into the layout the sources
  # actually include instead.
  set(_json_include_dir ${CMAKE_BINARY_DIR}/_deps/nlohmann_json/include)
  set(_json_header ${_json_include_dir}/nlohmann/json.hpp)
  if(NOT EXISTS ${_json_header})
    file(DOWNLOAD
      https://raw.githubusercontent.com/nlohmann/json/v3.11.2/single_include/nlohmann/json.hpp
      ${_json_header}
      STATUS _json_dl_status)
    list(GET _json_dl_status 0 _json_dl_code)
    if(NOT _json_dl_code EQUAL 0)
      # Remove the partial file so a re-configure retries the download.
      file(REMOVE ${_json_header})
      message(FATAL_ERROR "Failed to download nlohmann/json.hpp: ${_json_dl_status}")
    endif()
  endif()
  add_library(nlohmann_json::nlohmann_json INTERFACE IMPORTED)
  target_include_directories(nlohmann_json::nlohmann_json INTERFACE ${_json_include_dir})
endif()

add_executable(convai_cpp
  src/main.cpp
  src/Conversation.cpp
  src/DefaultAudioInterface.cpp
)
# PRIVATE: nothing consumes the executable's headers.
target_include_directories(convai_cpp PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)

# MSVC: set Windows 10 target version and suppress getenv deprecation warning.
if(MSVC)
  target_compile_definitions(convai_cpp PRIVATE _WIN32_WINNT=0x0A00 _CRT_SECURE_NO_WARNINGS)
endif()

target_link_libraries(convai_cpp PRIVATE
  Boost::system
  Boost::thread
  OpenSSL::SSL
  OpenSSL::Crypto
  portaudio
  nlohmann_json::nlohmann_json
)

View File

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2024 Jitendra
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -0,0 +1,197 @@
# ElevenLabs Conversational AI - C++ Implementation
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
[![C++17](https://img.shields.io/badge/C%2B%2B-17-blue.svg)](https://en.wikipedia.org/wiki/C%2B%2B17)
[![CMake](https://img.shields.io/badge/CMake-3.14+-green.svg)](https://cmake.org/)
C++ implementation of ElevenLabs Conversational AI client
## Features
- **Real-time Audio Processing**: Full-duplex audio streaming with low-latency playback
- **WebSocket Integration**: Secure WSS connection to ElevenLabs Conversational AI platform
- **Cross-platform Audio**: PortAudio-based implementation supporting Windows, macOS, and Linux
- **Echo Suppression**: Built-in acoustic feedback prevention
- **Modern C++**: Clean, maintainable C++17 codebase with proper RAII and exception handling
- **Flexible Architecture**: Modular design allowing easy customization and extension
## Architecture
```mermaid
graph TB
subgraph "User Interface"
A[main.cpp] --> B[Conversation]
end
subgraph "Core Components"
B --> C[DefaultAudioInterface]
B --> D[WebSocket Client]
C --> E[PortAudio]
D --> F[Boost.Beast + OpenSSL]
end
subgraph "ElevenLabs Platform"
F --> G[WSS API Endpoint]
G --> H[Conversational AI Agent]
end
subgraph "Audio Flow"
I[Microphone] --> C
C --> J[Base64 Encoding]
J --> D
D --> K[Audio Events]
K --> L[Base64 Decoding]
L --> C
C --> M[Speakers]
end
subgraph "Message Types"
N[user_audio_chunk]
O[agent_response]
P[user_transcript]
Q[audio_event]
R[ping/pong]
end
style B fill:#e1f5fe
style C fill:#f3e5f5
style D fill:#e8f5e8
style H fill:#fff3e0
```
## Quick Start
### Prerequisites
- **C++17 compatible compiler**: GCC 11+, Clang 14+, or MSVC 2022+
- **CMake** 3.14 or higher
- **Dependencies** (install via package manager):
#### macOS (Homebrew)
```bash
brew install boost openssl portaudio nlohmann-json cmake pkg-config
```
#### Ubuntu/Debian
```bash
sudo apt update
sudo apt install build-essential cmake pkg-config
sudo apt install libboost-system-dev libboost-thread-dev
sudo apt install libssl-dev portaudio19-dev nlohmann-json3-dev
```
#### Windows (vcpkg)
```bash
vcpkg install boost-system boost-thread openssl portaudio nlohmann-json
```
### Building
```bash
# Clone the repository
git clone https://github.com/Jitendra2603/elevenlabs-convai-cpp.git
cd elevenlabs-convai-cpp
# Build the project
mkdir build && cd build
cmake ..
cmake --build . --config Release
```
### Running
```bash
# Set your agent ID (get this from ElevenLabs dashboard)
export AGENT_ID="your-agent-id-here"
# Run the demo
./convai_cpp
```
The application will:
1. Connect to your ElevenLabs Conversational AI agent
2. Start capturing audio from your default microphone
3. Stream audio to the agent and play responses through speakers
4. Display conversation transcripts in the terminal
5. Continue until you press Enter to quit
## 📋 Usage Examples
### Basic Conversation
```bash
export AGENT_ID="agent_xxxxxxxxxxxxxxxx"   # your agent ID from the ElevenLabs dashboard
./convai_cpp
# Speak into your microphone and hear the AI agent respond
```
## Configuration
### Audio Settings
The audio interface is configured for optimal real-time performance:
- **Sample Rate**: 16 kHz
- **Format**: 16-bit PCM mono
- **Input Buffer**: 250ms (4000 frames)
- **Output Buffer**: 62.5ms (1000 frames)
### WebSocket Connection
- **Endpoint**: `wss://api.elevenlabs.io/v1/convai/conversation`
- **Protocol**: WebSocket Secure (WSS) with TLS 1.2+
- **Authentication**: Optional (required for private agents)
## Project Structure
```
elevenlabs-convai-cpp/
├── CMakeLists.txt # Build configuration
├── README.md # This file
├── LICENSE # MIT license
├── CONTRIBUTING.md # Contribution guidelines
├── .gitignore # Git ignore rules
├── include/ # Header files
│ ├── AudioInterface.hpp # Abstract audio interface
│ ├── DefaultAudioInterface.hpp # PortAudio implementation
│ └── Conversation.hpp # Main conversation handler
└── src/ # Source files
├── main.cpp # Demo application
├── Conversation.cpp # WebSocket and message handling
└── DefaultAudioInterface.cpp # Audio I/O implementation
```
## Technical Details
### Audio Processing Pipeline
1. **Capture**: PortAudio captures 16-bit PCM audio at 16kHz
2. **Encoding**: Raw audio is base64-encoded for WebSocket transmission
3. **Streaming**: Audio chunks sent as `user_audio_chunk` messages
4. **Reception**: Server sends `audio_event` messages with agent responses
5. **Decoding**: Base64 audio data decoded back to PCM
6. **Playback**: Audio queued and played through PortAudio output stream
### Echo Suppression
The implementation includes a simple, effective echo suppression mechanism:
- Microphone input is suppressed during agent speech playback
- Prevents acoustic feedback loops that cause the agent to respond to itself
- Uses atomic flags for thread-safe coordination between input/output
### WebSocket Message Handling
Supported message types:
- `conversation_initiation_client_data` - Session initialization
- `user_audio_chunk` - Microphone audio data
- `audio_event` - Agent speech audio
- `agent_response` - Agent text responses
- `user_transcript` - Speech-to-text results
- `ping`/`pong` - Connection keepalive
## 📝 License
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.

View File

@ -0,0 +1,23 @@
#pragma once
#include <functional>
#include <vector>
// Abstract audio I/O interface consumed by Conversation.
// All audio buffers in this codebase are raw 16-bit PCM, mono, 16 kHz,
// carried as byte vectors (2 bytes per sample).
class AudioInterface {
public:
// Callback invoked with captured microphone audio (raw bytes of int16 samples).
using AudioCallback = std::function<void(const std::vector<char>&)>;
virtual ~AudioInterface() = default;
// Starts the audio interface. The callback will be invoked with raw 16-bit PCM mono samples at 16kHz.
// NOTE(review): implementations may invoke the callback on an audio/driver
// thread (DefaultAudioInterface does) — callers must be thread-aware.
virtual void start(AudioCallback inputCallback) = 0;
// Stops audio I/O and releases underlying resources.
virtual void stop() = 0;
// Play audio to the user; audio is 16-bit PCM mono 16kHz.
virtual void output(const std::vector<char>& audio) = 0;
// Immediately stop any buffered / ongoing output.
virtual void interrupt() = 0;
};

View File

@ -0,0 +1,72 @@
#pragma once
#include "AudioInterface.hpp"
#include <boost/beast/core.hpp>
#include <boost/beast/websocket.hpp>
#include <boost/beast/ssl.hpp>
#include <boost/asio.hpp>
#include <boost/asio/ssl/stream.hpp>
#include <boost/asio/ip/tcp.hpp>
#include <nlohmann/json.hpp>
#include <thread>
#include <atomic>
#include <functional>
// Manages one session with an ElevenLabs Conversational AI agent over a
// secure WebSocket. Owns a worker thread that connects, streams microphone
// audio upstream, and dispatches server events (audio, transcripts, agent
// text, pings) to the audio interface and the optional callbacks below.
class Conversation {
public:
// Invoked with the agent's full text response.
using CallbackAgentResponse = std::function<void(const std::string&)>;
// Invoked with (original, corrected) agent responses after a correction event.
using CallbackAgentResponseCorrection = std::function<void(const std::string&, const std::string&)>;
// Invoked with the user's speech-to-text transcript.
using CallbackUserTranscript = std::function<void(const std::string&)>;
// Invoked with the latency in milliseconds reported by server ping events.
using CallbackLatencyMeasurement = std::function<void(int)>;
// All callbacks are optional; nullptr disables the corresponding event.
Conversation(
const std::string& agentId,
bool requiresAuth,
std::shared_ptr<AudioInterface> audioInterface,
CallbackAgentResponse callbackAgentResponse = nullptr,
CallbackAgentResponseCorrection callbackAgentResponseCorrection = nullptr,
CallbackUserTranscript callbackUserTranscript = nullptr,
CallbackLatencyMeasurement callbackLatencyMeasurement = nullptr
);
~Conversation();
// Starts the worker thread that connects and services the session.
void startSession();
// Requests shutdown: closes the socket, stops audio, joins the worker.
void endSession();
// Blocks until the worker finishes; returns the server-assigned conversation id.
std::string waitForSessionEnd();
// Sends a typed text message as the user (throws if the session is not started).
void sendUserMessage(const std::string& text);
// Signals user activity to the server.
void registerUserActivity();
// Sends non-conversational context to the agent.
void sendContextualUpdate(const std::string& content);
private:
// Worker-thread body: connect, handshake, then read/dispatch loop.
void run();
// Dispatches one parsed server message by its "type" field.
void handleMessage(const nlohmann::json& message);
// Builds the wss:// endpoint URL for the configured agent.
std::string getWssUrl() const;
// networking members
boost::asio::io_context ioc_;
boost::asio::ssl::context sslCtx_{boost::asio::ssl::context::tlsv12_client};
using tcp = boost::asio::ip::tcp;
using websocket_t = boost::beast::websocket::stream<
boost::beast::ssl_stream<tcp::socket>>;
std::unique_ptr<websocket_t> ws_;
// general state
std::string agentId_;
bool requiresAuth_; // NOTE(review): stored but unused by getWssUrl() — confirm intended use
std::shared_ptr<AudioInterface> audioInterface_;
CallbackAgentResponse callbackAgentResponse_;
CallbackAgentResponseCorrection callbackAgentResponseCorrection_;
CallbackUserTranscript callbackUserTranscript_;
CallbackLatencyMeasurement callbackLatencyMeasurement_;
std::thread workerThread_;
std::atomic<bool> shouldStop_{false}; // set by endSession(); polled by the read loop
std::string conversationId_; // set from conversation_initiation_metadata
std::atomic<int> lastInterruptId_{0}; // audio events at or below this id are discarded
};

View File

@ -0,0 +1,45 @@
#pragma once
#include "AudioInterface.hpp"
#include <portaudio.h>
#include <mutex>
#include <condition_variable>
#include <queue>
#include <thread>
#include <atomic>
// PortAudio-backed AudioInterface: 16 kHz, 16-bit PCM, mono.
// Capture runs on PortAudio's callback thread; playback runs on a dedicated
// consumer thread that drains a mutex-protected queue.
class DefaultAudioInterface : public AudioInterface {
public:
static constexpr int INPUT_FRAMES_PER_BUFFER = 4000; // 250ms @ 16kHz
static constexpr int OUTPUT_FRAMES_PER_BUFFER = 1000; // 62.5ms @ 16kHz
DefaultAudioInterface();
~DefaultAudioInterface() override;
void start(AudioCallback inputCallback) override;
void stop() override;
void output(const std::vector<char>& audio) override;
void interrupt() override;
private:
// C-style trampoline PortAudio invokes; forwards to inputCallbackInternal.
static int inputCallbackStatic(const void* input, void* output, unsigned long frameCount,
const PaStreamCallbackTimeInfo* timeInfo, PaStreamCallbackFlags statusFlags,
void* userData);
// Runs on PortAudio's audio thread; copies samples and calls inputCallback_.
int inputCallbackInternal(const void* input, unsigned long frameCount);
// Playback thread body: pops queued chunks and writes them to outputStream_.
void outputThreadFunc();
PaStream* inputStream_{};
PaStream* outputStream_{};
AudioCallback inputCallback_;
std::queue<std::vector<char>> outputQueue_; // pending playback chunks (guarded by queueMutex_)
std::mutex queueMutex_;
std::condition_variable queueCv_;
std::thread outputThread_;
std::atomic<bool> shouldStop_{false}; // set by stop(); ends outputThreadFunc
std::atomic<bool> outputPlaying_{false}; // true while Pa_WriteStream is active (echo suppression)
};

View File

@ -0,0 +1,230 @@
#include "Conversation.hpp"
#include <boost/beast/websocket/ssl.hpp>
#include <boost/beast/websocket.hpp>
#include <boost/beast/ssl.hpp>
#include <boost/beast/core/detail/base64.hpp>
#include <boost/asio/connect.hpp>
#include <boost/algorithm/string.hpp>
#include <iostream>
#include <sstream>
#include <openssl/ssl.h>
using tcp = boost::asio::ip::tcp;
namespace ssl = boost::asio::ssl;
namespace websocket = boost::beast::websocket;
namespace beast = boost::beast;
// Encode raw bytes as a base64 string using Beast's (detail) codec.
static std::string base64Encode(const std::vector<char>& data) {
    std::string encoded;
    encoded.resize(beast::detail::base64::encoded_size(data.size()));
    beast::detail::base64::encode(&encoded[0], data.data(), data.size());
    return encoded;
}
// Decode a base64 string to raw bytes. decoded_size() is an upper bound, so
// the buffer is trimmed to the byte count the codec actually wrote.
static std::vector<char> base64Decode(const std::string& str) {
    std::vector<char> decoded(beast::detail::base64::decoded_size(str.size()));
    const auto written = beast::detail::base64::decode(decoded.data(), str.data(), str.size());
    decoded.resize(written.first);
    return decoded;
}
// Render a JSON value as text: strings pass through unchanged, integers are
// formatted decimally, and anything else falls back to its JSON dump.
static std::string toString(const nlohmann::json& j) {
    if (j.is_string()) {
        return j.get<std::string>();
    }
    if (j.is_number_integer()) {
        return std::to_string(j.get<int64_t>());
    }
    return j.dump();
}
// Constructs a conversation bound to `agentId`. No network I/O happens here;
// the WebSocket connection is established by startSession() on a worker thread.
// Callbacks may be nullptr, in which case the corresponding event is ignored.
// NOTE(review): requiresAuth is stored but not consulted by getWssUrl() in
// this file — confirm whether auth-token handling lives elsewhere.
Conversation::Conversation(const std::string& agentId, bool requiresAuth,
std::shared_ptr<AudioInterface> audioInterface,
CallbackAgentResponse callbackAgentResponse,
CallbackAgentResponseCorrection callbackAgentResponseCorrection,
CallbackUserTranscript callbackUserTranscript,
CallbackLatencyMeasurement callbackLatencyMeasurement)
: agentId_(agentId),
requiresAuth_(requiresAuth),
audioInterface_(std::move(audioInterface)),
callbackAgentResponse_(std::move(callbackAgentResponse)),
callbackAgentResponseCorrection_(std::move(callbackAgentResponseCorrection)),
callbackUserTranscript_(std::move(callbackUserTranscript)),
callbackLatencyMeasurement_(std::move(callbackLatencyMeasurement)) {
// Load the system CA bundle so the TLS handshake can verify the server.
sslCtx_.set_default_verify_paths();
}
// Tears the session down (socket closed, audio stopped, worker joined) if the
// caller did not call endSession() explicitly.
Conversation::~Conversation() {
endSession();
}
void Conversation::startSession() {
shouldStop_.store(false);
workerThread_ = std::thread(&Conversation::run, this);
}
// Signals the worker to stop, closes the socket, stops audio, and joins.
// NOTE(review): ws_->close() runs on the caller's thread while run() may be
// blocked in ws_->read() on the worker thread; Beast streams are not
// thread-safe, so this relies on the close unblocking the read — confirm.
void Conversation::endSession() {
shouldStop_.store(true);
if (ws_) {
beast::error_code ec;
// Best-effort close; errors are deliberately ignored (peer may be gone).
ws_->close(websocket::close_code::normal, ec);
}
if (audioInterface_) {
audioInterface_->stop();
}
if (workerThread_.joinable()) {
workerThread_.join();
}
}
// Blocks until the worker thread finishes, then returns the conversation id
// (empty if initiation metadata was never received).
std::string Conversation::waitForSessionEnd() {
    if (workerThread_.joinable()) {
        workerThread_.join();
    }
    return conversationId_;
}
void Conversation::sendUserMessage(const std::string& text) {
if (!ws_) {
throw std::runtime_error("Session not started");
}
nlohmann::json j = {
{"type", "user_message"},
{"text", text}
};
ws_->write(boost::asio::buffer(j.dump()));
}
void Conversation::registerUserActivity() {
if (!ws_) throw std::runtime_error("Session not started");
nlohmann::json j = {{"type", "user_activity"}};
ws_->write(boost::asio::buffer(j.dump()));
}
void Conversation::sendContextualUpdate(const std::string& content) {
if (!ws_) throw std::runtime_error("Session not started");
nlohmann::json j = {{"type", "contextual_update"}, {"content", content}};
ws_->write(boost::asio::buffer(j.dump()));
}
// Builds the WebSocket endpoint URL for the configured agent.
// Hard-coded base env for demo; production code would query the ElevenLabs
// environment endpoint instead.
std::string Conversation::getWssUrl() const {
    return "wss://api.elevenlabs.io/v1/convai/conversation?agent_id=" + agentId_;
}
// Worker-thread entry point: resolves, connects (TCP → TLS → WebSocket),
// sends the session-initiation message, starts audio capture, then loops
// reading and dispatching server messages until shouldStop_ or a read error.
// All failures are logged to stderr; the thread never lets an exception escape.
void Conversation::run() {
    try {
        auto url = getWssUrl();
        std::string protocol, host, target;
        unsigned short port = 443;
        // Very naive parse: wss://host[:port]/path?query
        if (boost::starts_with(url, "wss://")) {
            protocol = "wss";
            host = url.substr(6);
        } else {
            throw std::runtime_error("Only wss:// URLs supported in this demo");
        }
        auto slashPos = host.find('/');
        if (slashPos == std::string::npos) {
            target = "/";
        } else {
            target = host.substr(slashPos);
            host = host.substr(0, slashPos);
        }
        auto colonPos = host.find(':');
        if (colonPos != std::string::npos) {
            port = static_cast<unsigned short>(std::stoi(host.substr(colonPos + 1)));
            host = host.substr(0, colonPos);
        }
        tcp::resolver resolver(ioc_);
        auto const results = resolver.resolve(host, std::to_string(port));
        beast::ssl_stream<tcp::socket> stream(ioc_, sslCtx_);
        boost::asio::connect(beast::get_lowest_layer(stream), results);
        // SNI is required by the endpoint's TLS front-end.
        if (!SSL_set_tlsext_host_name(stream.native_handle(), host.c_str())) {
            throw std::runtime_error("Failed to set SNI hostname on SSL stream");
        }
        stream.handshake(ssl::stream_base::client);
        ws_ = std::make_unique<websocket_t>(std::move(stream));
        ws_->set_option(websocket::stream_base::timeout::suggested(beast::role_type::client));
        ws_->handshake(host, target);
        // Send initiation data (empty overrides — server defaults apply).
        nlohmann::json init = {
            {"type", "conversation_initiation_client_data"},
            {"custom_llm_extra_body", nlohmann::json::object()},
            {"conversation_config_override", nlohmann::json::object()},
            {"dynamic_variables", nlohmann::json::object()}
        };
        ws_->write(boost::asio::buffer(init.dump()));
        // Microphone callback: invoked from the audio interface's thread.
        // FIX: an exception escaping a foreign (PortAudio) callback thread
        // would call std::terminate, so send failures are logged and swallowed
        // here instead of propagating.
        // NOTE(review): this write can race the read loop below (Beast
        // websocket streams are not thread-safe); a write queue/strand would
        // be safer — confirm against observed behavior.
        auto inputCb = [this](const std::vector<char>& audio) {
            if (shouldStop_.load()) return;  // session is tearing down
            try {
                nlohmann::json msg = {
                    {"user_audio_chunk", base64Encode(audio)}
                };
                ws_->write(boost::asio::buffer(msg.dump()));
            } catch (const std::exception& ex) {
                std::cerr << "Audio send error: " << ex.what() << std::endl;
            }
        };
        audioInterface_->start(inputCb);
        beast::flat_buffer buffer;
        while (!shouldStop_.load()) {
            beast::error_code ec;
            ws_->read(buffer, ec);
            if (ec) {
                std::cerr << "Websocket read error: " << ec.message() << std::endl;
                break;
            }
            auto text = beast::buffers_to_string(buffer.data());
            buffer.consume(buffer.size());
            // Malformed frames are logged and skipped; the session continues.
            try {
                auto message = nlohmann::json::parse(text);
                handleMessage(message);
            } catch (const std::exception& ex) {
                std::cerr << "JSON parse error: " << ex.what() << std::endl;
            }
        }
    } catch (const std::exception& ex) {
        std::cerr << "Conversation error: " << ex.what() << std::endl;
    }
}
// Dispatches one parsed server message by its "type" field. Unknown types are
// ignored. Exceptions thrown here (missing keys, non-numeric ids) are caught
// and logged by the read loop in run(), so a bad frame does not end the session.
void Conversation::handleMessage(const nlohmann::json& message) {
std::string type = message.value("type", "");
if (type == "conversation_initiation_metadata") {
// First message of a session: remember the server-assigned conversation id.
conversationId_ = message["conversation_initiation_metadata_event"]["conversation_id"].get<std::string>();
} else if (type == "audio") {
auto event = message["audio_event"];
// Drop audio that belongs to a generation the user already interrupted.
int eventId = std::stoi(toString(event["event_id"]));
if (eventId <= lastInterruptId_.load()) return;
auto audioBytes = base64Decode(event["audio_base_64"].get<std::string>());
audioInterface_->output(audioBytes);
} else if (type == "agent_response" && callbackAgentResponse_) {
auto event = message["agent_response_event"];
callbackAgentResponse_(event["agent_response"].get<std::string>());
} else if (type == "agent_response_correction" && callbackAgentResponseCorrection_) {
auto event = message["agent_response_correction_event"];
callbackAgentResponseCorrection_(event["original_agent_response"].get<std::string>(),
event["corrected_agent_response"].get<std::string>());
} else if (type == "user_transcript" && callbackUserTranscript_) {
auto event = message["user_transcription_event"];
callbackUserTranscript_(event["user_transcript"].get<std::string>());
} else if (type == "interruption") {
// User barge-in: remember the cutoff event id and flush queued playback.
auto event = message["interruption_event"];
lastInterruptId_.store(std::stoi(toString(event["event_id"])));
audioInterface_->interrupt();
} else if (type == "ping") {
// Keepalive: echo the event id back as a pong.
auto event = message["ping_event"];
nlohmann::json pong = {{"type", "pong"}, {"event_id", event["event_id"]}};
ws_->write(boost::asio::buffer(pong.dump()));
if (callbackLatencyMeasurement_ && event.contains("ping_ms")) {
// ping_ms may arrive as a number or as a numeric string.
int latency = event["ping_ms"].is_number() ? event["ping_ms"].get<int>() : std::stoi(event["ping_ms"].get<std::string>());
callbackLatencyMeasurement_(latency);
}
}
// Note: client tool call handling omitted for brevity.
}

View File

@ -0,0 +1,131 @@
#include "DefaultAudioInterface.hpp"

#include <cstring>
#include <iostream>
#include <stdexcept>
#include <string>
// Initializes the PortAudio library. Every successful Pa_Initialize() must be
// balanced by the Pa_Terminate() in the destructor.
DefaultAudioInterface::DefaultAudioInterface() {
    PaError err = Pa_Initialize();
    if (err != paNoError) {
        // FIX: include PortAudio's own diagnostic instead of discarding it,
        // so init failures (missing backend, busy device) are actionable.
        throw std::runtime_error(std::string("PortAudio initialization failed: ") + Pa_GetErrorText(err));
    }
}
// Stops audio if still running, then shuts PortAudio down. shouldStop_ is
// true only after stop() has run, so this avoids a redundant stop() on an
// already-stopped interface.
DefaultAudioInterface::~DefaultAudioInterface() {
if (!shouldStop_.load()) {
stop();
}
// Balances the Pa_Initialize() performed in the constructor.
Pa_Terminate();
}
// Opens and starts the input (mic) and output (speaker) streams at
// 16 kHz / 16-bit / mono, then launches the playback thread.
// FIXES over the previous version:
//  - Pa_GetDefaultInputDevice/OutputDevice can return paNoDevice; calling
//    Pa_GetDeviceInfo on it returns null and the latency lookup crashed.
//  - On a failed open/start, streams opened earlier were leaked; they are
//    now closed before throwing, so a retry does not leak PortAudio handles.
//  - Error messages now carry Pa_GetErrorText detail.
void DefaultAudioInterface::start(AudioCallback inputCallback) {
    inputCallback_ = std::move(inputCallback);

    PaStreamParameters inputParams;
    std::memset(&inputParams, 0, sizeof(inputParams));
    inputParams.channelCount = 1;
    inputParams.device = Pa_GetDefaultInputDevice();
    inputParams.sampleFormat = paInt16;
    inputParams.hostApiSpecificStreamInfo = nullptr;

    PaStreamParameters outputParams;
    std::memset(&outputParams, 0, sizeof(outputParams));
    outputParams.channelCount = 1;
    outputParams.device = Pa_GetDefaultOutputDevice();
    outputParams.sampleFormat = paInt16;
    outputParams.hostApiSpecificStreamInfo = nullptr;

    // Guard before Pa_GetDeviceInfo: paNoDevice would yield a null info pointer.
    if (inputParams.device == paNoDevice || outputParams.device == paNoDevice) {
        throw std::runtime_error("No default audio input/output device available");
    }
    inputParams.suggestedLatency = Pa_GetDeviceInfo(inputParams.device)->defaultLowInputLatency;
    outputParams.suggestedLatency = Pa_GetDeviceInfo(outputParams.device)->defaultLowOutputLatency;

    // Close anything opened so far, then report the failure with PA's detail.
    auto fail = [this](const char* what, PaError code) {
        if (inputStream_) { Pa_CloseStream(inputStream_); inputStream_ = nullptr; }
        if (outputStream_) { Pa_CloseStream(outputStream_); outputStream_ = nullptr; }
        throw std::runtime_error(std::string(what) + ": " + Pa_GetErrorText(code));
    };

    PaError err = Pa_OpenStream(&inputStream_, &inputParams, nullptr, 16000, INPUT_FRAMES_PER_BUFFER, paClipOff,
                                &DefaultAudioInterface::inputCallbackStatic, this);
    if (err != paNoError) fail("Failed to open input stream", err);
    err = Pa_OpenStream(&outputStream_, nullptr, &outputParams, 16000, OUTPUT_FRAMES_PER_BUFFER, paClipOff, nullptr, nullptr);
    if (err != paNoError) fail("Failed to open output stream", err);
    if ((err = Pa_StartStream(inputStream_)) != paNoError) fail("Failed to start input stream", err);
    if ((err = Pa_StartStream(outputStream_)) != paNoError) fail("Failed to start output stream", err);

    shouldStop_.store(false);
    outputThread_ = std::thread(&DefaultAudioInterface::outputThreadFunc, this);
}
// Stops the playback thread and both streams. Safe to call more than once:
// the stream handles are nulled so a second call is a no-op on them.
void DefaultAudioInterface::stop() {
shouldStop_.store(true);
// Wake the playback thread so it can observe shouldStop_ and exit.
queueCv_.notify_all();
if (outputThread_.joinable()) {
outputThread_.join();
}
if (inputStream_) {
Pa_StopStream(inputStream_);
Pa_CloseStream(inputStream_);
inputStream_ = nullptr;
}
if (outputStream_) {
Pa_StopStream(outputStream_);
Pa_CloseStream(outputStream_);
outputStream_ = nullptr;
}
}
// Appends a chunk of 16-bit PCM audio to the playback queue and wakes the
// playback thread. The lock is released before notifying.
void DefaultAudioInterface::output(const std::vector<char>& audio) {
    {
        std::lock_guard<std::mutex> lock(queueMutex_);
        outputQueue_.push(audio);
    }
    queueCv_.notify_one();
}
// Discards all queued (not-yet-played) audio. A chunk already handed to
// Pa_WriteStream finishes on its own.
void DefaultAudioInterface::interrupt() {
    std::lock_guard<std::mutex> lock(queueMutex_);
    outputQueue_ = std::queue<std::vector<char>>();
}
int DefaultAudioInterface::inputCallbackStatic(const void* input, void* /*output*/, unsigned long frameCount,
const PaStreamCallbackTimeInfo* /*timeInfo*/, PaStreamCallbackFlags /*statusFlags*/,
void* userData) {
auto* self = static_cast<DefaultAudioInterface*>(userData);
return self->inputCallbackInternal(input, frameCount);
}
// Runs on PortAudio's audio thread. Copies the captured int16 samples into a
// byte vector and hands them to the registered callback. Input is dropped
// while output is playing to avoid echo feedback.
int DefaultAudioInterface::inputCallbackInternal(const void* input, unsigned long frameCount) {
    if (input == nullptr || !inputCallback_) {
        return paContinue;
    }
    // Suppress microphone input while playing output to avoid echo feedback.
    if (outputPlaying_.load()) {
        return paContinue;
    }
    const auto* samples = static_cast<const char*>(input);
    std::vector<char> chunk(samples, samples + frameCount * sizeof(int16_t));
    inputCallback_(chunk);
    return paContinue;
}
// Playback loop: waits for queued chunks and writes them to the output
// stream. Pa_WriteStream blocks until the device consumes the buffer, so
// outputPlaying_ brackets actual speaker activity — inputCallbackInternal
// reads it to mute the microphone (echo suppression).
// NOTE(review): outputPlaying_ briefly drops to false between queued chunks;
// a mic callback landing in that gap could slip audio through — confirm
// whether that window matters in practice.
void DefaultAudioInterface::outputThreadFunc() {
while (!shouldStop_.load()) {
std::vector<char> audio;
{
std::unique_lock<std::mutex> lk(queueMutex_);
// Sleep until there is work or stop() asks us to exit.
queueCv_.wait(lk, [this] { return shouldStop_.load() || !outputQueue_.empty(); });
if (shouldStop_.load()) break;
audio = std::move(outputQueue_.front());
outputQueue_.pop();
}
if (!audio.empty() && outputStream_) {
outputPlaying_.store(true);
// Frame count = bytes / 2 (16-bit mono samples).
Pa_WriteStream(outputStream_, audio.data(), audio.size() / sizeof(int16_t));
outputPlaying_.store(false);
}
}
}

View File

@ -0,0 +1,31 @@
#include "Conversation.hpp"
#include "DefaultAudioInterface.hpp"

#include <cstdlib>
#include <exception>
#include <iostream>
#include <memory>
// Demo entry point: connects the default audio devices to the ElevenLabs
// conversational agent named by the AGENT_ID environment variable.
// Returns 0 on success, 1 on configuration or runtime failure.
int main() {
    const char* agentIdEnv = std::getenv("AGENT_ID");
    // Treat an empty value the same as an unset one.
    if (!agentIdEnv || *agentIdEnv == '\0') {
        std::cerr << "AGENT_ID environment variable must be set" << std::endl;
        return 1;
    }
    try {
        std::string agentId(agentIdEnv);
        // DefaultAudioInterface's constructor throws if PortAudio cannot
        // initialize; without the surrounding try/catch that exception would
        // reach std::terminate with no user-readable diagnostic.
        auto audioInterface = std::make_shared<DefaultAudioInterface>();
        Conversation conv(agentId, /*requiresAuth*/ false, audioInterface,
            [](const std::string& resp) { std::cout << "Agent: " << resp << std::endl; },
            [](const std::string& orig, const std::string& corrected) {
                std::cout << "Agent correction: " << orig << " -> " << corrected << std::endl; },
            [](const std::string& transcript) { std::cout << "User: " << transcript << std::endl; });
        conv.startSession();
        std::cout << "Press Enter to quit..." << std::endl;
        std::cin.get();
        conv.endSession();
        auto convId = conv.waitForSessionEnd();
        std::cout << "Conversation ID: " << convId << std::endl;
    } catch (const std::exception& ex) {
        std::cerr << "Fatal error: " << ex.what() << std::endl;
        return 1;
    }
    return 0;
}

View File

@ -158,6 +158,8 @@ void UElevenLabsConversationalAgentComponent::StartConversation()
&UElevenLabsConversationalAgentComponent::HandleInterrupted);
WebSocketProxy->OnAgentResponseStarted.AddDynamic(this,
&UElevenLabsConversationalAgentComponent::HandleAgentResponseStarted);
WebSocketProxy->OnAgentResponsePart.AddDynamic(this,
&UElevenLabsConversationalAgentComponent::HandleAgentResponsePart);
}
// Pass configuration to the proxy before connecting.
@ -266,7 +268,23 @@ void UElevenLabsConversationalAgentComponent::StopListening()
// Flush any partially-accumulated mic audio before signalling end-of-turn.
// This ensures the final words aren't discarded just because the last callback
// didn't push the buffer over the MicChunkMinBytes threshold.
if (MicAccumulationBuffer.Num() > 0 && WebSocketProxy && IsConnected())
//
// EXCEPT during collision avoidance: bAgentGenerating is already true when
// HandleAgentResponseStarted calls StopListening (it sets the flag before calling us).
// Flushing audio to a server that is mid-generation can cause it to re-enter
// "user speaking" state and stall waiting for more audio that never arrives,
// leaving both sides stuck — no audio for the collision response and no response
// for subsequent turns.
if (bAgentGenerating)
{
if (MicAccumulationBuffer.Num() > 0)
{
UE_LOG(LogElevenLabsAgent, Log,
TEXT("StopListening: discarding %d bytes of accumulated mic audio (collision — server is mid-generation)."),
MicAccumulationBuffer.Num());
}
}
else if (MicAccumulationBuffer.Num() > 0 && WebSocketProxy && IsConnected())
{
WebSocketProxy->SendAudioChunk(MicAccumulationBuffer);
}
@ -423,7 +441,8 @@ void UElevenLabsConversationalAgentComponent::HandleAgentResponseStarted()
{
// The server has started generating a response (first agent_chat_response_part).
// Set bAgentGenerating BEFORE StopListening so that any StartListening call
// triggered by the Blueprint's OnAgentStartedGenerating handler is blocked.
// triggered by the Blueprint's OnAgentStartedGenerating handler is blocked,
// and so that StopListening knows to skip the mic buffer flush (collision path).
bAgentGenerating = true;
bWaitingForAgentResponse = false; // Server is generating — response timeout cancelled.
@ -433,21 +452,37 @@ void UElevenLabsConversationalAgentComponent::HandleAgentResponseStarted()
if (bIsListening)
{
// Collision: server started generating Turn N's response while Turn M (M>N) mic was open.
// Log both turn indices so the timeline is unambiguous.
// The server's VAD detected a pause in the user's speech and started generating
// prematurely — the user hasn't finished speaking yet.
//
// Stop the mic WITHOUT flushing the accumulated audio buffer (see StopListening's
// bAgentGenerating guard). Flushing would send audio to a server that is mid-generation,
// causing it to re-enter "user speaking" state and stall — both sides stuck.
//
// Do NOT send an interrupt here — just let the server's response play out:
// - If audio arrives → EnqueueAgentAudio sets bAgentSpeaking, response plays normally.
// - If audio never arrives → generating timeout (10s) clears bAgentGenerating.
// Either way the state machine recovers and Blueprint can reopen the mic.
UE_LOG(LogElevenLabsAgent, Log,
TEXT("[T+%.2fs] [Turn %d → Turn %d collision] Agent generating Turn %d response — mic (Turn %d) was open, stopping. (%.2fs after turn end)"),
T, LastClosedTurnIndex, TurnIndex, LastClosedTurnIndex, TurnIndex, LatencyFromTurnEnd);
StopListening();
}
else
{
UE_LOG(LogElevenLabsAgent, Log,
TEXT("[T+%.2fs] [Turn %d] Agent generating. (%.2fs after turn end)"),
T, LastClosedTurnIndex, LatencyFromTurnEnd);
}
UE_LOG(LogElevenLabsAgent, Log,
TEXT("[T+%.2fs] [Turn %d] Agent generating. (%.2fs after turn end)"),
T, LastClosedTurnIndex, LatencyFromTurnEnd);
OnAgentStartedGenerating.Broadcast();
}
void UElevenLabsConversationalAgentComponent::HandleAgentResponsePart(const FString& PartialText)
{
if (bEnableAgentPartialResponse)
{
OnAgentPartialResponse.Broadcast(PartialText);
}
}
// ─────────────────────────────────────────────────────────────────────────────
// Audio playback
// ─────────────────────────────────────────────────────────────────────────────
@ -569,9 +604,15 @@ void UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured(const TAr
{
if (!IsConnected() || !bIsListening) return;
// Echo suppression: skip sending mic audio while the agent is speaking.
// This prevents the agent from hearing its own voice through the speakers,
// which would confuse the server's VAD and STT. Matches the approach used
// in the official ElevenLabs C++ SDK (outputPlaying_ flag).
if (bAgentSpeaking) return;
// Convert this callback's samples to int16 bytes and accumulate.
// WASAPI fires every ~5ms (158 bytes at 16kHz). ElevenLabs needs ≥100ms
// (3200 bytes) per chunk for reliable VAD and STT. We hold bytes here
// WASAPI fires every ~5ms (158 bytes at 16kHz). ElevenLabs needs ≥250ms
// (8000 bytes) per chunk for reliable VAD and STT. We hold bytes here
// until we have enough, then send the whole batch in one WebSocket frame.
TArray<uint8> PCMBytes = FloatPCMToInt16Bytes(FloatPCM);
MicAccumulationBuffer.Append(PCMBytes);

View File

@ -158,20 +158,6 @@ void UElevenLabsWebSocketProxy::SendUserTurnEnd()
// in a loop: part arrives → event → StopListening → SendUserTurnEnd → flag reset → part arrives → loop.
// The flag is only reset in SendUserTurnStart() at the beginning of a new user turn.
// Clear the interrupt-ignore flag if it was never cleared by an "interruption" server ack.
// The ElevenLabs server does not always send the "interruption" acknowledgement reliably.
// By the time the user has spoken a full new turn (seconds of audio), any in-flight content
// from the previously interrupted generation has long since arrived — it is safe to resume
// normal content processing so the server's response to this new turn is not silently discarded.
if (bIgnoreIncomingContent)
{
bIgnoreIncomingContent = false;
const double T = UserTurnEndTime - SessionStartTime;
UE_LOG(LogElevenLabsWS, Log,
TEXT("[T+%.2fs] Cleared interrupt-ignore flag at turn end (server 'interruption' ack was not received — resuming content processing)."),
T);
}
const double T = UserTurnEndTime - SessionStartTime;
UE_LOG(LogElevenLabsWS, Log, TEXT("[T+%.2fs] User turn ended — server VAD silence detection started (turn_timeout=1s)."), T);
}
@ -196,12 +182,7 @@ void UElevenLabsWebSocketProxy::SendInterrupt()
{
if (!IsConnected()) return;
// Immediately start discarding in-flight audio and chat response parts from
// the generation we are about to interrupt. The server may still send several
// frames before it processes our interrupt. We stop ignoring once the server
// sends its "interruption" acknowledgement (HandleInterruption).
bIgnoreIncomingContent = true;
UE_LOG(LogElevenLabsWS, Log, TEXT("Sending interrupt — ignoring incoming content until server acks."));
UE_LOG(LogElevenLabsWS, Log, TEXT("Sending interrupt."));
TSharedPtr<FJsonObject> Msg = MakeShareable(new FJsonObject());
Msg->SetStringField(TEXT("type"), ElevenLabsMessageType::Interrupt);
@ -400,7 +381,7 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message)
}
else if (MsgType == ElevenLabsMessageType::AgentChatResponsePart)
{
HandleAgentChatResponsePart();
HandleAgentChatResponsePart(Root);
}
else if (MsgType == ElevenLabsMessageType::AgentResponseCorrection)
{
@ -467,17 +448,9 @@ void UElevenLabsWebSocketProxy::OnWsBinaryMessage(const void* Data, SIZE_T Size,
}
// Broadcast raw PCM bytes directly to the audio queue.
// Discard if we are waiting for an interruption ack (same logic as HandleAudioResponse).
TArray<uint8> PCMData = MoveTemp(BinaryFrameBuffer);
BinaryFrameBuffer.Reset();
if (!bIgnoreIncomingContent)
{
OnAudioReceived.Broadcast(PCMData);
}
else
{
UE_LOG(LogElevenLabsWS, Verbose, TEXT("Discarding binary audio frame (interrupt pending server ack)."));
}
OnAudioReceived.Broadcast(PCMData);
}
}
@ -507,15 +480,6 @@ void UElevenLabsWebSocketProxy::HandleConversationInitiation(const TSharedPtr<FJ
void UElevenLabsWebSocketProxy::HandleAudioResponse(const TSharedPtr<FJsonObject>& Root)
{
// Discard audio that belongs to an interrupted generation.
// The server may send several more audio frames after we sent "interrupt" —
// they must not restart the speaking state on the client side.
if (bIgnoreIncomingContent)
{
UE_LOG(LogElevenLabsWS, Verbose, TEXT("Discarding audio frame (interrupt pending server ack)."));
return;
}
// Expected structure:
// { "type": "audio",
// "audio_event": { "audio_base_64": "<base64 PCM>", "event_id": 1 }
@ -527,6 +491,17 @@ void UElevenLabsWebSocketProxy::HandleAudioResponse(const TSharedPtr<FJsonObject
return;
}
// Discard audio belonging to an interrupted generation (event_id approach).
// Matches the official ElevenLabs C++ and Python SDKs: only AUDIO is filtered
// by event_id — transcripts, agent_response, etc. are always processed.
int32 EventId = 0;
(*AudioEvent)->TryGetNumberField(TEXT("event_id"), EventId);
if (EventId > 0 && EventId <= LastInterruptEventId)
{
UE_LOG(LogElevenLabsWS, Verbose, TEXT("Discarding audio event_id=%d (interrupted at %d)."), EventId, LastInterruptEventId);
return;
}
FString Base64Audio;
if (!(*AudioEvent)->TryGetStringField(TEXT("audio_base_64"), Base64Audio))
{
@ -569,16 +544,6 @@ void UElevenLabsWebSocketProxy::HandleTranscript(const TSharedPtr<FJsonObject>&
void UElevenLabsWebSocketProxy::HandleAgentResponse(const TSharedPtr<FJsonObject>& Root)
{
// ISSUE-19: discard agent_response that belongs to an interrupted generation.
// A stale agent_response from the cancelled turn would set bAgentResponseReceived=true
// on the component, allowing the silence-detection Tick to fire OnAgentStoppedSpeaking
// at the wrong time (no audio is currently playing for the new turn yet).
if (bIgnoreIncomingContent)
{
UE_LOG(LogElevenLabsWS, Verbose, TEXT("Discarding agent_response (interrupt pending server ack)."));
return;
}
// ISSUE-22: reset bAgentResponseStartedFired so OnAgentResponseStarted fires again on
// the next turn. In Server VAD mode SendUserTurnStart() is never called — it is the only
// other place that resets this flag — so without this reset, OnAgentResponseStarted fires
@ -602,18 +567,8 @@ void UElevenLabsWebSocketProxy::HandleAgentResponse(const TSharedPtr<FJsonObject
OnAgentResponse.Broadcast(ResponseText);
}
void UElevenLabsWebSocketProxy::HandleAgentChatResponsePart()
void UElevenLabsWebSocketProxy::HandleAgentChatResponsePart(const TSharedPtr<FJsonObject>& Root)
{
// Ignore response parts that belong to a generation we have already interrupted.
// Without this guard, old parts arriving after SendInterrupt() would re-trigger
// OnAgentResponseStarted (bAgentResponseStartedFired was reset in SendUserTurnStart),
// causing the component to stop the newly-opened microphone — creating an infinite loop.
if (bIgnoreIncomingContent)
{
UE_LOG(LogElevenLabsWS, Verbose, TEXT("Discarding agent_chat_response_part (interrupt pending server ack)."));
return;
}
// agent_chat_response_part = the server is actively generating a response (LLM token stream).
// Fire OnAgentResponseStarted once per turn so the component can auto-stop the microphone
// if the Blueprint restarted listening before the server finished processing the previous turn.
@ -628,15 +583,39 @@ void UElevenLabsWebSocketProxy::HandleAgentChatResponsePart()
T, LatencyFromTurnEnd);
OnAgentResponseStarted.Broadcast();
}
// Subsequent parts logged at Verbose only (can be dozens per response).
// Extract the streaming text fragment and broadcast it.
// API structure:
// { "type": "agent_chat_response_part",
// "agent_chat_response_part_event": { "agent_response_part": "partial text" }
// }
const TSharedPtr<FJsonObject>* PartEvent = nullptr;
if (Root->TryGetObjectField(TEXT("agent_chat_response_part_event"), PartEvent) && PartEvent)
{
FString PartText;
if ((*PartEvent)->TryGetStringField(TEXT("agent_response_part"), PartText) && !PartText.IsEmpty())
{
OnAgentResponsePart.Broadcast(PartText);
}
}
}
void UElevenLabsWebSocketProxy::HandleInterruption(const TSharedPtr<FJsonObject>& Root)
{
// Server has acknowledged the interruption — the old generation is fully stopped.
// Resume accepting incoming audio and chat response parts (for the next turn).
bIgnoreIncomingContent = false;
UE_LOG(LogElevenLabsWS, Log, TEXT("Agent interrupted (server ack received — resuming content processing)."));
// Extract the interrupt event_id so we can filter stale audio frames.
// { "type": "interruption", "interruption_event": { "event_id": 42 } }
const TSharedPtr<FJsonObject>* InterruptEvent = nullptr;
if (Root->TryGetObjectField(TEXT("interruption_event"), InterruptEvent) && InterruptEvent)
{
int32 EventId = 0;
(*InterruptEvent)->TryGetNumberField(TEXT("event_id"), EventId);
if (EventId > LastInterruptEventId)
{
LastInterruptEventId = EventId;
}
}
UE_LOG(LogElevenLabsWS, Log, TEXT("Agent interrupted (server ack, LastInterruptEventId=%d)."), LastInterruptEventId);
OnInterrupted.Broadcast();
}

View File

@ -43,6 +43,15 @@ DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnAgentInterrupted);
*/
DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnAgentStartedGenerating);
/**
* Fired for every agent_chat_response_part streams the agent's text as the LLM
* generates it, token by token. Use this for real-time subtitles / text display.
* Each call provides the text fragment from that individual part (NOT accumulated).
* The final complete text is still available via OnAgentTextResponse (agent_response).
*/
DECLARE_DYNAMIC_MULTICAST_DELEGATE_OneParam(FOnAgentPartialResponse,
const FString&, PartialText);
/**
* Fired when the server has not started generating a response within ResponseTimeoutSeconds
* after the user stopped speaking (StopListening was called).
@ -138,6 +147,15 @@ public:
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events")
bool bEnableAgentTextResponse = true;
/**
* Forward streaming text parts (agent_chat_response_part events) to the
* OnAgentPartialResponse delegate. Each part is a text fragment as the LLM
* generates it use this for real-time subtitles that appear while the agent
* speaks, instead of waiting for the full text (OnAgentTextResponse).
*/
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events")
bool bEnableAgentPartialResponse = false;
/**
* How many seconds to wait for the server to start generating a response
* after the user stops speaking (StopListening) before firing OnAgentResponseTimeout.
@ -168,6 +186,14 @@ public:
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
FOnAgentTextResponse OnAgentTextResponse;
/**
* Streaming text fragments as the LLM generates them.
* Fires for every agent_chat_response_part each call gives one text chunk.
* Enable with bEnableAgentPartialResponse.
*/
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
FOnAgentPartialResponse OnAgentPartialResponse;
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
FOnAgentStartedSpeaking OnAgentStartedSpeaking;
@ -285,6 +311,9 @@ private:
UFUNCTION()
void HandleAgentResponseStarted();
UFUNCTION()
void HandleAgentResponsePart(const FString& PartialText);
// ── Audio playback ────────────────────────────────────────────────────────
void InitAudioPlayback();
void EnqueueAgentAudio(const TArray<uint8>& PCMData);
@ -371,5 +400,5 @@ private:
// ElevenLabs needs at least ~100ms (3200 bytes) per chunk for reliable VAD/STT.
// We accumulate here and only call SendAudioChunk once enough bytes are ready.
TArray<uint8> MicAccumulationBuffer;
static constexpr int32 MicChunkMinBytes = 3200; // 100ms @ 16kHz 16-bit mono
static constexpr int32 MicChunkMinBytes = 8000; // 250ms @ 16kHz 16-bit mono (4000 samples, matches ElevenLabs SDK recommendation)
};

View File

@ -43,6 +43,11 @@ DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnElevenLabsInterrupted);
*/
DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnElevenLabsAgentResponseStarted);
/** Fired for every agent_chat_response_part — streams the LLM text as it is generated.
* PartialText is the text fragment from this individual part (NOT accumulated). */
DECLARE_DYNAMIC_MULTICAST_DELEGATE_OneParam(FOnElevenLabsAgentResponsePart,
const FString&, PartialText);
// ─────────────────────────────────────────────────────────────────────────────
// WebSocket Proxy
@ -94,6 +99,10 @@ public:
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
FOnElevenLabsAgentResponseStarted OnAgentResponseStarted;
/** Fired for every agent_chat_response_part with the streaming text fragment. */
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
FOnElevenLabsAgentResponsePart OnAgentResponsePart;
// ── Lifecycle ─────────────────────────────────────────────────────────────
/**
@ -182,7 +191,7 @@ private:
void HandleAudioResponse(const TSharedPtr<FJsonObject>& Payload);
void HandleTranscript(const TSharedPtr<FJsonObject>& Payload);
void HandleAgentResponse(const TSharedPtr<FJsonObject>& Payload);
void HandleAgentChatResponsePart();
void HandleAgentChatResponsePart(const TSharedPtr<FJsonObject>& Payload);
void HandleInterruption(const TSharedPtr<FJsonObject>& Payload);
void HandlePing(const TSharedPtr<FJsonObject>& Payload);
@ -217,11 +226,12 @@ private:
// Used to compute [T+Xs] session-relative timestamps in all log messages.
double SessionStartTime = 0.0;
// Set to true in SendInterrupt() so that in-flight audio frames and
// agent_chat_response_part messages from the interrupted generation are silently
// discarded instead of re-triggering the speaking/generating state.
// Cleared when the server sends its "interruption" acknowledgement.
bool bIgnoreIncomingContent = false;
// ── Interrupt filtering (event_id approach, matching official SDK) ────────
// When the server sends an "interruption" event it includes an event_id.
// Audio events whose event_id <= LastInterruptEventId belong to the cancelled
// generation and must be discarded. Only AUDIO is filtered — transcripts,
// agent_response, agent_chat_response_part etc. are always processed.
int32 LastInterruptEventId = 0;
public:
// Set by UElevenLabsConversationalAgentComponent before calling Connect().