diff --git a/CPP/elevenlabs-convai-cpp-main/.gitignore b/CPP/elevenlabs-convai-cpp-main/.gitignore new file mode 100644 index 0000000..80545b2 --- /dev/null +++ b/CPP/elevenlabs-convai-cpp-main/.gitignore @@ -0,0 +1,152 @@ +# Build directories +build/ +cmake-build-*/ +out/ + +# Compiled Object files +*.slo +*.lo +*.o +*.obj + +# Precompiled Headers +*.gch +*.pch + +# Compiled Dynamic libraries +*.so +*.dylib +*.dll + +# Fortran module files +*.mod +*.smod + +# Compiled Static libraries +*.lai +*.la +*.a +*.lib + +# Executables +*.exe +*.out +*.app +convai_cpp + +# CMake +CMakeCache.txt +CMakeFiles/ +CMakeScripts/ +Testing/ +Makefile +cmake_install.cmake +install_manifest.txt +compile_commands.json +CTestTestfile.cmake +_deps/ + +# IDE files +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# macOS +.DS_Store +.AppleDouble +.LSOverride + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +# Windows +Thumbs.db +ehthumbs.db +Desktop.ini +$RECYCLE.BIN/ +*.cab +*.msi +*.msm +*.msp +*.lnk + +# Linux +*~ +.fuse_hidden* +.directory +.Trash-* +.nfs* + +# Logs +*.log + +# Runtime data +pids +*.pid +*.seed +*.pid.lock + +# Coverage directory used by tools like istanbul +coverage/ + +# nyc test coverage +.nyc_output + +# Dependency directories +node_modules/ + +# Optional npm cache directory +.npm + +# Optional REPL history +.node_repl_history + +# Output of 'npm pack' +*.tgz + +# Yarn Integrity file +.yarn-integrity + +# dotenv environment variables file +.env +.env.test + +# parcel-bundler cache (https://parceljs.org/) +.cache +.parcel-cache + +# next.js build output +.next + +# nuxt.js build output +.nuxt + +# vuepress build output +.vuepress/dist + +# Serverless directories +.serverless + +# FuseBox cache +.fusebox/ + +# DynamoDB Local files +.dynamodb/ \ No newline at end of file diff --git a/CPP/elevenlabs-convai-cpp-main/CMakeLists.txt b/CPP/elevenlabs-convai-cpp-main/CMakeLists.txt new file mode 100644 index 0000000..d206779 --- /dev/null +++ b/CPP/elevenlabs-convai-cpp-main/CMakeLists.txt @@ -0,0 +1,42 @@ +cmake_minimum_required(VERSION 3.14) + +project(elevenlabs_convai_cpp LANGUAGES CXX) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +# Find dependencies +find_package(Boost REQUIRED COMPONENTS system thread) +find_package(OpenSSL REQUIRED) +# PortAudio via vcpkg CMake config +find_package(portaudio CONFIG REQUIRED) + +# Find nlohmann_json +find_package(nlohmann_json 3.11 QUIET) + +if(NOT nlohmann_json_FOUND) + include(FetchContent) + # Fallback: header-only fetch to avoid old CMake policies in upstream CMakeLists + FetchContent_Declare( + nlohmann_json_src + URL https://raw.githubusercontent.com/nlohmann/json/v3.11.2/single_include/nlohmann/json.hpp + ) + FetchContent_MakeAvailable(nlohmann_json_src) + add_library(nlohmann_json::nlohmann_json INTERFACE IMPORTED) + target_include_directories(nlohmann_json::nlohmann_json INTERFACE ${nlohmann_json_src_SOURCE_DIR}/single_include) +endif() + +add_executable(convai_cpp + src/main.cpp + src/Conversation.cpp + src/DefaultAudioInterface.cpp +) + +target_include_directories(convai_cpp PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include) + +# MSVC: set Windows target version and suppress getenv deprecation warning +if(MSVC) + target_compile_definitions(convai_cpp PRIVATE _WIN32_WINNT=0x0A00 _CRT_SECURE_NO_WARNINGS) +endif() + +target_link_libraries(convai_cpp PRIVATE Boost::system Boost::thread OpenSSL::SSL OpenSSL::Crypto portaudio nlohmann_json::nlohmann_json) \ No newline at end of file diff --git a/CPP/elevenlabs-convai-cpp-main/LICENSE b/CPP/elevenlabs-convai-cpp-main/LICENSE new file mode 100644 index 0000000..3165092 --- /dev/null +++ b/CPP/elevenlabs-convai-cpp-main/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 Jitendra + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/CPP/elevenlabs-convai-cpp-main/README.md b/CPP/elevenlabs-convai-cpp-main/README.md new file mode 100644 index 0000000..f829e18 --- /dev/null +++ b/CPP/elevenlabs-convai-cpp-main/README.md @@ -0,0 +1,197 @@ +# ElevenLabs Conversational AI - C++ Implementation + +[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) +[![C++17](https://img.shields.io/badge/C%2B%2B-17-blue.svg)](https://en.wikipedia.org/wiki/C%2B%2B17) +[![CMake](https://img.shields.io/badge/CMake-3.14+-green.svg)](https://cmake.org/) + +C++ implementation of ElevenLabs Conversational AI client + +## Features + +- **Real-time Audio Processing**: Full-duplex audio streaming with low-latency playback +- **WebSocket Integration**: Secure WSS connection to ElevenLabs Conversational AI platform +- **Cross-platform Audio**: PortAudio-based implementation supporting Windows, macOS, and Linux +- **Echo Suppression**: Built-in acoustic feedback prevention +- **Modern C++**: Clean, maintainable C++17 codebase with proper RAII and exception handling +- **Flexible Architecture**: Modular design allowing easy customization and extension + +## Architecture + +```mermaid +graph TB + subgraph "User Interface" + A[main.cpp] --> B[Conversation] + end + + subgraph "Core Components" + B --> C[DefaultAudioInterface] + B --> D[WebSocket Client] + C --> E[PortAudio] + D --> F[Boost.Beast + OpenSSL] + end + + subgraph "ElevenLabs Platform" + F --> G[WSS API Endpoint] + G --> H[Conversational AI Agent] + end + + subgraph "Audio Flow" + I[Microphone] --> C + C --> J[Base64 Encoding] + J --> D + D --> K[Audio Events] + K --> L[Base64 Decoding] + L --> C + C --> M[Speakers] + end + + subgraph "Message Types" + N[user_audio_chunk] + O[agent_response] + P[user_transcript] + Q[audio_event] + R[ping/pong] + end + + style B fill:#e1f5fe + style C fill:#f3e5f5 + style D fill:#e8f5e8 + style H fill:#fff3e0 +``` + +## Quick Start + +### Prerequisites + +- **C++17 compatible compiler**: GCC 11+, Clang 14+, or MSVC 2022+ +- **CMake** 3.14 or higher +- **Dependencies** (install via package manager): + +#### macOS (Homebrew) +```bash +brew install boost openssl portaudio nlohmann-json cmake pkg-config +``` + +#### Ubuntu/Debian +```bash +sudo apt update +sudo apt install build-essential cmake pkg-config +sudo apt install libboost-system-dev libboost-thread-dev +sudo apt install libssl-dev libportaudio2-dev nlohmann-json3-dev +``` + +#### Windows (vcpkg) +```bash +vcpkg install boost-system boost-thread openssl portaudio nlohmann-json +``` + +### Building + +```bash +# Clone the repository +git clone https://github.com/Jitendra2603/elevenlabs-convai-cpp.git +cd elevenlabs-convai-cpp + +# Build the project +mkdir build && cd build +cmake .. +cmake --build . --config Release +``` + +### Running + +```bash +# Set your agent ID (get this from ElevenLabs dashboard) +export AGENT_ID="your-agent-id-here" + +# Run the demo +./convai_cpp +``` + +The application will: +1. Connect to your ElevenLabs Conversational AI agent +2. Start capturing audio from your default microphone +3. Stream audio to the agent and play responses through speakers +4. Display conversation transcripts in the terminal +5. Continue until you press Enter to quit + +## 📋 Usage Examples + +### Basic Conversation +```bash +export AGENT_ID="agent_" +./convai_cpp +# Speak into your microphone and hear the AI agent respond +``` + + +## Configuration + +### Audio Settings + +The audio interface is configured for optimal real-time performance: + +- **Sample Rate**: 16 kHz +- **Format**: 16-bit PCM mono +- **Input Buffer**: 250ms (4000 frames) +- **Output Buffer**: 62.5ms (1000 frames) + +### WebSocket Connection + +- **Endpoint**: `wss://api.elevenlabs.io/v1/convai/conversation` +- **Protocol**: WebSocket Secure (WSS) with TLS 1.2+ +- **Authentication**: Optional (required for private agents) + +## Project Structure + +``` +elevenlabs-convai-cpp/ +├── CMakeLists.txt # Build configuration +├── README.md # This file +├── LICENSE # MIT license +├── CONTRIBUTING.md # Contribution guidelines +├── .gitignore # Git ignore rules +├── include/ # Header files +│ ├── AudioInterface.hpp # Abstract audio interface +│ ├── DefaultAudioInterface.hpp # PortAudio implementation +│ └── Conversation.hpp # Main conversation handler +└── src/ # Source files + ├── main.cpp # Demo application + ├── Conversation.cpp # WebSocket and message handling + └── DefaultAudioInterface.cpp # Audio I/O implementation +``` + +## Technical Details + +### Audio Processing Pipeline + +1. **Capture**: PortAudio captures 16-bit PCM audio at 16kHz +2. **Encoding**: Raw audio is base64-encoded for WebSocket transmission +3. **Streaming**: Audio chunks sent as `user_audio_chunk` messages +4. **Reception**: Server sends `audio_event` messages with agent responses +5. **Decoding**: Base64 audio data decoded back to PCM +6. **Playback**: Audio queued and played through PortAudio output stream + +### Echo Suppression + +The implementation includes a simple, effective echo suppression mechanism: + +- Microphone input is suppressed during agent speech playback +- Prevents acoustic feedback loops that cause the agent to respond to itself +- Uses atomic flags for thread-safe coordination between input/output + +### WebSocket Message Handling + +Supported message types: +- `conversation_initiation_client_data` - Session initialization +- `user_audio_chunk` - Microphone audio data +- `audio_event` - Agent speech audio +- `agent_response` - Agent text responses +- `user_transcript` - Speech-to-text results +- `ping`/`pong` - Connection keepalive + + + +## 📝 License + +This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. diff --git a/CPP/elevenlabs-convai-cpp-main/include/AudioInterface.hpp b/CPP/elevenlabs-convai-cpp-main/include/AudioInterface.hpp new file mode 100644 index 0000000..b92bd42 --- /dev/null +++ b/CPP/elevenlabs-convai-cpp-main/include/AudioInterface.hpp @@ -0,0 +1,23 @@ +#pragma once + +#include +#include + +class AudioInterface { +public: + using AudioCallback = std::function&)>; + + virtual ~AudioInterface() = default; + + // Starts the audio interface. The callback will be invoked with raw 16-bit PCM mono samples at 16kHz. + virtual void start(AudioCallback inputCallback) = 0; + + // Stops audio I/O and releases underlying resources. + virtual void stop() = 0; + + // Play audio to the user; audio is 16-bit PCM mono 16kHz. + virtual void output(const std::vector& audio) = 0; + + // Immediately stop any buffered / ongoing output. + virtual void interrupt() = 0; +}; \ No newline at end of file diff --git a/CPP/elevenlabs-convai-cpp-main/include/Conversation.hpp b/CPP/elevenlabs-convai-cpp-main/include/Conversation.hpp new file mode 100644 index 0000000..04005e4 --- /dev/null +++ b/CPP/elevenlabs-convai-cpp-main/include/Conversation.hpp @@ -0,0 +1,72 @@ +#pragma once + +#include "AudioInterface.hpp" +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +class Conversation { +public: + using CallbackAgentResponse = std::function; + using CallbackAgentResponseCorrection = std::function; + using CallbackUserTranscript = std::function; + using CallbackLatencyMeasurement = std::function; + + Conversation( + const std::string& agentId, + bool requiresAuth, + std::shared_ptr audioInterface, + CallbackAgentResponse callbackAgentResponse = nullptr, + CallbackAgentResponseCorrection callbackAgentResponseCorrection = nullptr, + CallbackUserTranscript callbackUserTranscript = nullptr, + CallbackLatencyMeasurement callbackLatencyMeasurement = nullptr + ); + + ~Conversation(); + + void startSession(); + void endSession(); + std::string waitForSessionEnd(); + + void sendUserMessage(const std::string& text); + void registerUserActivity(); + void sendContextualUpdate(const std::string& content); + +private: + void run(); + void handleMessage(const nlohmann::json& message); + std::string getWssUrl() const; + + // networking members + boost::asio::io_context ioc_; + boost::asio::ssl::context sslCtx_{boost::asio::ssl::context::tlsv12_client}; + + using tcp = boost::asio::ip::tcp; + using websocket_t = boost::beast::websocket::stream< + boost::beast::ssl_stream>; + std::unique_ptr ws_; + + // general state + std::string agentId_; + bool requiresAuth_; + std::shared_ptr audioInterface_; + + CallbackAgentResponse callbackAgentResponse_; + CallbackAgentResponseCorrection callbackAgentResponseCorrection_; + CallbackUserTranscript callbackUserTranscript_; + CallbackLatencyMeasurement callbackLatencyMeasurement_; + + std::thread workerThread_; + std::atomic shouldStop_{false}; + std::string conversationId_; + + std::atomic lastInterruptId_{0}; +}; \ No newline at end of file diff --git a/CPP/elevenlabs-convai-cpp-main/include/DefaultAudioInterface.hpp b/CPP/elevenlabs-convai-cpp-main/include/DefaultAudioInterface.hpp new file mode 100644 index 0000000..39940e9 --- /dev/null +++ b/CPP/elevenlabs-convai-cpp-main/include/DefaultAudioInterface.hpp @@ -0,0 +1,45 @@ +#pragma once + +#include "AudioInterface.hpp" +#include +#include +#include +#include +#include +#include + +class DefaultAudioInterface : public AudioInterface { +public: + static constexpr int INPUT_FRAMES_PER_BUFFER = 4000; // 250ms @ 16kHz + static constexpr int OUTPUT_FRAMES_PER_BUFFER = 1000; // 62.5ms @ 16kHz + + DefaultAudioInterface(); + ~DefaultAudioInterface() override; + + void start(AudioCallback inputCallback) override; + void stop() override; + void output(const std::vector& audio) override; + void interrupt() override; + +private: + static int inputCallbackStatic(const void* input, void* output, unsigned long frameCount, + const PaStreamCallbackTimeInfo* timeInfo, PaStreamCallbackFlags statusFlags, + void* userData); + + int inputCallbackInternal(const void* input, unsigned long frameCount); + + void outputThreadFunc(); + + PaStream* inputStream_{}; + PaStream* outputStream_{}; + + AudioCallback inputCallback_; + + std::queue> outputQueue_; + std::mutex queueMutex_; + std::condition_variable queueCv_; + + std::thread outputThread_; + std::atomic shouldStop_{false}; + std::atomic outputPlaying_{false}; +}; \ No newline at end of file diff --git a/CPP/elevenlabs-convai-cpp-main/src/Conversation.cpp b/CPP/elevenlabs-convai-cpp-main/src/Conversation.cpp new file mode 100644 index 0000000..be11e7b --- /dev/null +++ b/CPP/elevenlabs-convai-cpp-main/src/Conversation.cpp @@ -0,0 +1,230 @@ +#include "Conversation.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using tcp = boost::asio::ip::tcp; +namespace ssl = boost::asio::ssl; +namespace websocket = boost::beast::websocket; +namespace beast = boost::beast; + +static std::string base64Encode(const std::vector& data) { + auto encodedSize = beast::detail::base64::encoded_size(data.size()); + std::string out(encodedSize, '\0'); + beast::detail::base64::encode(&out[0], data.data(), data.size()); + return out; +} + +static std::vector base64Decode(const std::string& str) { + auto decodedSize = beast::detail::base64::decoded_size(str.size()); + std::vector out(decodedSize); + auto result = beast::detail::base64::decode(out.data(), str.data(), str.size()); + out.resize(result.first); + return out; +} + +static std::string toString(const nlohmann::json& j){ + if(j.is_string()) return j.get(); + if(j.is_number_integer()) return std::to_string(j.get()); + return j.dump(); +} + +Conversation::Conversation(const std::string& agentId, bool requiresAuth, + std::shared_ptr audioInterface, + CallbackAgentResponse callbackAgentResponse, + CallbackAgentResponseCorrection callbackAgentResponseCorrection, + CallbackUserTranscript callbackUserTranscript, + CallbackLatencyMeasurement callbackLatencyMeasurement) + : agentId_(agentId), + requiresAuth_(requiresAuth), + audioInterface_(std::move(audioInterface)), + callbackAgentResponse_(std::move(callbackAgentResponse)), + callbackAgentResponseCorrection_(std::move(callbackAgentResponseCorrection)), + callbackUserTranscript_(std::move(callbackUserTranscript)), + callbackLatencyMeasurement_(std::move(callbackLatencyMeasurement)) { + + sslCtx_.set_default_verify_paths(); +} + +Conversation::~Conversation() { + endSession(); +} + +void Conversation::startSession() { + shouldStop_.store(false); + workerThread_ = std::thread(&Conversation::run, this); +} + +void Conversation::endSession() { + shouldStop_.store(true); + if (ws_) { + beast::error_code ec; + ws_->close(websocket::close_code::normal, ec); + } + if (audioInterface_) { + audioInterface_->stop(); + } + if (workerThread_.joinable()) { + workerThread_.join(); + } +} + +std::string Conversation::waitForSessionEnd() { + if (workerThread_.joinable()) { + workerThread_.join(); + } + return conversationId_; +} + +void Conversation::sendUserMessage(const std::string& text) { + if (!ws_) { + throw std::runtime_error("Session not started"); + } + nlohmann::json j = { + {"type", "user_message"}, + {"text", text} + }; + ws_->write(boost::asio::buffer(j.dump())); +} + +void Conversation::registerUserActivity() { + if (!ws_) throw std::runtime_error("Session not started"); + nlohmann::json j = {{"type", "user_activity"}}; + ws_->write(boost::asio::buffer(j.dump())); +} + +void Conversation::sendContextualUpdate(const std::string& content) { + if (!ws_) throw std::runtime_error("Session not started"); + nlohmann::json j = {{"type", "contextual_update"}, {"content", content}}; + ws_->write(boost::asio::buffer(j.dump())); +} + +std::string Conversation::getWssUrl() const { + // Hard-coded base env for demo; in production you'd call ElevenLabs env endpoint. + std::ostringstream oss; + oss << "wss://api.elevenlabs.io/v1/convai/conversation?agent_id=" << agentId_; + return oss.str(); +} + +void Conversation::run() { + try { + auto url = getWssUrl(); + std::string protocol, host, target; + unsigned short port = 443; + + // Very naive parse: wss://host[:port]/path?query + if (boost::starts_with(url, "wss://")) { + protocol = "wss"; + host = url.substr(6); + } else { + throw std::runtime_error("Only wss:// URLs supported in this demo"); + } + auto slashPos = host.find('/'); + if (slashPos == std::string::npos) { + target = "/"; + } else { + target = host.substr(slashPos); + host = host.substr(0, slashPos); + } + auto colonPos = host.find(':'); + if (colonPos != std::string::npos) { + port = static_cast(std::stoi(host.substr(colonPos + 1))); + host = host.substr(0, colonPos); + } + + tcp::resolver resolver(ioc_); + auto const results = resolver.resolve(host, std::to_string(port)); + + beast::ssl_stream stream(ioc_, sslCtx_); + boost::asio::connect(beast::get_lowest_layer(stream), results); + if (!SSL_set_tlsext_host_name(stream.native_handle(), host.c_str())) { + throw std::runtime_error("Failed to set SNI hostname on SSL stream"); + } + stream.handshake(ssl::stream_base::client); + + ws_ = std::make_unique(std::move(stream)); + ws_->set_option(websocket::stream_base::timeout::suggested(beast::role_type::client)); + ws_->handshake(host, target); + + // send initiation data + nlohmann::json init = { + {"type", "conversation_initiation_client_data"}, + {"custom_llm_extra_body", nlohmann::json::object()}, + {"conversation_config_override", nlohmann::json::object()}, + {"dynamic_variables", nlohmann::json::object()} + }; + ws_->write(boost::asio::buffer(init.dump())); + + // Prepare audio callback + auto inputCb = [this](const std::vector& audio) { + nlohmann::json msg = { + {"user_audio_chunk", base64Encode(audio)} + }; + ws_->write(boost::asio::buffer(msg.dump())); + }; + audioInterface_->start(inputCb); + + beast::flat_buffer buffer; + while (!shouldStop_.load()) { + beast::error_code ec; + ws_->read(buffer, ec); + if (ec) { + std::cerr << "Websocket read error: " << ec.message() << std::endl; + break; + } + auto text = beast::buffers_to_string(buffer.data()); + buffer.consume(buffer.size()); + try { + auto message = nlohmann::json::parse(text); + handleMessage(message); + } catch (const std::exception& ex) { + std::cerr << "JSON parse error: " << ex.what() << std::endl; + } + } + } catch (const std::exception& ex) { + std::cerr << "Conversation error: " << ex.what() << std::endl; + } +} + +void Conversation::handleMessage(const nlohmann::json& message) { + std::string type = message.value("type", ""); + if (type == "conversation_initiation_metadata") { + conversationId_ = message["conversation_initiation_metadata_event"]["conversation_id"].get(); + } else if (type == "audio") { + auto event = message["audio_event"]; + int eventId = std::stoi(toString(event["event_id"])); + if (eventId <= lastInterruptId_.load()) return; + auto audioBytes = base64Decode(event["audio_base_64"].get()); + audioInterface_->output(audioBytes); + } else if (type == "agent_response" && callbackAgentResponse_) { + auto event = message["agent_response_event"]; + callbackAgentResponse_(event["agent_response"].get()); + } else if (type == "agent_response_correction" && callbackAgentResponseCorrection_) { + auto event = message["agent_response_correction_event"]; + callbackAgentResponseCorrection_(event["original_agent_response"].get(), + event["corrected_agent_response"].get()); + } else if (type == "user_transcript" && callbackUserTranscript_) { + auto event = message["user_transcription_event"]; + callbackUserTranscript_(event["user_transcript"].get()); + } else if (type == "interruption") { + auto event = message["interruption_event"]; + lastInterruptId_.store(std::stoi(toString(event["event_id"]))); + audioInterface_->interrupt(); + } else if (type == "ping") { + auto event = message["ping_event"]; + nlohmann::json pong = {{"type", "pong"}, {"event_id", event["event_id"]}}; + ws_->write(boost::asio::buffer(pong.dump())); + if (callbackLatencyMeasurement_ && event.contains("ping_ms")) { + int latency = event["ping_ms"].is_number() ? event["ping_ms"].get() : std::stoi(event["ping_ms"].get()); + callbackLatencyMeasurement_(latency); + } + } + // Note: client tool call handling omitted for brevity. +} \ No newline at end of file diff --git a/CPP/elevenlabs-convai-cpp-main/src/DefaultAudioInterface.cpp b/CPP/elevenlabs-convai-cpp-main/src/DefaultAudioInterface.cpp new file mode 100644 index 0000000..9a0ca84 --- /dev/null +++ b/CPP/elevenlabs-convai-cpp-main/src/DefaultAudioInterface.cpp @@ -0,0 +1,131 @@ +#include "DefaultAudioInterface.hpp" + +#include +#include + +DefaultAudioInterface::DefaultAudioInterface() { + PaError err = Pa_Initialize(); + if (err != paNoError) { + throw std::runtime_error("PortAudio initialization failed"); + } +} + +DefaultAudioInterface::~DefaultAudioInterface() { + if (!shouldStop_.load()) { + stop(); + } + Pa_Terminate(); +} + +void DefaultAudioInterface::start(AudioCallback inputCallback) { + inputCallback_ = std::move(inputCallback); + PaStreamParameters inputParams; + std::memset(&inputParams, 0, sizeof(inputParams)); + inputParams.channelCount = 1; + inputParams.device = Pa_GetDefaultInputDevice(); + inputParams.sampleFormat = paInt16; + inputParams.suggestedLatency = Pa_GetDeviceInfo(inputParams.device)->defaultLowInputLatency; + inputParams.hostApiSpecificStreamInfo = nullptr; + + PaStreamParameters outputParams; + std::memset(&outputParams, 0, sizeof(outputParams)); + outputParams.channelCount = 1; + outputParams.device = Pa_GetDefaultOutputDevice(); + outputParams.sampleFormat = paInt16; + outputParams.suggestedLatency = Pa_GetDeviceInfo(outputParams.device)->defaultLowOutputLatency; + outputParams.hostApiSpecificStreamInfo = nullptr; + + PaError err = Pa_OpenStream(&inputStream_, &inputParams, nullptr, 16000, INPUT_FRAMES_PER_BUFFER, paClipOff, + &DefaultAudioInterface::inputCallbackStatic, this); + if (err != paNoError) { + throw std::runtime_error("Failed to open input stream"); + } + + err = Pa_OpenStream(&outputStream_, nullptr, &outputParams, 16000, OUTPUT_FRAMES_PER_BUFFER, paClipOff, nullptr, nullptr); + if (err != paNoError) { + throw std::runtime_error("Failed to open output stream"); + } + + if ((err = Pa_StartStream(inputStream_)) != paNoError) { + throw std::runtime_error("Failed to start input stream"); + } + if ((err = Pa_StartStream(outputStream_)) != paNoError) { + throw std::runtime_error("Failed to start output stream"); + } + + shouldStop_.store(false); + outputThread_ = std::thread(&DefaultAudioInterface::outputThreadFunc, this); +} + +void DefaultAudioInterface::stop() { + shouldStop_.store(true); + queueCv_.notify_all(); + if (outputThread_.joinable()) { + outputThread_.join(); + } + + if (inputStream_) { + Pa_StopStream(inputStream_); + Pa_CloseStream(inputStream_); + inputStream_ = nullptr; + } + if (outputStream_) { + Pa_StopStream(outputStream_); + Pa_CloseStream(outputStream_); + outputStream_ = nullptr; + } +} + +void DefaultAudioInterface::output(const std::vector& audio) { + { + std::lock_guard lg(queueMutex_); + outputQueue_.emplace(audio); + } + queueCv_.notify_one(); +} + +void DefaultAudioInterface::interrupt() { + std::lock_guard lg(queueMutex_); + std::queue> empty; + std::swap(outputQueue_, empty); +} + +int DefaultAudioInterface::inputCallbackStatic(const void* input, void* /*output*/, unsigned long frameCount, + const PaStreamCallbackTimeInfo* /*timeInfo*/, PaStreamCallbackFlags /*statusFlags*/, + void* userData) { + auto* self = static_cast(userData); + return self->inputCallbackInternal(input, frameCount); +} + +int DefaultAudioInterface::inputCallbackInternal(const void* input, unsigned long frameCount) { + if (!input || !inputCallback_) { + return paContinue; + } + if (outputPlaying_.load()) { + // Suppress microphone input while playing output to avoid echo feedback. + return paContinue; + } + const size_t bytes = frameCount * sizeof(int16_t); + std::vector buffer(bytes); + std::memcpy(buffer.data(), input, bytes); + inputCallback_(buffer); + return paContinue; +} + +void DefaultAudioInterface::outputThreadFunc() { + while (!shouldStop_.load()) { + std::vector audio; + { + std::unique_lock lk(queueMutex_); + queueCv_.wait(lk, [this] { return shouldStop_.load() || !outputQueue_.empty(); }); + if (shouldStop_.load()) break; + audio = std::move(outputQueue_.front()); + outputQueue_.pop(); + } + if (!audio.empty() && outputStream_) { + outputPlaying_.store(true); + Pa_WriteStream(outputStream_, audio.data(), audio.size() / sizeof(int16_t)); + outputPlaying_.store(false); + } + } +} \ No newline at end of file diff --git a/CPP/elevenlabs-convai-cpp-main/src/main.cpp b/CPP/elevenlabs-convai-cpp-main/src/main.cpp new file mode 100644 index 0000000..f8522bd --- /dev/null +++ b/CPP/elevenlabs-convai-cpp-main/src/main.cpp @@ -0,0 +1,31 @@ +#include "Conversation.hpp" +#include "DefaultAudioInterface.hpp" + +#include +#include +#include + +int main() { + const char* agentIdEnv = std::getenv("AGENT_ID"); + if (!agentIdEnv) { + std::cerr << "AGENT_ID environment variable must be set" << std::endl; + return 1; + } + std::string agentId(agentIdEnv); + + auto audioInterface = std::make_shared(); + Conversation conv(agentId, /*requiresAuth*/ false, audioInterface, + [](const std::string& resp) { std::cout << "Agent: " << resp << std::endl; }, + [](const std::string& orig, const std::string& corrected) { + std::cout << "Agent correction: " << orig << " -> " << corrected << std::endl; }, + [](const std::string& transcript) { std::cout << "User: " << transcript << std::endl; }); + + conv.startSession(); + + std::cout << "Press Enter to quit..." << std::endl; + std::cin.get(); + conv.endSession(); + auto convId = conv.waitForSessionEnd(); + std::cout << "Conversation ID: " << convId << std::endl; + return 0; +} \ No newline at end of file diff --git a/Unreal/PS_AI_Agent/Content/test_AI_Actor.uasset b/Unreal/PS_AI_Agent/Content/test_AI_Actor.uasset index 748ebd5..a4affba 100644 Binary files a/Unreal/PS_AI_Agent/Content/test_AI_Actor.uasset and b/Unreal/PS_AI_Agent/Content/test_AI_Actor.uasset differ diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp index c13d380..4e4e0f4 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp @@ -604,9 +604,15 @@ void UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured(const TAr { if (!IsConnected() || !bIsListening) return; + // Echo suppression: skip sending mic audio while the agent is speaking. + // This prevents the agent from hearing its own voice through the speakers, + // which would confuse the server's VAD and STT. Matches the approach used + // in the official ElevenLabs C++ SDK (outputPlaying_ flag). + if (bAgentSpeaking) return; + // Convert this callback's samples to int16 bytes and accumulate. - // WASAPI fires every ~5ms (158 bytes at 16kHz). ElevenLabs needs ≥100ms - // (3200 bytes) per chunk for reliable VAD and STT. We hold bytes here + // WASAPI fires every ~5ms (158 bytes at 16kHz). ElevenLabs needs ≥250ms + // (8000 bytes) per chunk for reliable VAD and STT. We hold bytes here // until we have enough, then send the whole batch in one WebSocket frame. TArray PCMBytes = FloatPCMToInt16Bytes(FloatPCM); MicAccumulationBuffer.Append(PCMBytes); diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp index 3f3215b..83f273d 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp @@ -491,6 +491,17 @@ void UElevenLabsWebSocketProxy::HandleAudioResponse(const TSharedPtrTryGetNumberField(TEXT("event_id"), EventId); + if (EventId > 0 && EventId <= LastInterruptEventId) + { + UE_LOG(LogElevenLabsWS, Verbose, TEXT("Discarding audio event_id=%d (interrupted at %d)."), EventId, LastInterruptEventId); + return; + } + FString Base64Audio; if (!(*AudioEvent)->TryGetStringField(TEXT("audio_base_64"), Base64Audio)) { @@ -591,7 +602,20 @@ void UElevenLabsWebSocketProxy::HandleAgentChatResponsePart(const TSharedPtr& Root) { - UE_LOG(LogElevenLabsWS, Log, TEXT("Agent interrupted (server ack received).")); + // Extract the interrupt event_id so we can filter stale audio frames. + // { "type": "interruption", "interruption_event": { "event_id": 42 } } + const TSharedPtr* InterruptEvent = nullptr; + if (Root->TryGetObjectField(TEXT("interruption_event"), InterruptEvent) && InterruptEvent) + { + int32 EventId = 0; + (*InterruptEvent)->TryGetNumberField(TEXT("event_id"), EventId); + if (EventId > LastInterruptEventId) + { + LastInterruptEventId = EventId; + } + } + + UE_LOG(LogElevenLabsWS, Log, TEXT("Agent interrupted (server ack, LastInterruptEventId=%d)."), LastInterruptEventId); OnInterrupted.Broadcast(); } diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h index 05d0665..27f93b7 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h @@ -400,5 +400,5 @@ private: // ElevenLabs needs at least ~100ms (3200 bytes) per chunk for reliable VAD/STT. // We accumulate here and only call SendAudioChunk once enough bytes are ready. TArray MicAccumulationBuffer; - static constexpr int32 MicChunkMinBytes = 3200; // 100ms @ 16kHz 16-bit mono + static constexpr int32 MicChunkMinBytes = 8000; // 250ms @ 16kHz 16-bit mono (4000 samples, matches ElevenLabs SDK recommendation) }; diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h index 1cc0efc..fa1ee67 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h @@ -226,6 +226,13 @@ private: // Used to compute [T+Xs] session-relative timestamps in all log messages. double SessionStartTime = 0.0; + // ── Interrupt filtering (event_id approach, matching official SDK) ──────── + // When the server sends an "interruption" event it includes an event_id. + // Audio events whose event_id <= LastInterruptEventId belong to the cancelled + // generation and must be discarded. Only AUDIO is filtered — transcripts, + // agent_response, agent_chat_response_part etc. are always processed. + int32 LastInterruptEventId = 0; + public: // Set by UElevenLabsConversationalAgentComponent before calling Connect(). // Controls turn_timeout in conversation_initiation_client_data.