Ajout sample CPP
This commit is contained in:
parent
d8957625f8
commit
f23acc8c1c
152
CPP/elevenlabs-convai-cpp-main/.gitignore
vendored
Normal file
152
CPP/elevenlabs-convai-cpp-main/.gitignore
vendored
Normal file
@ -0,0 +1,152 @@
|
|||||||
|
# Build directories
|
||||||
|
build/
|
||||||
|
cmake-build-*/
|
||||||
|
out/
|
||||||
|
|
||||||
|
# Compiled Object files
|
||||||
|
*.slo
|
||||||
|
*.lo
|
||||||
|
*.o
|
||||||
|
*.obj
|
||||||
|
|
||||||
|
# Precompiled Headers
|
||||||
|
*.gch
|
||||||
|
*.pch
|
||||||
|
|
||||||
|
# Compiled Dynamic libraries
|
||||||
|
*.so
|
||||||
|
*.dylib
|
||||||
|
*.dll
|
||||||
|
|
||||||
|
# Fortran module files
|
||||||
|
*.mod
|
||||||
|
*.smod
|
||||||
|
|
||||||
|
# Compiled Static libraries
|
||||||
|
*.lai
|
||||||
|
*.la
|
||||||
|
*.a
|
||||||
|
*.lib
|
||||||
|
|
||||||
|
# Executables
|
||||||
|
*.exe
|
||||||
|
*.out
|
||||||
|
*.app
|
||||||
|
convai_cpp
|
||||||
|
|
||||||
|
# CMake
|
||||||
|
CMakeCache.txt
|
||||||
|
CMakeFiles/
|
||||||
|
CMakeScripts/
|
||||||
|
Testing/
|
||||||
|
Makefile
|
||||||
|
cmake_install.cmake
|
||||||
|
install_manifest.txt
|
||||||
|
compile_commands.json
|
||||||
|
CTestTestfile.cmake
|
||||||
|
_deps/
|
||||||
|
|
||||||
|
# IDE files
|
||||||
|
.vscode/
|
||||||
|
.idea/
|
||||||
|
*.swp
|
||||||
|
*.swo
|
||||||
|
*~
|
||||||
|
|
||||||
|
# macOS
|
||||||
|
.DS_Store
|
||||||
|
.AppleDouble
|
||||||
|
.LSOverride
|
||||||
|
|
||||||
|
# Thumbnails
|
||||||
|
._*
|
||||||
|
|
||||||
|
# Files that might appear in the root of a volume
|
||||||
|
.DocumentRevisions-V100
|
||||||
|
.fseventsd
|
||||||
|
.Spotlight-V100
|
||||||
|
.TemporaryItems
|
||||||
|
.Trashes
|
||||||
|
.VolumeIcon.icns
|
||||||
|
.com.apple.timemachine.donotpresent
|
||||||
|
|
||||||
|
# Directories potentially created on remote AFP share
|
||||||
|
.AppleDB
|
||||||
|
.AppleDesktop
|
||||||
|
Network Trash Folder
|
||||||
|
Temporary Items
|
||||||
|
.apdisk
|
||||||
|
|
||||||
|
# Windows
|
||||||
|
Thumbs.db
|
||||||
|
ehthumbs.db
|
||||||
|
Desktop.ini
|
||||||
|
$RECYCLE.BIN/
|
||||||
|
*.cab
|
||||||
|
*.msi
|
||||||
|
*.msm
|
||||||
|
*.msp
|
||||||
|
*.lnk
|
||||||
|
|
||||||
|
# Linux
|
||||||
|
*~
|
||||||
|
.fuse_hidden*
|
||||||
|
.directory
|
||||||
|
.Trash-*
|
||||||
|
.nfs*
|
||||||
|
|
||||||
|
# Logs
|
||||||
|
*.log
|
||||||
|
|
||||||
|
# Runtime data
|
||||||
|
pids
|
||||||
|
*.pid
|
||||||
|
*.seed
|
||||||
|
*.pid.lock
|
||||||
|
|
||||||
|
# Coverage directory used by tools like istanbul
|
||||||
|
coverage/
|
||||||
|
|
||||||
|
# nyc test coverage
|
||||||
|
.nyc_output
|
||||||
|
|
||||||
|
# Dependency directories
|
||||||
|
node_modules/
|
||||||
|
|
||||||
|
# Optional npm cache directory
|
||||||
|
.npm
|
||||||
|
|
||||||
|
# Optional REPL history
|
||||||
|
.node_repl_history
|
||||||
|
|
||||||
|
# Output of 'npm pack'
|
||||||
|
*.tgz
|
||||||
|
|
||||||
|
# Yarn Integrity file
|
||||||
|
.yarn-integrity
|
||||||
|
|
||||||
|
# dotenv environment variables file
|
||||||
|
.env
|
||||||
|
.env.test
|
||||||
|
|
||||||
|
# parcel-bundler cache (https://parceljs.org/)
|
||||||
|
.cache
|
||||||
|
.parcel-cache
|
||||||
|
|
||||||
|
# next.js build output
|
||||||
|
.next
|
||||||
|
|
||||||
|
# nuxt.js build output
|
||||||
|
.nuxt
|
||||||
|
|
||||||
|
# vuepress build output
|
||||||
|
.vuepress/dist
|
||||||
|
|
||||||
|
# Serverless directories
|
||||||
|
.serverless
|
||||||
|
|
||||||
|
# FuseBox cache
|
||||||
|
.fusebox/
|
||||||
|
|
||||||
|
# DynamoDB Local files
|
||||||
|
.dynamodb/
|
||||||
42
CPP/elevenlabs-convai-cpp-main/CMakeLists.txt
Normal file
42
CPP/elevenlabs-convai-cpp-main/CMakeLists.txt
Normal file
@ -0,0 +1,42 @@
|
|||||||
|
cmake_minimum_required(VERSION 3.14)
|
||||||
|
|
||||||
|
project(elevenlabs_convai_cpp LANGUAGES CXX)
|
||||||
|
|
||||||
|
set(CMAKE_CXX_STANDARD 17)
|
||||||
|
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||||
|
|
||||||
|
# Find dependencies
|
||||||
|
find_package(Boost REQUIRED COMPONENTS system thread)
|
||||||
|
find_package(OpenSSL REQUIRED)
|
||||||
|
# PortAudio via vcpkg CMake config
|
||||||
|
find_package(portaudio CONFIG REQUIRED)
|
||||||
|
|
||||||
|
# Find nlohmann_json
|
||||||
|
find_package(nlohmann_json 3.11 QUIET)
|
||||||
|
|
||||||
|
if(NOT nlohmann_json_FOUND)
|
||||||
|
include(FetchContent)
|
||||||
|
# Fallback: header-only fetch to avoid old CMake policies in upstream CMakeLists
|
||||||
|
FetchContent_Declare(
|
||||||
|
nlohmann_json_src
|
||||||
|
URL https://raw.githubusercontent.com/nlohmann/json/v3.11.2/single_include/nlohmann/json.hpp
|
||||||
|
)
|
||||||
|
FetchContent_MakeAvailable(nlohmann_json_src)
|
||||||
|
add_library(nlohmann_json::nlohmann_json INTERFACE IMPORTED)
|
||||||
|
target_include_directories(nlohmann_json::nlohmann_json INTERFACE ${nlohmann_json_src_SOURCE_DIR}/single_include)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
add_executable(convai_cpp
|
||||||
|
src/main.cpp
|
||||||
|
src/Conversation.cpp
|
||||||
|
src/DefaultAudioInterface.cpp
|
||||||
|
)
|
||||||
|
|
||||||
|
target_include_directories(convai_cpp PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
|
||||||
|
|
||||||
|
# MSVC: set Windows target version and suppress getenv deprecation warning
|
||||||
|
if(MSVC)
|
||||||
|
target_compile_definitions(convai_cpp PRIVATE _WIN32_WINNT=0x0A00 _CRT_SECURE_NO_WARNINGS)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
target_link_libraries(convai_cpp PRIVATE Boost::system Boost::thread OpenSSL::SSL OpenSSL::Crypto portaudio nlohmann_json::nlohmann_json)
|
||||||
21
CPP/elevenlabs-convai-cpp-main/LICENSE
Normal file
21
CPP/elevenlabs-convai-cpp-main/LICENSE
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2024 Jitendra
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
||||||
197
CPP/elevenlabs-convai-cpp-main/README.md
Normal file
197
CPP/elevenlabs-convai-cpp-main/README.md
Normal file
@ -0,0 +1,197 @@
|
|||||||
|
# ElevenLabs Conversational AI - C++ Implementation
|
||||||
|
|
||||||
|
[](https://opensource.org/licenses/MIT)
|
||||||
|
[](https://en.wikipedia.org/wiki/C%2B%2B17)
|
||||||
|
[](https://cmake.org/)
|
||||||
|
|
||||||
|
C++ implementation of ElevenLabs Conversational AI client
|
||||||
|
|
||||||
|
## Features
|
||||||
|
|
||||||
|
- **Real-time Audio Processing**: Full-duplex audio streaming with low-latency playback
|
||||||
|
- **WebSocket Integration**: Secure WSS connection to ElevenLabs Conversational AI platform
|
||||||
|
- **Cross-platform Audio**: PortAudio-based implementation supporting Windows, macOS, and Linux
|
||||||
|
- **Echo Suppression**: Built-in acoustic feedback prevention
|
||||||
|
- **Modern C++**: Clean, maintainable C++17 codebase with proper RAII and exception handling
|
||||||
|
- **Flexible Architecture**: Modular design allowing easy customization and extension
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
graph TB
|
||||||
|
subgraph "User Interface"
|
||||||
|
A[main.cpp] --> B[Conversation]
|
||||||
|
end
|
||||||
|
|
||||||
|
subgraph "Core Components"
|
||||||
|
B --> C[DefaultAudioInterface]
|
||||||
|
B --> D[WebSocket Client]
|
||||||
|
C --> E[PortAudio]
|
||||||
|
D --> F[Boost.Beast + OpenSSL]
|
||||||
|
end
|
||||||
|
|
||||||
|
subgraph "ElevenLabs Platform"
|
||||||
|
F --> G[WSS API Endpoint]
|
||||||
|
G --> H[Conversational AI Agent]
|
||||||
|
end
|
||||||
|
|
||||||
|
subgraph "Audio Flow"
|
||||||
|
I[Microphone] --> C
|
||||||
|
C --> J[Base64 Encoding]
|
||||||
|
J --> D
|
||||||
|
D --> K[Audio Events]
|
||||||
|
K --> L[Base64 Decoding]
|
||||||
|
L --> C
|
||||||
|
C --> M[Speakers]
|
||||||
|
end
|
||||||
|
|
||||||
|
subgraph "Message Types"
|
||||||
|
N[user_audio_chunk]
|
||||||
|
O[agent_response]
|
||||||
|
P[user_transcript]
|
||||||
|
Q[audio_event]
|
||||||
|
R[ping/pong]
|
||||||
|
end
|
||||||
|
|
||||||
|
style B fill:#e1f5fe
|
||||||
|
style C fill:#f3e5f5
|
||||||
|
style D fill:#e8f5e8
|
||||||
|
style H fill:#fff3e0
|
||||||
|
```
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
### Prerequisites
|
||||||
|
|
||||||
|
- **C++17 compatible compiler**: GCC 11+, Clang 14+, or MSVC 2022+
|
||||||
|
- **CMake** 3.14 or higher
|
||||||
|
- **Dependencies** (install via package manager):
|
||||||
|
|
||||||
|
#### macOS (Homebrew)
|
||||||
|
```bash
|
||||||
|
brew install boost openssl portaudio nlohmann-json cmake pkg-config
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Ubuntu/Debian
|
||||||
|
```bash
|
||||||
|
sudo apt update
|
||||||
|
sudo apt install build-essential cmake pkg-config
|
||||||
|
sudo apt install libboost-system-dev libboost-thread-dev
|
||||||
|
sudo apt install libssl-dev libportaudio2-dev nlohmann-json3-dev
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Windows (vcpkg)
|
||||||
|
```bash
|
||||||
|
vcpkg install boost-system boost-thread openssl portaudio nlohmann-json
|
||||||
|
```
|
||||||
|
|
||||||
|
### Building
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Clone the repository
|
||||||
|
git clone https://github.com/Jitendra2603/elevenlabs-convai-cpp.git
|
||||||
|
cd elevenlabs-convai-cpp
|
||||||
|
|
||||||
|
# Build the project
|
||||||
|
mkdir build && cd build
|
||||||
|
cmake ..
|
||||||
|
cmake --build . --config Release
|
||||||
|
```
|
||||||
|
|
||||||
|
### Running
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Set your agent ID (get this from ElevenLabs dashboard)
|
||||||
|
export AGENT_ID="your-agent-id-here"
|
||||||
|
|
||||||
|
# Run the demo
|
||||||
|
./convai_cpp
|
||||||
|
```
|
||||||
|
|
||||||
|
The application will:
|
||||||
|
1. Connect to your ElevenLabs Conversational AI agent
|
||||||
|
2. Start capturing audio from your default microphone
|
||||||
|
3. Stream audio to the agent and play responses through speakers
|
||||||
|
4. Display conversation transcripts in the terminal
|
||||||
|
5. Continue until you press Enter to quit
|
||||||
|
|
||||||
|
## 📋 Usage Examples
|
||||||
|
|
||||||
|
### Basic Conversation
|
||||||
|
```bash
|
||||||
|
export AGENT_ID="agent_"
|
||||||
|
./convai_cpp
|
||||||
|
# Speak into your microphone and hear the AI agent respond
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
### Audio Settings
|
||||||
|
|
||||||
|
The audio interface is configured for optimal real-time performance:
|
||||||
|
|
||||||
|
- **Sample Rate**: 16 kHz
|
||||||
|
- **Format**: 16-bit PCM mono
|
||||||
|
- **Input Buffer**: 250ms (4000 frames)
|
||||||
|
- **Output Buffer**: 62.5ms (1000 frames)
|
||||||
|
|
||||||
|
### WebSocket Connection
|
||||||
|
|
||||||
|
- **Endpoint**: `wss://api.elevenlabs.io/v1/convai/conversation`
|
||||||
|
- **Protocol**: WebSocket Secure (WSS) with TLS 1.2+
|
||||||
|
- **Authentication**: Optional (required for private agents)
|
||||||
|
|
||||||
|
## Project Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
elevenlabs-convai-cpp/
|
||||||
|
├── CMakeLists.txt # Build configuration
|
||||||
|
├── README.md # This file
|
||||||
|
├── LICENSE # MIT license
|
||||||
|
├── CONTRIBUTING.md # Contribution guidelines
|
||||||
|
├── .gitignore # Git ignore rules
|
||||||
|
├── include/ # Header files
|
||||||
|
│ ├── AudioInterface.hpp # Abstract audio interface
|
||||||
|
│ ├── DefaultAudioInterface.hpp # PortAudio implementation
|
||||||
|
│ └── Conversation.hpp # Main conversation handler
|
||||||
|
└── src/ # Source files
|
||||||
|
├── main.cpp # Demo application
|
||||||
|
├── Conversation.cpp # WebSocket and message handling
|
||||||
|
└── DefaultAudioInterface.cpp # Audio I/O implementation
|
||||||
|
```
|
||||||
|
|
||||||
|
## Technical Details
|
||||||
|
|
||||||
|
### Audio Processing Pipeline
|
||||||
|
|
||||||
|
1. **Capture**: PortAudio captures 16-bit PCM audio at 16kHz
|
||||||
|
2. **Encoding**: Raw audio is base64-encoded for WebSocket transmission
|
||||||
|
3. **Streaming**: Audio chunks sent as `user_audio_chunk` messages
|
||||||
|
4. **Reception**: Server sends `audio_event` messages with agent responses
|
||||||
|
5. **Decoding**: Base64 audio data decoded back to PCM
|
||||||
|
6. **Playback**: Audio queued and played through PortAudio output stream
|
||||||
|
|
||||||
|
### Echo Suppression
|
||||||
|
|
||||||
|
The implementation includes a simple, effective echo suppression mechanism:
|
||||||
|
|
||||||
|
- Microphone input is suppressed during agent speech playback
|
||||||
|
- Prevents acoustic feedback loops that cause the agent to respond to itself
|
||||||
|
- Uses atomic flags for thread-safe coordination between input/output
|
||||||
|
|
||||||
|
### WebSocket Message Handling
|
||||||
|
|
||||||
|
Supported message types:
|
||||||
|
- `conversation_initiation_client_data` - Session initialization
|
||||||
|
- `user_audio_chunk` - Microphone audio data
|
||||||
|
- `audio_event` - Agent speech audio
|
||||||
|
- `agent_response` - Agent text responses
|
||||||
|
- `user_transcript` - Speech-to-text results
|
||||||
|
- `ping`/`pong` - Connection keepalive
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## 📝 License
|
||||||
|
|
||||||
|
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
||||||
23
CPP/elevenlabs-convai-cpp-main/include/AudioInterface.hpp
Normal file
23
CPP/elevenlabs-convai-cpp-main/include/AudioInterface.hpp
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <functional>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
class AudioInterface {
|
||||||
|
public:
|
||||||
|
using AudioCallback = std::function<void(const std::vector<char>&)>;
|
||||||
|
|
||||||
|
virtual ~AudioInterface() = default;
|
||||||
|
|
||||||
|
// Starts the audio interface. The callback will be invoked with raw 16-bit PCM mono samples at 16kHz.
|
||||||
|
virtual void start(AudioCallback inputCallback) = 0;
|
||||||
|
|
||||||
|
// Stops audio I/O and releases underlying resources.
|
||||||
|
virtual void stop() = 0;
|
||||||
|
|
||||||
|
// Play audio to the user; audio is 16-bit PCM mono 16kHz.
|
||||||
|
virtual void output(const std::vector<char>& audio) = 0;
|
||||||
|
|
||||||
|
// Immediately stop any buffered / ongoing output.
|
||||||
|
virtual void interrupt() = 0;
|
||||||
|
};
|
||||||
72
CPP/elevenlabs-convai-cpp-main/include/Conversation.hpp
Normal file
72
CPP/elevenlabs-convai-cpp-main/include/Conversation.hpp
Normal file
@ -0,0 +1,72 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "AudioInterface.hpp"
|
||||||
|
#include <boost/beast/core.hpp>
|
||||||
|
#include <boost/beast/websocket.hpp>
|
||||||
|
#include <boost/beast/ssl.hpp>
|
||||||
|
#include <boost/asio.hpp>
|
||||||
|
#include <boost/asio/ssl/stream.hpp>
|
||||||
|
#include <boost/asio/ip/tcp.hpp>
|
||||||
|
#include <nlohmann/json.hpp>
|
||||||
|
|
||||||
|
#include <thread>
|
||||||
|
#include <atomic>
|
||||||
|
#include <functional>
|
||||||
|
|
||||||
|
class Conversation {
|
||||||
|
public:
|
||||||
|
using CallbackAgentResponse = std::function<void(const std::string&)>;
|
||||||
|
using CallbackAgentResponseCorrection = std::function<void(const std::string&, const std::string&)>;
|
||||||
|
using CallbackUserTranscript = std::function<void(const std::string&)>;
|
||||||
|
using CallbackLatencyMeasurement = std::function<void(int)>;
|
||||||
|
|
||||||
|
Conversation(
|
||||||
|
const std::string& agentId,
|
||||||
|
bool requiresAuth,
|
||||||
|
std::shared_ptr<AudioInterface> audioInterface,
|
||||||
|
CallbackAgentResponse callbackAgentResponse = nullptr,
|
||||||
|
CallbackAgentResponseCorrection callbackAgentResponseCorrection = nullptr,
|
||||||
|
CallbackUserTranscript callbackUserTranscript = nullptr,
|
||||||
|
CallbackLatencyMeasurement callbackLatencyMeasurement = nullptr
|
||||||
|
);
|
||||||
|
|
||||||
|
~Conversation();
|
||||||
|
|
||||||
|
void startSession();
|
||||||
|
void endSession();
|
||||||
|
std::string waitForSessionEnd();
|
||||||
|
|
||||||
|
void sendUserMessage(const std::string& text);
|
||||||
|
void registerUserActivity();
|
||||||
|
void sendContextualUpdate(const std::string& content);
|
||||||
|
|
||||||
|
private:
|
||||||
|
void run();
|
||||||
|
void handleMessage(const nlohmann::json& message);
|
||||||
|
std::string getWssUrl() const;
|
||||||
|
|
||||||
|
// networking members
|
||||||
|
boost::asio::io_context ioc_;
|
||||||
|
boost::asio::ssl::context sslCtx_{boost::asio::ssl::context::tlsv12_client};
|
||||||
|
|
||||||
|
using tcp = boost::asio::ip::tcp;
|
||||||
|
using websocket_t = boost::beast::websocket::stream<
|
||||||
|
boost::beast::ssl_stream<tcp::socket>>;
|
||||||
|
std::unique_ptr<websocket_t> ws_;
|
||||||
|
|
||||||
|
// general state
|
||||||
|
std::string agentId_;
|
||||||
|
bool requiresAuth_;
|
||||||
|
std::shared_ptr<AudioInterface> audioInterface_;
|
||||||
|
|
||||||
|
CallbackAgentResponse callbackAgentResponse_;
|
||||||
|
CallbackAgentResponseCorrection callbackAgentResponseCorrection_;
|
||||||
|
CallbackUserTranscript callbackUserTranscript_;
|
||||||
|
CallbackLatencyMeasurement callbackLatencyMeasurement_;
|
||||||
|
|
||||||
|
std::thread workerThread_;
|
||||||
|
std::atomic<bool> shouldStop_{false};
|
||||||
|
std::string conversationId_;
|
||||||
|
|
||||||
|
std::atomic<int> lastInterruptId_{0};
|
||||||
|
};
|
||||||
@ -0,0 +1,45 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "AudioInterface.hpp"
|
||||||
|
#include <portaudio.h>
|
||||||
|
#include <mutex>
|
||||||
|
#include <condition_variable>
|
||||||
|
#include <queue>
|
||||||
|
#include <thread>
|
||||||
|
#include <atomic>
|
||||||
|
|
||||||
|
class DefaultAudioInterface : public AudioInterface {
|
||||||
|
public:
|
||||||
|
static constexpr int INPUT_FRAMES_PER_BUFFER = 4000; // 250ms @ 16kHz
|
||||||
|
static constexpr int OUTPUT_FRAMES_PER_BUFFER = 1000; // 62.5ms @ 16kHz
|
||||||
|
|
||||||
|
DefaultAudioInterface();
|
||||||
|
~DefaultAudioInterface() override;
|
||||||
|
|
||||||
|
void start(AudioCallback inputCallback) override;
|
||||||
|
void stop() override;
|
||||||
|
void output(const std::vector<char>& audio) override;
|
||||||
|
void interrupt() override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
static int inputCallbackStatic(const void* input, void* output, unsigned long frameCount,
|
||||||
|
const PaStreamCallbackTimeInfo* timeInfo, PaStreamCallbackFlags statusFlags,
|
||||||
|
void* userData);
|
||||||
|
|
||||||
|
int inputCallbackInternal(const void* input, unsigned long frameCount);
|
||||||
|
|
||||||
|
void outputThreadFunc();
|
||||||
|
|
||||||
|
PaStream* inputStream_{};
|
||||||
|
PaStream* outputStream_{};
|
||||||
|
|
||||||
|
AudioCallback inputCallback_;
|
||||||
|
|
||||||
|
std::queue<std::vector<char>> outputQueue_;
|
||||||
|
std::mutex queueMutex_;
|
||||||
|
std::condition_variable queueCv_;
|
||||||
|
|
||||||
|
std::thread outputThread_;
|
||||||
|
std::atomic<bool> shouldStop_{false};
|
||||||
|
std::atomic<bool> outputPlaying_{false};
|
||||||
|
};
|
||||||
230
CPP/elevenlabs-convai-cpp-main/src/Conversation.cpp
Normal file
230
CPP/elevenlabs-convai-cpp-main/src/Conversation.cpp
Normal file
@ -0,0 +1,230 @@
|
|||||||
|
#include "Conversation.hpp"
|
||||||
|
|
||||||
|
#include <boost/beast/websocket/ssl.hpp>
|
||||||
|
#include <boost/beast/websocket.hpp>
|
||||||
|
#include <boost/beast/ssl.hpp>
|
||||||
|
#include <boost/beast/core/detail/base64.hpp>
|
||||||
|
#include <boost/asio/connect.hpp>
|
||||||
|
#include <boost/algorithm/string.hpp>
|
||||||
|
#include <iostream>
|
||||||
|
#include <sstream>
|
||||||
|
#include <openssl/ssl.h>
|
||||||
|
|
||||||
|
using tcp = boost::asio::ip::tcp;
|
||||||
|
namespace ssl = boost::asio::ssl;
|
||||||
|
namespace websocket = boost::beast::websocket;
|
||||||
|
namespace beast = boost::beast;
|
||||||
|
|
||||||
|
static std::string base64Encode(const std::vector<char>& data) {
|
||||||
|
auto encodedSize = beast::detail::base64::encoded_size(data.size());
|
||||||
|
std::string out(encodedSize, '\0');
|
||||||
|
beast::detail::base64::encode(&out[0], data.data(), data.size());
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
|
||||||
|
static std::vector<char> base64Decode(const std::string& str) {
|
||||||
|
auto decodedSize = beast::detail::base64::decoded_size(str.size());
|
||||||
|
std::vector<char> out(decodedSize);
|
||||||
|
auto result = beast::detail::base64::decode(out.data(), str.data(), str.size());
|
||||||
|
out.resize(result.first);
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
|
||||||
|
static std::string toString(const nlohmann::json& j){
|
||||||
|
if(j.is_string()) return j.get<std::string>();
|
||||||
|
if(j.is_number_integer()) return std::to_string(j.get<int64_t>());
|
||||||
|
return j.dump();
|
||||||
|
}
|
||||||
|
|
||||||
|
Conversation::Conversation(const std::string& agentId, bool requiresAuth,
|
||||||
|
std::shared_ptr<AudioInterface> audioInterface,
|
||||||
|
CallbackAgentResponse callbackAgentResponse,
|
||||||
|
CallbackAgentResponseCorrection callbackAgentResponseCorrection,
|
||||||
|
CallbackUserTranscript callbackUserTranscript,
|
||||||
|
CallbackLatencyMeasurement callbackLatencyMeasurement)
|
||||||
|
: agentId_(agentId),
|
||||||
|
requiresAuth_(requiresAuth),
|
||||||
|
audioInterface_(std::move(audioInterface)),
|
||||||
|
callbackAgentResponse_(std::move(callbackAgentResponse)),
|
||||||
|
callbackAgentResponseCorrection_(std::move(callbackAgentResponseCorrection)),
|
||||||
|
callbackUserTranscript_(std::move(callbackUserTranscript)),
|
||||||
|
callbackLatencyMeasurement_(std::move(callbackLatencyMeasurement)) {
|
||||||
|
|
||||||
|
sslCtx_.set_default_verify_paths();
|
||||||
|
}
|
||||||
|
|
||||||
|
Conversation::~Conversation() {
|
||||||
|
endSession();
|
||||||
|
}
|
||||||
|
|
||||||
|
void Conversation::startSession() {
|
||||||
|
shouldStop_.store(false);
|
||||||
|
workerThread_ = std::thread(&Conversation::run, this);
|
||||||
|
}
|
||||||
|
|
||||||
|
void Conversation::endSession() {
|
||||||
|
shouldStop_.store(true);
|
||||||
|
if (ws_) {
|
||||||
|
beast::error_code ec;
|
||||||
|
ws_->close(websocket::close_code::normal, ec);
|
||||||
|
}
|
||||||
|
if (audioInterface_) {
|
||||||
|
audioInterface_->stop();
|
||||||
|
}
|
||||||
|
if (workerThread_.joinable()) {
|
||||||
|
workerThread_.join();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string Conversation::waitForSessionEnd() {
|
||||||
|
if (workerThread_.joinable()) {
|
||||||
|
workerThread_.join();
|
||||||
|
}
|
||||||
|
return conversationId_;
|
||||||
|
}
|
||||||
|
|
||||||
|
void Conversation::sendUserMessage(const std::string& text) {
|
||||||
|
if (!ws_) {
|
||||||
|
throw std::runtime_error("Session not started");
|
||||||
|
}
|
||||||
|
nlohmann::json j = {
|
||||||
|
{"type", "user_message"},
|
||||||
|
{"text", text}
|
||||||
|
};
|
||||||
|
ws_->write(boost::asio::buffer(j.dump()));
|
||||||
|
}
|
||||||
|
|
||||||
|
void Conversation::registerUserActivity() {
|
||||||
|
if (!ws_) throw std::runtime_error("Session not started");
|
||||||
|
nlohmann::json j = {{"type", "user_activity"}};
|
||||||
|
ws_->write(boost::asio::buffer(j.dump()));
|
||||||
|
}
|
||||||
|
|
||||||
|
void Conversation::sendContextualUpdate(const std::string& content) {
|
||||||
|
if (!ws_) throw std::runtime_error("Session not started");
|
||||||
|
nlohmann::json j = {{"type", "contextual_update"}, {"content", content}};
|
||||||
|
ws_->write(boost::asio::buffer(j.dump()));
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string Conversation::getWssUrl() const {
|
||||||
|
// Hard-coded base env for demo; in production you'd call ElevenLabs env endpoint.
|
||||||
|
std::ostringstream oss;
|
||||||
|
oss << "wss://api.elevenlabs.io/v1/convai/conversation?agent_id=" << agentId_;
|
||||||
|
return oss.str();
|
||||||
|
}
|
||||||
|
|
||||||
|
void Conversation::run() {
|
||||||
|
try {
|
||||||
|
auto url = getWssUrl();
|
||||||
|
std::string protocol, host, target;
|
||||||
|
unsigned short port = 443;
|
||||||
|
|
||||||
|
// Very naive parse: wss://host[:port]/path?query
|
||||||
|
if (boost::starts_with(url, "wss://")) {
|
||||||
|
protocol = "wss";
|
||||||
|
host = url.substr(6);
|
||||||
|
} else {
|
||||||
|
throw std::runtime_error("Only wss:// URLs supported in this demo");
|
||||||
|
}
|
||||||
|
auto slashPos = host.find('/');
|
||||||
|
if (slashPos == std::string::npos) {
|
||||||
|
target = "/";
|
||||||
|
} else {
|
||||||
|
target = host.substr(slashPos);
|
||||||
|
host = host.substr(0, slashPos);
|
||||||
|
}
|
||||||
|
auto colonPos = host.find(':');
|
||||||
|
if (colonPos != std::string::npos) {
|
||||||
|
port = static_cast<unsigned short>(std::stoi(host.substr(colonPos + 1)));
|
||||||
|
host = host.substr(0, colonPos);
|
||||||
|
}
|
||||||
|
|
||||||
|
tcp::resolver resolver(ioc_);
|
||||||
|
auto const results = resolver.resolve(host, std::to_string(port));
|
||||||
|
|
||||||
|
beast::ssl_stream<tcp::socket> stream(ioc_, sslCtx_);
|
||||||
|
boost::asio::connect(beast::get_lowest_layer(stream), results);
|
||||||
|
if (!SSL_set_tlsext_host_name(stream.native_handle(), host.c_str())) {
|
||||||
|
throw std::runtime_error("Failed to set SNI hostname on SSL stream");
|
||||||
|
}
|
||||||
|
stream.handshake(ssl::stream_base::client);
|
||||||
|
|
||||||
|
ws_ = std::make_unique<websocket_t>(std::move(stream));
|
||||||
|
ws_->set_option(websocket::stream_base::timeout::suggested(beast::role_type::client));
|
||||||
|
ws_->handshake(host, target);
|
||||||
|
|
||||||
|
// send initiation data
|
||||||
|
nlohmann::json init = {
|
||||||
|
{"type", "conversation_initiation_client_data"},
|
||||||
|
{"custom_llm_extra_body", nlohmann::json::object()},
|
||||||
|
{"conversation_config_override", nlohmann::json::object()},
|
||||||
|
{"dynamic_variables", nlohmann::json::object()}
|
||||||
|
};
|
||||||
|
ws_->write(boost::asio::buffer(init.dump()));
|
||||||
|
|
||||||
|
// Prepare audio callback
|
||||||
|
auto inputCb = [this](const std::vector<char>& audio) {
|
||||||
|
nlohmann::json msg = {
|
||||||
|
{"user_audio_chunk", base64Encode(audio)}
|
||||||
|
};
|
||||||
|
ws_->write(boost::asio::buffer(msg.dump()));
|
||||||
|
};
|
||||||
|
audioInterface_->start(inputCb);
|
||||||
|
|
||||||
|
beast::flat_buffer buffer;
|
||||||
|
while (!shouldStop_.load()) {
|
||||||
|
beast::error_code ec;
|
||||||
|
ws_->read(buffer, ec);
|
||||||
|
if (ec) {
|
||||||
|
std::cerr << "Websocket read error: " << ec.message() << std::endl;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
auto text = beast::buffers_to_string(buffer.data());
|
||||||
|
buffer.consume(buffer.size());
|
||||||
|
try {
|
||||||
|
auto message = nlohmann::json::parse(text);
|
||||||
|
handleMessage(message);
|
||||||
|
} catch (const std::exception& ex) {
|
||||||
|
std::cerr << "JSON parse error: " << ex.what() << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (const std::exception& ex) {
|
||||||
|
std::cerr << "Conversation error: " << ex.what() << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void Conversation::handleMessage(const nlohmann::json& message) {
|
||||||
|
std::string type = message.value("type", "");
|
||||||
|
if (type == "conversation_initiation_metadata") {
|
||||||
|
conversationId_ = message["conversation_initiation_metadata_event"]["conversation_id"].get<std::string>();
|
||||||
|
} else if (type == "audio") {
|
||||||
|
auto event = message["audio_event"];
|
||||||
|
int eventId = std::stoi(toString(event["event_id"]));
|
||||||
|
if (eventId <= lastInterruptId_.load()) return;
|
||||||
|
auto audioBytes = base64Decode(event["audio_base_64"].get<std::string>());
|
||||||
|
audioInterface_->output(audioBytes);
|
||||||
|
} else if (type == "agent_response" && callbackAgentResponse_) {
|
||||||
|
auto event = message["agent_response_event"];
|
||||||
|
callbackAgentResponse_(event["agent_response"].get<std::string>());
|
||||||
|
} else if (type == "agent_response_correction" && callbackAgentResponseCorrection_) {
|
||||||
|
auto event = message["agent_response_correction_event"];
|
||||||
|
callbackAgentResponseCorrection_(event["original_agent_response"].get<std::string>(),
|
||||||
|
event["corrected_agent_response"].get<std::string>());
|
||||||
|
} else if (type == "user_transcript" && callbackUserTranscript_) {
|
||||||
|
auto event = message["user_transcription_event"];
|
||||||
|
callbackUserTranscript_(event["user_transcript"].get<std::string>());
|
||||||
|
} else if (type == "interruption") {
|
||||||
|
auto event = message["interruption_event"];
|
||||||
|
lastInterruptId_.store(std::stoi(toString(event["event_id"])));
|
||||||
|
audioInterface_->interrupt();
|
||||||
|
} else if (type == "ping") {
|
||||||
|
auto event = message["ping_event"];
|
||||||
|
nlohmann::json pong = {{"type", "pong"}, {"event_id", event["event_id"]}};
|
||||||
|
ws_->write(boost::asio::buffer(pong.dump()));
|
||||||
|
if (callbackLatencyMeasurement_ && event.contains("ping_ms")) {
|
||||||
|
int latency = event["ping_ms"].is_number() ? event["ping_ms"].get<int>() : std::stoi(event["ping_ms"].get<std::string>());
|
||||||
|
callbackLatencyMeasurement_(latency);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Note: client tool call handling omitted for brevity.
|
||||||
|
}
|
||||||
131
CPP/elevenlabs-convai-cpp-main/src/DefaultAudioInterface.cpp
Normal file
131
CPP/elevenlabs-convai-cpp-main/src/DefaultAudioInterface.cpp
Normal file
@ -0,0 +1,131 @@
|
|||||||
|
#include "DefaultAudioInterface.hpp"
|
||||||
|
|
||||||
|
#include <cstring>
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
DefaultAudioInterface::DefaultAudioInterface() {
|
||||||
|
PaError err = Pa_Initialize();
|
||||||
|
if (err != paNoError) {
|
||||||
|
throw std::runtime_error("PortAudio initialization failed");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
DefaultAudioInterface::~DefaultAudioInterface() {
|
||||||
|
if (!shouldStop_.load()) {
|
||||||
|
stop();
|
||||||
|
}
|
||||||
|
Pa_Terminate();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Opens and starts one capture and one playback PortAudio stream
// (16 kHz, mono, 16-bit PCM) and launches the playback thread.
//
// inputCallback: invoked from the PortAudio capture callback with each chunk
//                of recorded audio (raw int16 little-endian bytes).
// Throws std::runtime_error if no default device exists or any stream fails
// to open/start. Fix vs. original: on a partial failure, every stream opened
// so far is closed before throwing, so a failed start() no longer leaks
// PortAudio stream handles; also guards against Pa_GetDeviceInfo(paNoDevice)
// returning nullptr (which the original dereferenced unconditionally).
void DefaultAudioInterface::start(AudioCallback inputCallback) {
    inputCallback_ = std::move(inputCallback);

    // --- Capture side ---
    PaStreamParameters inputParams;
    std::memset(&inputParams, 0, sizeof(inputParams));
    inputParams.channelCount = 1;
    inputParams.device = Pa_GetDefaultInputDevice();
    inputParams.sampleFormat = paInt16;
    if (inputParams.device == paNoDevice) {
        throw std::runtime_error("No default input device available");
    }
    inputParams.suggestedLatency = Pa_GetDeviceInfo(inputParams.device)->defaultLowInputLatency;
    inputParams.hostApiSpecificStreamInfo = nullptr;

    // --- Playback side (same format as capture) ---
    PaStreamParameters outputParams;
    std::memset(&outputParams, 0, sizeof(outputParams));
    outputParams.channelCount = 1;
    outputParams.device = Pa_GetDefaultOutputDevice();
    outputParams.sampleFormat = paInt16;
    if (outputParams.device == paNoDevice) {
        throw std::runtime_error("No default output device available");
    }
    outputParams.suggestedLatency = Pa_GetDeviceInfo(outputParams.device)->defaultLowOutputLatency;
    outputParams.hostApiSpecificStreamInfo = nullptr;

    PaError err = Pa_OpenStream(&inputStream_, &inputParams, nullptr, 16000, INPUT_FRAMES_PER_BUFFER, paClipOff,
                                &DefaultAudioInterface::inputCallbackStatic, this);
    if (err != paNoError) {
        throw std::runtime_error("Failed to open input stream");
    }

    err = Pa_OpenStream(&outputStream_, nullptr, &outputParams, 16000, OUTPUT_FRAMES_PER_BUFFER, paClipOff, nullptr, nullptr);
    if (err != paNoError) {
        // Release the already-opened input stream so we do not leak it.
        Pa_CloseStream(inputStream_);
        inputStream_ = nullptr;
        throw std::runtime_error("Failed to open output stream");
    }

    if ((err = Pa_StartStream(inputStream_)) != paNoError) {
        Pa_CloseStream(inputStream_);
        inputStream_ = nullptr;
        Pa_CloseStream(outputStream_);
        outputStream_ = nullptr;
        throw std::runtime_error("Failed to start input stream");
    }
    if ((err = Pa_StartStream(outputStream_)) != paNoError) {
        // Input stream is already running; stop it before closing.
        Pa_StopStream(inputStream_);
        Pa_CloseStream(inputStream_);
        inputStream_ = nullptr;
        Pa_CloseStream(outputStream_);
        outputStream_ = nullptr;
        throw std::runtime_error("Failed to start output stream");
    }

    // Clear the stop flag before spawning the thread so it enters its loop.
    shouldStop_.store(false);
    outputThread_ = std::thread(&DefaultAudioInterface::outputThreadFunc, this);
}
|
||||||
|
|
||||||
|
void DefaultAudioInterface::stop() {
|
||||||
|
shouldStop_.store(true);
|
||||||
|
queueCv_.notify_all();
|
||||||
|
if (outputThread_.joinable()) {
|
||||||
|
outputThread_.join();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (inputStream_) {
|
||||||
|
Pa_StopStream(inputStream_);
|
||||||
|
Pa_CloseStream(inputStream_);
|
||||||
|
inputStream_ = nullptr;
|
||||||
|
}
|
||||||
|
if (outputStream_) {
|
||||||
|
Pa_StopStream(outputStream_);
|
||||||
|
Pa_CloseStream(outputStream_);
|
||||||
|
outputStream_ = nullptr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void DefaultAudioInterface::output(const std::vector<char>& audio) {
|
||||||
|
{
|
||||||
|
std::lock_guard<std::mutex> lg(queueMutex_);
|
||||||
|
outputQueue_.emplace(audio);
|
||||||
|
}
|
||||||
|
queueCv_.notify_one();
|
||||||
|
}
|
||||||
|
|
||||||
|
void DefaultAudioInterface::interrupt() {
|
||||||
|
std::lock_guard<std::mutex> lg(queueMutex_);
|
||||||
|
std::queue<std::vector<char>> empty;
|
||||||
|
std::swap(outputQueue_, empty);
|
||||||
|
}
|
||||||
|
|
||||||
|
int DefaultAudioInterface::inputCallbackStatic(const void* input, void* /*output*/, unsigned long frameCount,
|
||||||
|
const PaStreamCallbackTimeInfo* /*timeInfo*/, PaStreamCallbackFlags /*statusFlags*/,
|
||||||
|
void* userData) {
|
||||||
|
auto* self = static_cast<DefaultAudioInterface*>(userData);
|
||||||
|
return self->inputCallbackInternal(input, frameCount);
|
||||||
|
}
|
||||||
|
|
||||||
|
int DefaultAudioInterface::inputCallbackInternal(const void* input, unsigned long frameCount) {
|
||||||
|
if (!input || !inputCallback_) {
|
||||||
|
return paContinue;
|
||||||
|
}
|
||||||
|
if (outputPlaying_.load()) {
|
||||||
|
// Suppress microphone input while playing output to avoid echo feedback.
|
||||||
|
return paContinue;
|
||||||
|
}
|
||||||
|
const size_t bytes = frameCount * sizeof(int16_t);
|
||||||
|
std::vector<char> buffer(bytes);
|
||||||
|
std::memcpy(buffer.data(), input, bytes);
|
||||||
|
inputCallback_(buffer);
|
||||||
|
return paContinue;
|
||||||
|
}
|
||||||
|
|
||||||
|
void DefaultAudioInterface::outputThreadFunc() {
|
||||||
|
while (!shouldStop_.load()) {
|
||||||
|
std::vector<char> audio;
|
||||||
|
{
|
||||||
|
std::unique_lock<std::mutex> lk(queueMutex_);
|
||||||
|
queueCv_.wait(lk, [this] { return shouldStop_.load() || !outputQueue_.empty(); });
|
||||||
|
if (shouldStop_.load()) break;
|
||||||
|
audio = std::move(outputQueue_.front());
|
||||||
|
outputQueue_.pop();
|
||||||
|
}
|
||||||
|
if (!audio.empty() && outputStream_) {
|
||||||
|
outputPlaying_.store(true);
|
||||||
|
Pa_WriteStream(outputStream_, audio.data(), audio.size() / sizeof(int16_t));
|
||||||
|
outputPlaying_.store(false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
31
CPP/elevenlabs-convai-cpp-main/src/main.cpp
Normal file
31
CPP/elevenlabs-convai-cpp-main/src/main.cpp
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
#include "Conversation.hpp"
|
||||||
|
#include "DefaultAudioInterface.hpp"
|
||||||
|
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <iostream>
|
||||||
|
#include <memory>
|
||||||
|
|
||||||
|
int main() {
|
||||||
|
const char* agentIdEnv = std::getenv("AGENT_ID");
|
||||||
|
if (!agentIdEnv) {
|
||||||
|
std::cerr << "AGENT_ID environment variable must be set" << std::endl;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
std::string agentId(agentIdEnv);
|
||||||
|
|
||||||
|
auto audioInterface = std::make_shared<DefaultAudioInterface>();
|
||||||
|
Conversation conv(agentId, /*requiresAuth*/ false, audioInterface,
|
||||||
|
[](const std::string& resp) { std::cout << "Agent: " << resp << std::endl; },
|
||||||
|
[](const std::string& orig, const std::string& corrected) {
|
||||||
|
std::cout << "Agent correction: " << orig << " -> " << corrected << std::endl; },
|
||||||
|
[](const std::string& transcript) { std::cout << "User: " << transcript << std::endl; });
|
||||||
|
|
||||||
|
conv.startSession();
|
||||||
|
|
||||||
|
std::cout << "Press Enter to quit..." << std::endl;
|
||||||
|
std::cin.get();
|
||||||
|
conv.endSession();
|
||||||
|
auto convId = conv.waitForSessionEnd();
|
||||||
|
std::cout << "Conversation ID: " << convId << std::endl;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
Binary file not shown.
@ -604,9 +604,15 @@ void UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured(const TAr
|
|||||||
{
|
{
|
||||||
if (!IsConnected() || !bIsListening) return;
|
if (!IsConnected() || !bIsListening) return;
|
||||||
|
|
||||||
|
// Echo suppression: skip sending mic audio while the agent is speaking.
|
||||||
|
// This prevents the agent from hearing its own voice through the speakers,
|
||||||
|
// which would confuse the server's VAD and STT. Matches the approach used
|
||||||
|
// in the official ElevenLabs C++ SDK (outputPlaying_ flag).
|
||||||
|
if (bAgentSpeaking) return;
|
||||||
|
|
||||||
// Convert this callback's samples to int16 bytes and accumulate.
|
// Convert this callback's samples to int16 bytes and accumulate.
|
||||||
// WASAPI fires every ~5ms (158 bytes at 16kHz). ElevenLabs needs ≥100ms
|
// WASAPI fires every ~5ms (158 bytes at 16kHz). ElevenLabs needs ≥250ms
|
||||||
// (3200 bytes) per chunk for reliable VAD and STT. We hold bytes here
|
// (8000 bytes) per chunk for reliable VAD and STT. We hold bytes here
|
||||||
// until we have enough, then send the whole batch in one WebSocket frame.
|
// until we have enough, then send the whole batch in one WebSocket frame.
|
||||||
TArray<uint8> PCMBytes = FloatPCMToInt16Bytes(FloatPCM);
|
TArray<uint8> PCMBytes = FloatPCMToInt16Bytes(FloatPCM);
|
||||||
MicAccumulationBuffer.Append(PCMBytes);
|
MicAccumulationBuffer.Append(PCMBytes);
|
||||||
|
|||||||
@ -491,6 +491,17 @@ void UElevenLabsWebSocketProxy::HandleAudioResponse(const TSharedPtr<FJsonObject
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Discard audio belonging to an interrupted generation (event_id approach).
|
||||||
|
// Matches the official ElevenLabs C++ and Python SDKs: only AUDIO is filtered
|
||||||
|
// by event_id — transcripts, agent_response, etc. are always processed.
|
||||||
|
int32 EventId = 0;
|
||||||
|
(*AudioEvent)->TryGetNumberField(TEXT("event_id"), EventId);
|
||||||
|
if (EventId > 0 && EventId <= LastInterruptEventId)
|
||||||
|
{
|
||||||
|
UE_LOG(LogElevenLabsWS, Verbose, TEXT("Discarding audio event_id=%d (interrupted at %d)."), EventId, LastInterruptEventId);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
FString Base64Audio;
|
FString Base64Audio;
|
||||||
if (!(*AudioEvent)->TryGetStringField(TEXT("audio_base_64"), Base64Audio))
|
if (!(*AudioEvent)->TryGetStringField(TEXT("audio_base_64"), Base64Audio))
|
||||||
{
|
{
|
||||||
@ -591,7 +602,20 @@ void UElevenLabsWebSocketProxy::HandleAgentChatResponsePart(const TSharedPtr<FJs
|
|||||||
|
|
||||||
// Handles the server's "interruption" acknowledgement.
// Records the interrupt watermark (LastInterruptEventId) used elsewhere to
// discard audio frames belonging to the cancelled generation, then notifies
// Blueprint/game listeners via OnInterrupted.
void UElevenLabsWebSocketProxy::HandleInterruption(const TSharedPtr<FJsonObject>& Root)
{
    // Extract the interrupt event_id so we can filter stale audio frames.
    // Expected payload shape:
    // { "type": "interruption", "interruption_event": { "event_id": 42 } }
    const TSharedPtr<FJsonObject>* InterruptEvent = nullptr;
    if (Root->TryGetObjectField(TEXT("interruption_event"), InterruptEvent) && InterruptEvent)
    {
        int32 EventId = 0;
        // If event_id is absent, EventId stays 0 and the watermark is untouched.
        (*InterruptEvent)->TryGetNumberField(TEXT("event_id"), EventId);
        // Only move the watermark forward; a late/out-of-order ack must not
        // regress it and resurrect already-discarded audio.
        if (EventId > LastInterruptEventId)
        {
            LastInterruptEventId = EventId;
        }
    }

    UE_LOG(LogElevenLabsWS, Log, TEXT("Agent interrupted (server ack, LastInterruptEventId=%d)."), LastInterruptEventId);
    OnInterrupted.Broadcast();
}
|
||||||
|
|
||||||
|
|||||||
@ -400,5 +400,5 @@ private:
|
|||||||
// ElevenLabs needs at least ~100ms (3200 bytes) per chunk for reliable VAD/STT.
|
// ElevenLabs needs at least ~100ms (3200 bytes) per chunk for reliable VAD/STT.
|
||||||
// We accumulate here and only call SendAudioChunk once enough bytes are ready.
|
// We accumulate here and only call SendAudioChunk once enough bytes are ready.
|
||||||
TArray<uint8> MicAccumulationBuffer;
|
TArray<uint8> MicAccumulationBuffer;
|
||||||
static constexpr int32 MicChunkMinBytes = 3200; // 100ms @ 16kHz 16-bit mono
|
static constexpr int32 MicChunkMinBytes = 8000; // 250ms @ 16kHz 16-bit mono (4000 samples, matches ElevenLabs SDK recommendation)
|
||||||
};
|
};
|
||||||
|
|||||||
@ -226,6 +226,13 @@ private:
|
|||||||
// Used to compute [T+Xs] session-relative timestamps in all log messages.
|
// Used to compute [T+Xs] session-relative timestamps in all log messages.
|
||||||
double SessionStartTime = 0.0;
|
double SessionStartTime = 0.0;
|
||||||
|
|
||||||
|
// ── Interrupt filtering (event_id approach, matching official SDK) ────────
|
||||||
|
// When the server sends an "interruption" event it includes an event_id.
|
||||||
|
// Audio events whose event_id <= LastInterruptEventId belong to the cancelled
|
||||||
|
// generation and must be discarded. Only AUDIO is filtered — transcripts,
|
||||||
|
// agent_response, agent_chat_response_part etc. are always processed.
|
||||||
|
int32 LastInterruptEventId = 0;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
// Set by UElevenLabsConversationalAgentComponent before calling Connect().
|
// Set by UElevenLabsConversationalAgentComponent before calling Connect().
|
||||||
// Controls turn_timeout in conversation_initiation_client_data.
|
// Controls turn_timeout in conversation_initiation_client_data.
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user