Ajout sample CPP
This commit is contained in:
parent
d8957625f8
commit
f23acc8c1c
152
CPP/elevenlabs-convai-cpp-main/.gitignore
vendored
Normal file
152
CPP/elevenlabs-convai-cpp-main/.gitignore
vendored
Normal file
@ -0,0 +1,152 @@
|
|||||||
|
# Build directories
|
||||||
|
build/
|
||||||
|
cmake-build-*/
|
||||||
|
out/
|
||||||
|
|
||||||
|
# Compiled Object files
|
||||||
|
*.slo
|
||||||
|
*.lo
|
||||||
|
*.o
|
||||||
|
*.obj
|
||||||
|
|
||||||
|
# Precompiled Headers
|
||||||
|
*.gch
|
||||||
|
*.pch
|
||||||
|
|
||||||
|
# Compiled Dynamic libraries
|
||||||
|
*.so
|
||||||
|
*.dylib
|
||||||
|
*.dll
|
||||||
|
|
||||||
|
# Fortran module files
|
||||||
|
*.mod
|
||||||
|
*.smod
|
||||||
|
|
||||||
|
# Compiled Static libraries
|
||||||
|
*.lai
|
||||||
|
*.la
|
||||||
|
*.a
|
||||||
|
*.lib
|
||||||
|
|
||||||
|
# Executables
|
||||||
|
*.exe
|
||||||
|
*.out
|
||||||
|
*.app
|
||||||
|
convai_cpp
|
||||||
|
|
||||||
|
# CMake
|
||||||
|
CMakeCache.txt
|
||||||
|
CMakeFiles/
|
||||||
|
CMakeScripts/
|
||||||
|
Testing/
|
||||||
|
Makefile
|
||||||
|
cmake_install.cmake
|
||||||
|
install_manifest.txt
|
||||||
|
compile_commands.json
|
||||||
|
CTestTestfile.cmake
|
||||||
|
_deps/
|
||||||
|
|
||||||
|
# IDE files
|
||||||
|
.vscode/
|
||||||
|
.idea/
|
||||||
|
*.swp
|
||||||
|
*.swo
|
||||||
|
*~
|
||||||
|
|
||||||
|
# macOS
|
||||||
|
.DS_Store
|
||||||
|
.AppleDouble
|
||||||
|
.LSOverride
|
||||||
|
|
||||||
|
# Thumbnails
|
||||||
|
._*
|
||||||
|
|
||||||
|
# Files that might appear in the root of a volume
|
||||||
|
.DocumentRevisions-V100
|
||||||
|
.fseventsd
|
||||||
|
.Spotlight-V100
|
||||||
|
.TemporaryItems
|
||||||
|
.Trashes
|
||||||
|
.VolumeIcon.icns
|
||||||
|
.com.apple.timemachine.donotpresent
|
||||||
|
|
||||||
|
# Directories potentially created on remote AFP share
|
||||||
|
.AppleDB
|
||||||
|
.AppleDesktop
|
||||||
|
Network Trash Folder
|
||||||
|
Temporary Items
|
||||||
|
.apdisk
|
||||||
|
|
||||||
|
# Windows
|
||||||
|
Thumbs.db
|
||||||
|
ehthumbs.db
|
||||||
|
Desktop.ini
|
||||||
|
$RECYCLE.BIN/
|
||||||
|
*.cab
|
||||||
|
*.msi
|
||||||
|
*.msm
|
||||||
|
*.msp
|
||||||
|
*.lnk
|
||||||
|
|
||||||
|
# Linux
|
||||||
|
*~
|
||||||
|
.fuse_hidden*
|
||||||
|
.directory
|
||||||
|
.Trash-*
|
||||||
|
.nfs*
|
||||||
|
|
||||||
|
# Logs
|
||||||
|
*.log
|
||||||
|
|
||||||
|
# Runtime data
|
||||||
|
pids
|
||||||
|
*.pid
|
||||||
|
*.seed
|
||||||
|
*.pid.lock
|
||||||
|
|
||||||
|
# Coverage directory used by tools like istanbul
|
||||||
|
coverage/
|
||||||
|
|
||||||
|
# nyc test coverage
|
||||||
|
.nyc_output
|
||||||
|
|
||||||
|
# Dependency directories
|
||||||
|
node_modules/
|
||||||
|
|
||||||
|
# Optional npm cache directory
|
||||||
|
.npm
|
||||||
|
|
||||||
|
# Optional REPL history
|
||||||
|
.node_repl_history
|
||||||
|
|
||||||
|
# Output of 'npm pack'
|
||||||
|
*.tgz
|
||||||
|
|
||||||
|
# Yarn Integrity file
|
||||||
|
.yarn-integrity
|
||||||
|
|
||||||
|
# dotenv environment variables file
|
||||||
|
.env
|
||||||
|
.env.test
|
||||||
|
|
||||||
|
# parcel-bundler cache (https://parceljs.org/)
|
||||||
|
.cache
|
||||||
|
.parcel-cache
|
||||||
|
|
||||||
|
# next.js build output
|
||||||
|
.next
|
||||||
|
|
||||||
|
# nuxt.js build output
|
||||||
|
.nuxt
|
||||||
|
|
||||||
|
# vuepress build output
|
||||||
|
.vuepress/dist
|
||||||
|
|
||||||
|
# Serverless directories
|
||||||
|
.serverless
|
||||||
|
|
||||||
|
# FuseBox cache
|
||||||
|
.fusebox/
|
||||||
|
|
||||||
|
# DynamoDB Local files
|
||||||
|
.dynamodb/
|
||||||
42
CPP/elevenlabs-convai-cpp-main/CMakeLists.txt
Normal file
42
CPP/elevenlabs-convai-cpp-main/CMakeLists.txt
Normal file
@ -0,0 +1,42 @@
|
|||||||
|
cmake_minimum_required(VERSION 3.14)
|
||||||
|
|
||||||
|
project(elevenlabs_convai_cpp LANGUAGES CXX)
|
||||||
|
|
||||||
|
set(CMAKE_CXX_STANDARD 17)
|
||||||
|
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||||
|
|
||||||
|
# Find dependencies
|
||||||
|
find_package(Boost REQUIRED COMPONENTS system thread)
|
||||||
|
find_package(OpenSSL REQUIRED)
|
||||||
|
# PortAudio via vcpkg CMake config
|
||||||
|
find_package(portaudio CONFIG REQUIRED)
|
||||||
|
|
||||||
|
# Find nlohmann_json
|
||||||
|
find_package(nlohmann_json 3.11 QUIET)
|
||||||
|
|
||||||
|
if(NOT nlohmann_json_FOUND)
|
||||||
|
include(FetchContent)
|
||||||
|
# Fallback: header-only fetch to avoid old CMake policies in upstream CMakeLists
|
||||||
|
FetchContent_Declare(
|
||||||
|
nlohmann_json_src
|
||||||
|
URL https://raw.githubusercontent.com/nlohmann/json/v3.11.2/single_include/nlohmann/json.hpp
|
||||||
|
)
|
||||||
|
FetchContent_MakeAvailable(nlohmann_json_src)
|
||||||
|
add_library(nlohmann_json::nlohmann_json INTERFACE IMPORTED)
|
||||||
|
target_include_directories(nlohmann_json::nlohmann_json INTERFACE ${nlohmann_json_src_SOURCE_DIR}/single_include)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
add_executable(convai_cpp
|
||||||
|
src/main.cpp
|
||||||
|
src/Conversation.cpp
|
||||||
|
src/DefaultAudioInterface.cpp
|
||||||
|
)
|
||||||
|
|
||||||
|
target_include_directories(convai_cpp PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
|
||||||
|
|
||||||
|
# MSVC: set Windows target version and suppress getenv deprecation warning
|
||||||
|
if(MSVC)
|
||||||
|
target_compile_definitions(convai_cpp PRIVATE _WIN32_WINNT=0x0A00 _CRT_SECURE_NO_WARNINGS)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
target_link_libraries(convai_cpp PRIVATE Boost::system Boost::thread OpenSSL::SSL OpenSSL::Crypto portaudio nlohmann_json::nlohmann_json)
|
||||||
21
CPP/elevenlabs-convai-cpp-main/LICENSE
Normal file
21
CPP/elevenlabs-convai-cpp-main/LICENSE
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2024 Jitendra
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
||||||
197
CPP/elevenlabs-convai-cpp-main/README.md
Normal file
197
CPP/elevenlabs-convai-cpp-main/README.md
Normal file
@ -0,0 +1,197 @@
|
|||||||
|
# ElevenLabs Conversational AI - C++ Implementation
|
||||||
|
|
||||||
|
[](https://opensource.org/licenses/MIT)
|
||||||
|
[](https://en.wikipedia.org/wiki/C%2B%2B17)
|
||||||
|
[](https://cmake.org/)
|
||||||
|
|
||||||
|
C++ implementation of ElevenLabs Conversational AI client
|
||||||
|
|
||||||
|
## Features
|
||||||
|
|
||||||
|
- **Real-time Audio Processing**: Full-duplex audio streaming with low-latency playback
|
||||||
|
- **WebSocket Integration**: Secure WSS connection to ElevenLabs Conversational AI platform
|
||||||
|
- **Cross-platform Audio**: PortAudio-based implementation supporting Windows, macOS, and Linux
|
||||||
|
- **Echo Suppression**: Built-in acoustic feedback prevention
|
||||||
|
- **Modern C++**: Clean, maintainable C++17 codebase with proper RAII and exception handling
|
||||||
|
- **Flexible Architecture**: Modular design allowing easy customization and extension
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
graph TB
|
||||||
|
subgraph "User Interface"
|
||||||
|
A[main.cpp] --> B[Conversation]
|
||||||
|
end
|
||||||
|
|
||||||
|
subgraph "Core Components"
|
||||||
|
B --> C[DefaultAudioInterface]
|
||||||
|
B --> D[WebSocket Client]
|
||||||
|
C --> E[PortAudio]
|
||||||
|
D --> F[Boost.Beast + OpenSSL]
|
||||||
|
end
|
||||||
|
|
||||||
|
subgraph "ElevenLabs Platform"
|
||||||
|
F --> G[WSS API Endpoint]
|
||||||
|
G --> H[Conversational AI Agent]
|
||||||
|
end
|
||||||
|
|
||||||
|
subgraph "Audio Flow"
|
||||||
|
I[Microphone] --> C
|
||||||
|
C --> J[Base64 Encoding]
|
||||||
|
J --> D
|
||||||
|
D --> K[Audio Events]
|
||||||
|
K --> L[Base64 Decoding]
|
||||||
|
L --> C
|
||||||
|
C --> M[Speakers]
|
||||||
|
end
|
||||||
|
|
||||||
|
subgraph "Message Types"
|
||||||
|
N[user_audio_chunk]
|
||||||
|
O[agent_response]
|
||||||
|
P[user_transcript]
|
||||||
|
Q[audio_event]
|
||||||
|
R[ping/pong]
|
||||||
|
end
|
||||||
|
|
||||||
|
style B fill:#e1f5fe
|
||||||
|
style C fill:#f3e5f5
|
||||||
|
style D fill:#e8f5e8
|
||||||
|
style H fill:#fff3e0
|
||||||
|
```
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
### Prerequisites
|
||||||
|
|
||||||
|
- **C++17 compatible compiler**: GCC 11+, Clang 14+, or MSVC 2022+
|
||||||
|
- **CMake** 3.14 or higher
|
||||||
|
- **Dependencies** (install via package manager):
|
||||||
|
|
||||||
|
#### macOS (Homebrew)
|
||||||
|
```bash
|
||||||
|
brew install boost openssl portaudio nlohmann-json cmake pkg-config
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Ubuntu/Debian
|
||||||
|
```bash
|
||||||
|
sudo apt update
|
||||||
|
sudo apt install build-essential cmake pkg-config
|
||||||
|
sudo apt install libboost-system-dev libboost-thread-dev
|
||||||
|
sudo apt install libssl-dev libportaudio2-dev nlohmann-json3-dev
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Windows (vcpkg)
|
||||||
|
```bash
|
||||||
|
vcpkg install boost-system boost-thread openssl portaudio nlohmann-json
|
||||||
|
```
|
||||||
|
|
||||||
|
### Building
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Clone the repository
|
||||||
|
git clone https://github.com/Jitendra2603/elevenlabs-convai-cpp.git
|
||||||
|
cd elevenlabs-convai-cpp
|
||||||
|
|
||||||
|
# Build the project
|
||||||
|
mkdir build && cd build
|
||||||
|
cmake ..
|
||||||
|
cmake --build . --config Release
|
||||||
|
```
|
||||||
|
|
||||||
|
### Running
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Set your agent ID (get this from ElevenLabs dashboard)
|
||||||
|
export AGENT_ID="your-agent-id-here"
|
||||||
|
|
||||||
|
# Run the demo
|
||||||
|
./convai_cpp
|
||||||
|
```
|
||||||
|
|
||||||
|
The application will:
|
||||||
|
1. Connect to your ElevenLabs Conversational AI agent
|
||||||
|
2. Start capturing audio from your default microphone
|
||||||
|
3. Stream audio to the agent and play responses through speakers
|
||||||
|
4. Display conversation transcripts in the terminal
|
||||||
|
5. Continue until you press Enter to quit
|
||||||
|
|
||||||
|
## 📋 Usage Examples
|
||||||
|
|
||||||
|
### Basic Conversation
|
||||||
|
```bash
|
||||||
|
export AGENT_ID="agent_"
|
||||||
|
./convai_cpp
|
||||||
|
# Speak into your microphone and hear the AI agent respond
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
### Audio Settings
|
||||||
|
|
||||||
|
The audio interface is configured for optimal real-time performance:
|
||||||
|
|
||||||
|
- **Sample Rate**: 16 kHz
|
||||||
|
- **Format**: 16-bit PCM mono
|
||||||
|
- **Input Buffer**: 250ms (4000 frames)
|
||||||
|
- **Output Buffer**: 62.5ms (1000 frames)
|
||||||
|
|
||||||
|
### WebSocket Connection
|
||||||
|
|
||||||
|
- **Endpoint**: `wss://api.elevenlabs.io/v1/convai/conversation`
|
||||||
|
- **Protocol**: WebSocket Secure (WSS) with TLS 1.2+
|
||||||
|
- **Authentication**: Optional (required for private agents)
|
||||||
|
|
||||||
|
## Project Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
elevenlabs-convai-cpp/
|
||||||
|
├── CMakeLists.txt # Build configuration
|
||||||
|
├── README.md # This file
|
||||||
|
├── LICENSE # MIT license
|
||||||
|
├── CONTRIBUTING.md # Contribution guidelines
|
||||||
|
├── .gitignore # Git ignore rules
|
||||||
|
├── include/ # Header files
|
||||||
|
│ ├── AudioInterface.hpp # Abstract audio interface
|
||||||
|
│ ├── DefaultAudioInterface.hpp # PortAudio implementation
|
||||||
|
│ └── Conversation.hpp # Main conversation handler
|
||||||
|
└── src/ # Source files
|
||||||
|
├── main.cpp # Demo application
|
||||||
|
├── Conversation.cpp # WebSocket and message handling
|
||||||
|
└── DefaultAudioInterface.cpp # Audio I/O implementation
|
||||||
|
```
|
||||||
|
|
||||||
|
## Technical Details
|
||||||
|
|
||||||
|
### Audio Processing Pipeline
|
||||||
|
|
||||||
|
1. **Capture**: PortAudio captures 16-bit PCM audio at 16kHz
|
||||||
|
2. **Encoding**: Raw audio is base64-encoded for WebSocket transmission
|
||||||
|
3. **Streaming**: Audio chunks sent as `user_audio_chunk` messages
|
||||||
|
4. **Reception**: Server sends `audio_event` messages with agent responses
|
||||||
|
5. **Decoding**: Base64 audio data decoded back to PCM
|
||||||
|
6. **Playback**: Audio queued and played through PortAudio output stream
|
||||||
|
|
||||||
|
### Echo Suppression
|
||||||
|
|
||||||
|
The implementation includes a simple, effective echo suppression mechanism:
|
||||||
|
|
||||||
|
- Microphone input is suppressed during agent speech playback
|
||||||
|
- Prevents acoustic feedback loops that cause the agent to respond to itself
|
||||||
|
- Uses atomic flags for thread-safe coordination between input/output
|
||||||
|
|
||||||
|
### WebSocket Message Handling
|
||||||
|
|
||||||
|
Supported message types:
|
||||||
|
- `conversation_initiation_client_data` - Session initialization
|
||||||
|
- `user_audio_chunk` - Microphone audio data
|
||||||
|
- `audio_event` - Agent speech audio
|
||||||
|
- `agent_response` - Agent text responses
|
||||||
|
- `user_transcript` - Speech-to-text results
|
||||||
|
- `ping`/`pong` - Connection keepalive
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## 📝 License
|
||||||
|
|
||||||
|
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
||||||
23
CPP/elevenlabs-convai-cpp-main/include/AudioInterface.hpp
Normal file
23
CPP/elevenlabs-convai-cpp-main/include/AudioInterface.hpp
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <functional>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
class AudioInterface {
|
||||||
|
public:
|
||||||
|
using AudioCallback = std::function<void(const std::vector<char>&)>;
|
||||||
|
|
||||||
|
virtual ~AudioInterface() = default;
|
||||||
|
|
||||||
|
// Starts the audio interface. The callback will be invoked with raw 16-bit PCM mono samples at 16kHz.
|
||||||
|
virtual void start(AudioCallback inputCallback) = 0;
|
||||||
|
|
||||||
|
// Stops audio I/O and releases underlying resources.
|
||||||
|
virtual void stop() = 0;
|
||||||
|
|
||||||
|
// Play audio to the user; audio is 16-bit PCM mono 16kHz.
|
||||||
|
virtual void output(const std::vector<char>& audio) = 0;
|
||||||
|
|
||||||
|
// Immediately stop any buffered / ongoing output.
|
||||||
|
virtual void interrupt() = 0;
|
||||||
|
};
|
||||||
72
CPP/elevenlabs-convai-cpp-main/include/Conversation.hpp
Normal file
72
CPP/elevenlabs-convai-cpp-main/include/Conversation.hpp
Normal file
@ -0,0 +1,72 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "AudioInterface.hpp"
|
||||||
|
#include <boost/beast/core.hpp>
|
||||||
|
#include <boost/beast/websocket.hpp>
|
||||||
|
#include <boost/beast/ssl.hpp>
|
||||||
|
#include <boost/asio.hpp>
|
||||||
|
#include <boost/asio/ssl/stream.hpp>
|
||||||
|
#include <boost/asio/ip/tcp.hpp>
|
||||||
|
#include <nlohmann/json.hpp>
|
||||||
|
|
||||||
|
#include <thread>
|
||||||
|
#include <atomic>
|
||||||
|
#include <functional>
|
||||||
|
|
||||||
|
class Conversation {
|
||||||
|
public:
|
||||||
|
using CallbackAgentResponse = std::function<void(const std::string&)>;
|
||||||
|
using CallbackAgentResponseCorrection = std::function<void(const std::string&, const std::string&)>;
|
||||||
|
using CallbackUserTranscript = std::function<void(const std::string&)>;
|
||||||
|
using CallbackLatencyMeasurement = std::function<void(int)>;
|
||||||
|
|
||||||
|
Conversation(
|
||||||
|
const std::string& agentId,
|
||||||
|
bool requiresAuth,
|
||||||
|
std::shared_ptr<AudioInterface> audioInterface,
|
||||||
|
CallbackAgentResponse callbackAgentResponse = nullptr,
|
||||||
|
CallbackAgentResponseCorrection callbackAgentResponseCorrection = nullptr,
|
||||||
|
CallbackUserTranscript callbackUserTranscript = nullptr,
|
||||||
|
CallbackLatencyMeasurement callbackLatencyMeasurement = nullptr
|
||||||
|
);
|
||||||
|
|
||||||
|
~Conversation();
|
||||||
|
|
||||||
|
void startSession();
|
||||||
|
void endSession();
|
||||||
|
std::string waitForSessionEnd();
|
||||||
|
|
||||||
|
void sendUserMessage(const std::string& text);
|
||||||
|
void registerUserActivity();
|
||||||
|
void sendContextualUpdate(const std::string& content);
|
||||||
|
|
||||||
|
private:
|
||||||
|
void run();
|
||||||
|
void handleMessage(const nlohmann::json& message);
|
||||||
|
std::string getWssUrl() const;
|
||||||
|
|
||||||
|
// networking members
|
||||||
|
boost::asio::io_context ioc_;
|
||||||
|
boost::asio::ssl::context sslCtx_{boost::asio::ssl::context::tlsv12_client};
|
||||||
|
|
||||||
|
using tcp = boost::asio::ip::tcp;
|
||||||
|
using websocket_t = boost::beast::websocket::stream<
|
||||||
|
boost::beast::ssl_stream<tcp::socket>>;
|
||||||
|
std::unique_ptr<websocket_t> ws_;
|
||||||
|
|
||||||
|
// general state
|
||||||
|
std::string agentId_;
|
||||||
|
bool requiresAuth_;
|
||||||
|
std::shared_ptr<AudioInterface> audioInterface_;
|
||||||
|
|
||||||
|
CallbackAgentResponse callbackAgentResponse_;
|
||||||
|
CallbackAgentResponseCorrection callbackAgentResponseCorrection_;
|
||||||
|
CallbackUserTranscript callbackUserTranscript_;
|
||||||
|
CallbackLatencyMeasurement callbackLatencyMeasurement_;
|
||||||
|
|
||||||
|
std::thread workerThread_;
|
||||||
|
std::atomic<bool> shouldStop_{false};
|
||||||
|
std::string conversationId_;
|
||||||
|
|
||||||
|
std::atomic<int> lastInterruptId_{0};
|
||||||
|
};
|
||||||
@ -0,0 +1,45 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "AudioInterface.hpp"
|
||||||
|
#include <portaudio.h>
|
||||||
|
#include <mutex>
|
||||||
|
#include <condition_variable>
|
||||||
|
#include <queue>
|
||||||
|
#include <thread>
|
||||||
|
#include <atomic>
|
||||||
|
|
||||||
|
class DefaultAudioInterface : public AudioInterface {
|
||||||
|
public:
|
||||||
|
static constexpr int INPUT_FRAMES_PER_BUFFER = 4000; // 250ms @ 16kHz
|
||||||
|
static constexpr int OUTPUT_FRAMES_PER_BUFFER = 1000; // 62.5ms @ 16kHz
|
||||||
|
|
||||||
|
DefaultAudioInterface();
|
||||||
|
~DefaultAudioInterface() override;
|
||||||
|
|
||||||
|
void start(AudioCallback inputCallback) override;
|
||||||
|
void stop() override;
|
||||||
|
void output(const std::vector<char>& audio) override;
|
||||||
|
void interrupt() override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
static int inputCallbackStatic(const void* input, void* output, unsigned long frameCount,
|
||||||
|
const PaStreamCallbackTimeInfo* timeInfo, PaStreamCallbackFlags statusFlags,
|
||||||
|
void* userData);
|
||||||
|
|
||||||
|
int inputCallbackInternal(const void* input, unsigned long frameCount);
|
||||||
|
|
||||||
|
void outputThreadFunc();
|
||||||
|
|
||||||
|
PaStream* inputStream_{};
|
||||||
|
PaStream* outputStream_{};
|
||||||
|
|
||||||
|
AudioCallback inputCallback_;
|
||||||
|
|
||||||
|
std::queue<std::vector<char>> outputQueue_;
|
||||||
|
std::mutex queueMutex_;
|
||||||
|
std::condition_variable queueCv_;
|
||||||
|
|
||||||
|
std::thread outputThread_;
|
||||||
|
std::atomic<bool> shouldStop_{false};
|
||||||
|
std::atomic<bool> outputPlaying_{false};
|
||||||
|
};
|
||||||
230
CPP/elevenlabs-convai-cpp-main/src/Conversation.cpp
Normal file
230
CPP/elevenlabs-convai-cpp-main/src/Conversation.cpp
Normal file
@ -0,0 +1,230 @@
|
|||||||
|
#include "Conversation.hpp"
|
||||||
|
|
||||||
|
#include <boost/beast/websocket/ssl.hpp>
|
||||||
|
#include <boost/beast/websocket.hpp>
|
||||||
|
#include <boost/beast/ssl.hpp>
|
||||||
|
#include <boost/beast/core/detail/base64.hpp>
|
||||||
|
#include <boost/asio/connect.hpp>
|
||||||
|
#include <boost/algorithm/string.hpp>
|
||||||
|
#include <iostream>
|
||||||
|
#include <sstream>
|
||||||
|
#include <openssl/ssl.h>
|
||||||
|
|
||||||
|
using tcp = boost::asio::ip::tcp;
|
||||||
|
namespace ssl = boost::asio::ssl;
|
||||||
|
namespace websocket = boost::beast::websocket;
|
||||||
|
namespace beast = boost::beast;
|
||||||
|
|
||||||
|
static std::string base64Encode(const std::vector<char>& data) {
|
||||||
|
auto encodedSize = beast::detail::base64::encoded_size(data.size());
|
||||||
|
std::string out(encodedSize, '\0');
|
||||||
|
beast::detail::base64::encode(&out[0], data.data(), data.size());
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
|
||||||
|
static std::vector<char> base64Decode(const std::string& str) {
|
||||||
|
auto decodedSize = beast::detail::base64::decoded_size(str.size());
|
||||||
|
std::vector<char> out(decodedSize);
|
||||||
|
auto result = beast::detail::base64::decode(out.data(), str.data(), str.size());
|
||||||
|
out.resize(result.first);
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
|
||||||
|
static std::string toString(const nlohmann::json& j){
|
||||||
|
if(j.is_string()) return j.get<std::string>();
|
||||||
|
if(j.is_number_integer()) return std::to_string(j.get<int64_t>());
|
||||||
|
return j.dump();
|
||||||
|
}
|
||||||
|
|
||||||
|
Conversation::Conversation(const std::string& agentId, bool requiresAuth,
|
||||||
|
std::shared_ptr<AudioInterface> audioInterface,
|
||||||
|
CallbackAgentResponse callbackAgentResponse,
|
||||||
|
CallbackAgentResponseCorrection callbackAgentResponseCorrection,
|
||||||
|
CallbackUserTranscript callbackUserTranscript,
|
||||||
|
CallbackLatencyMeasurement callbackLatencyMeasurement)
|
||||||
|
: agentId_(agentId),
|
||||||
|
requiresAuth_(requiresAuth),
|
||||||
|
audioInterface_(std::move(audioInterface)),
|
||||||
|
callbackAgentResponse_(std::move(callbackAgentResponse)),
|
||||||
|
callbackAgentResponseCorrection_(std::move(callbackAgentResponseCorrection)),
|
||||||
|
callbackUserTranscript_(std::move(callbackUserTranscript)),
|
||||||
|
callbackLatencyMeasurement_(std::move(callbackLatencyMeasurement)) {
|
||||||
|
|
||||||
|
sslCtx_.set_default_verify_paths();
|
||||||
|
}
|
||||||
|
|
||||||
|
Conversation::~Conversation() {
|
||||||
|
endSession();
|
||||||
|
}
|
||||||
|
|
||||||
|
void Conversation::startSession() {
|
||||||
|
shouldStop_.store(false);
|
||||||
|
workerThread_ = std::thread(&Conversation::run, this);
|
||||||
|
}
|
||||||
|
|
||||||
|
void Conversation::endSession() {
|
||||||
|
shouldStop_.store(true);
|
||||||
|
if (ws_) {
|
||||||
|
beast::error_code ec;
|
||||||
|
ws_->close(websocket::close_code::normal, ec);
|
||||||
|
}
|
||||||
|
if (audioInterface_) {
|
||||||
|
audioInterface_->stop();
|
||||||
|
}
|
||||||
|
if (workerThread_.joinable()) {
|
||||||
|
workerThread_.join();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string Conversation::waitForSessionEnd() {
|
||||||
|
if (workerThread_.joinable()) {
|
||||||
|
workerThread_.join();
|
||||||
|
}
|
||||||
|
return conversationId_;
|
||||||
|
}
|
||||||
|
|
||||||
|
void Conversation::sendUserMessage(const std::string& text) {
|
||||||
|
if (!ws_) {
|
||||||
|
throw std::runtime_error("Session not started");
|
||||||
|
}
|
||||||
|
nlohmann::json j = {
|
||||||
|
{"type", "user_message"},
|
||||||
|
{"text", text}
|
||||||
|
};
|
||||||
|
ws_->write(boost::asio::buffer(j.dump()));
|
||||||
|
}
|
||||||
|
|
||||||
|
void Conversation::registerUserActivity() {
|
||||||
|
if (!ws_) throw std::runtime_error("Session not started");
|
||||||
|
nlohmann::json j = {{"type", "user_activity"}};
|
||||||
|
ws_->write(boost::asio::buffer(j.dump()));
|
||||||
|
}
|
||||||
|
|
||||||
|
void Conversation::sendContextualUpdate(const std::string& content) {
|
||||||
|
if (!ws_) throw std::runtime_error("Session not started");
|
||||||
|
nlohmann::json j = {{"type", "contextual_update"}, {"content", content}};
|
||||||
|
ws_->write(boost::asio::buffer(j.dump()));
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string Conversation::getWssUrl() const {
|
||||||
|
// Hard-coded base env for demo; in production you'd call ElevenLabs env endpoint.
|
||||||
|
std::ostringstream oss;
|
||||||
|
oss << "wss://api.elevenlabs.io/v1/convai/conversation?agent_id=" << agentId_;
|
||||||
|
return oss.str();
|
||||||
|
}
|
||||||
|
|
||||||
|
void Conversation::run() {
|
||||||
|
try {
|
||||||
|
auto url = getWssUrl();
|
||||||
|
std::string protocol, host, target;
|
||||||
|
unsigned short port = 443;
|
||||||
|
|
||||||
|
// Very naive parse: wss://host[:port]/path?query
|
||||||
|
if (boost::starts_with(url, "wss://")) {
|
||||||
|
protocol = "wss";
|
||||||
|
host = url.substr(6);
|
||||||
|
} else {
|
||||||
|
throw std::runtime_error("Only wss:// URLs supported in this demo");
|
||||||
|
}
|
||||||
|
auto slashPos = host.find('/');
|
||||||
|
if (slashPos == std::string::npos) {
|
||||||
|
target = "/";
|
||||||
|
} else {
|
||||||
|
target = host.substr(slashPos);
|
||||||
|
host = host.substr(0, slashPos);
|
||||||
|
}
|
||||||
|
auto colonPos = host.find(':');
|
||||||
|
if (colonPos != std::string::npos) {
|
||||||
|
port = static_cast<unsigned short>(std::stoi(host.substr(colonPos + 1)));
|
||||||
|
host = host.substr(0, colonPos);
|
||||||
|
}
|
||||||
|
|
||||||
|
tcp::resolver resolver(ioc_);
|
||||||
|
auto const results = resolver.resolve(host, std::to_string(port));
|
||||||
|
|
||||||
|
beast::ssl_stream<tcp::socket> stream(ioc_, sslCtx_);
|
||||||
|
boost::asio::connect(beast::get_lowest_layer(stream), results);
|
||||||
|
if (!SSL_set_tlsext_host_name(stream.native_handle(), host.c_str())) {
|
||||||
|
throw std::runtime_error("Failed to set SNI hostname on SSL stream");
|
||||||
|
}
|
||||||
|
stream.handshake(ssl::stream_base::client);
|
||||||
|
|
||||||
|
ws_ = std::make_unique<websocket_t>(std::move(stream));
|
||||||
|
ws_->set_option(websocket::stream_base::timeout::suggested(beast::role_type::client));
|
||||||
|
ws_->handshake(host, target);
|
||||||
|
|
||||||
|
// send initiation data
|
||||||
|
nlohmann::json init = {
|
||||||
|
{"type", "conversation_initiation_client_data"},
|
||||||
|
{"custom_llm_extra_body", nlohmann::json::object()},
|
||||||
|
{"conversation_config_override", nlohmann::json::object()},
|
||||||
|
{"dynamic_variables", nlohmann::json::object()}
|
||||||
|
};
|
||||||
|
ws_->write(boost::asio::buffer(init.dump()));
|
||||||
|
|
||||||
|
// Prepare audio callback
|
||||||
|
auto inputCb = [this](const std::vector<char>& audio) {
|
||||||
|
nlohmann::json msg = {
|
||||||
|
{"user_audio_chunk", base64Encode(audio)}
|
||||||
|
};
|
||||||
|
ws_->write(boost::asio::buffer(msg.dump()));
|
||||||
|
};
|
||||||
|
audioInterface_->start(inputCb);
|
||||||
|
|
||||||
|
beast::flat_buffer buffer;
|
||||||
|
while (!shouldStop_.load()) {
|
||||||
|
beast::error_code ec;
|
||||||
|
ws_->read(buffer, ec);
|
||||||
|
if (ec) {
|
||||||
|
std::cerr << "Websocket read error: " << ec.message() << std::endl;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
auto text = beast::buffers_to_string(buffer.data());
|
||||||
|
buffer.consume(buffer.size());
|
||||||
|
try {
|
||||||
|
auto message = nlohmann::json::parse(text);
|
||||||
|
handleMessage(message);
|
||||||
|
} catch (const std::exception& ex) {
|
||||||
|
std::cerr << "JSON parse error: " << ex.what() << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (const std::exception& ex) {
|
||||||
|
std::cerr << "Conversation error: " << ex.what() << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void Conversation::handleMessage(const nlohmann::json& message) {
|
||||||
|
std::string type = message.value("type", "");
|
||||||
|
if (type == "conversation_initiation_metadata") {
|
||||||
|
conversationId_ = message["conversation_initiation_metadata_event"]["conversation_id"].get<std::string>();
|
||||||
|
} else if (type == "audio") {
|
||||||
|
auto event = message["audio_event"];
|
||||||
|
int eventId = std::stoi(toString(event["event_id"]));
|
||||||
|
if (eventId <= lastInterruptId_.load()) return;
|
||||||
|
auto audioBytes = base64Decode(event["audio_base_64"].get<std::string>());
|
||||||
|
audioInterface_->output(audioBytes);
|
||||||
|
} else if (type == "agent_response" && callbackAgentResponse_) {
|
||||||
|
auto event = message["agent_response_event"];
|
||||||
|
callbackAgentResponse_(event["agent_response"].get<std::string>());
|
||||||
|
} else if (type == "agent_response_correction" && callbackAgentResponseCorrection_) {
|
||||||
|
auto event = message["agent_response_correction_event"];
|
||||||
|
callbackAgentResponseCorrection_(event["original_agent_response"].get<std::string>(),
|
||||||
|
event["corrected_agent_response"].get<std::string>());
|
||||||
|
} else if (type == "user_transcript" && callbackUserTranscript_) {
|
||||||
|
auto event = message["user_transcription_event"];
|
||||||
|
callbackUserTranscript_(event["user_transcript"].get<std::string>());
|
||||||
|
} else if (type == "interruption") {
|
||||||
|
auto event = message["interruption_event"];
|
||||||
|
lastInterruptId_.store(std::stoi(toString(event["event_id"])));
|
||||||
|
audioInterface_->interrupt();
|
||||||
|
} else if (type == "ping") {
|
||||||
|
auto event = message["ping_event"];
|
||||||
|
nlohmann::json pong = {{"type", "pong"}, {"event_id", event["event_id"]}};
|
||||||
|
ws_->write(boost::asio::buffer(pong.dump()));
|
||||||
|
if (callbackLatencyMeasurement_ && event.contains("ping_ms")) {
|
||||||
|
int latency = event["ping_ms"].is_number() ? event["ping_ms"].get<int>() : std::stoi(event["ping_ms"].get<std::string>());
|
||||||
|
callbackLatencyMeasurement_(latency);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Note: client tool call handling omitted for brevity.
|
||||||
|
}
|
||||||
131
CPP/elevenlabs-convai-cpp-main/src/DefaultAudioInterface.cpp
Normal file
131
CPP/elevenlabs-convai-cpp-main/src/DefaultAudioInterface.cpp
Normal file
@ -0,0 +1,131 @@
|
|||||||
|
#include "DefaultAudioInterface.hpp"
|
||||||
|
|
||||||
|
#include <cstring>
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
DefaultAudioInterface::DefaultAudioInterface() {
|
||||||
|
PaError err = Pa_Initialize();
|
||||||
|
if (err != paNoError) {
|
||||||
|
throw std::runtime_error("PortAudio initialization failed");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
DefaultAudioInterface::~DefaultAudioInterface() {
|
||||||
|
if (!shouldStop_.load()) {
|
||||||
|
stop();
|
||||||
|
}
|
||||||
|
Pa_Terminate();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Opens and starts one capture and one playback PortAudio stream
// (16 kHz, mono, 16-bit PCM) and launches the playback thread.
//
// inputCallback: invoked from the PortAudio capture callback with each chunk
//                of recorded audio (raw int16 little-endian bytes).
// Throws std::runtime_error if no default device exists or any stream fails
// to open/start. Fix vs. original: on a partial failure, every stream opened
// so far is closed before throwing, so a failed start() no longer leaks
// PortAudio stream handles; also guards against Pa_GetDeviceInfo(paNoDevice)
// returning nullptr (which the original dereferenced unconditionally).
void DefaultAudioInterface::start(AudioCallback inputCallback) {
    inputCallback_ = std::move(inputCallback);

    // --- Capture side ---
    PaStreamParameters inputParams;
    std::memset(&inputParams, 0, sizeof(inputParams));
    inputParams.channelCount = 1;
    inputParams.device = Pa_GetDefaultInputDevice();
    inputParams.sampleFormat = paInt16;
    if (inputParams.device == paNoDevice) {
        throw std::runtime_error("No default input device available");
    }
    inputParams.suggestedLatency = Pa_GetDeviceInfo(inputParams.device)->defaultLowInputLatency;
    inputParams.hostApiSpecificStreamInfo = nullptr;

    // --- Playback side (same format as capture) ---
    PaStreamParameters outputParams;
    std::memset(&outputParams, 0, sizeof(outputParams));
    outputParams.channelCount = 1;
    outputParams.device = Pa_GetDefaultOutputDevice();
    outputParams.sampleFormat = paInt16;
    if (outputParams.device == paNoDevice) {
        throw std::runtime_error("No default output device available");
    }
    outputParams.suggestedLatency = Pa_GetDeviceInfo(outputParams.device)->defaultLowOutputLatency;
    outputParams.hostApiSpecificStreamInfo = nullptr;

    PaError err = Pa_OpenStream(&inputStream_, &inputParams, nullptr, 16000, INPUT_FRAMES_PER_BUFFER, paClipOff,
                                &DefaultAudioInterface::inputCallbackStatic, this);
    if (err != paNoError) {
        throw std::runtime_error("Failed to open input stream");
    }

    err = Pa_OpenStream(&outputStream_, nullptr, &outputParams, 16000, OUTPUT_FRAMES_PER_BUFFER, paClipOff, nullptr, nullptr);
    if (err != paNoError) {
        // Release the already-opened input stream so we do not leak it.
        Pa_CloseStream(inputStream_);
        inputStream_ = nullptr;
        throw std::runtime_error("Failed to open output stream");
    }

    if ((err = Pa_StartStream(inputStream_)) != paNoError) {
        Pa_CloseStream(inputStream_);
        inputStream_ = nullptr;
        Pa_CloseStream(outputStream_);
        outputStream_ = nullptr;
        throw std::runtime_error("Failed to start input stream");
    }
    if ((err = Pa_StartStream(outputStream_)) != paNoError) {
        // Input stream is already running; stop it before closing.
        Pa_StopStream(inputStream_);
        Pa_CloseStream(inputStream_);
        inputStream_ = nullptr;
        Pa_CloseStream(outputStream_);
        outputStream_ = nullptr;
        throw std::runtime_error("Failed to start output stream");
    }

    // Clear the stop flag before spawning the thread so it enters its loop.
    shouldStop_.store(false);
    outputThread_ = std::thread(&DefaultAudioInterface::outputThreadFunc, this);
}
|
||||||
|
|
||||||
|
void DefaultAudioInterface::stop() {
|
||||||
|
shouldStop_.store(true);
|
||||||
|
queueCv_.notify_all();
|
||||||
|
if (outputThread_.joinable()) {
|
||||||
|
outputThread_.join();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (inputStream_) {
|
||||||
|
Pa_StopStream(inputStream_);
|
||||||
|
Pa_CloseStream(inputStream_);
|
||||||
|
inputStream_ = nullptr;
|
||||||
|
}
|
||||||
|
if (outputStream_) {
|
||||||
|
Pa_StopStream(outputStream_);
|
||||||
|
Pa_CloseStream(outputStream_);
|
||||||
|
outputStream_ = nullptr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void DefaultAudioInterface::output(const std::vector<char>& audio) {
|
||||||
|
{
|
||||||
|
std::lock_guard<std::mutex> lg(queueMutex_);
|
||||||
|
outputQueue_.emplace(audio);
|
||||||
|
}
|
||||||
|
queueCv_.notify_one();
|
||||||
|
}
|
||||||
|
|
||||||
|
void DefaultAudioInterface::interrupt() {
|
||||||
|
std::lock_guard<std::mutex> lg(queueMutex_);
|
||||||
|
std::queue<std::vector<char>> empty;
|
||||||
|
std::swap(outputQueue_, empty);
|
||||||
|
}
|
||||||
|
|
||||||
|
int DefaultAudioInterface::inputCallbackStatic(const void* input, void* /*output*/, unsigned long frameCount,
|
||||||
|
const PaStreamCallbackTimeInfo* /*timeInfo*/, PaStreamCallbackFlags /*statusFlags*/,
|
||||||
|
void* userData) {
|
||||||
|
auto* self = static_cast<DefaultAudioInterface*>(userData);
|
||||||
|
return self->inputCallbackInternal(input, frameCount);
|
||||||
|
}
|
||||||
|
|
||||||
|
int DefaultAudioInterface::inputCallbackInternal(const void* input, unsigned long frameCount) {
|
||||||
|
if (!input || !inputCallback_) {
|
||||||
|
return paContinue;
|
||||||
|
}
|
||||||
|
if (outputPlaying_.load()) {
|
||||||
|
// Suppress microphone input while playing output to avoid echo feedback.
|
||||||
|
return paContinue;
|
||||||
|
}
|
||||||
|
const size_t bytes = frameCount * sizeof(int16_t);
|
||||||
|
std::vector<char> buffer(bytes);
|
||||||
|
std::memcpy(buffer.data(), input, bytes);
|
||||||
|
inputCallback_(buffer);
|
||||||
|
return paContinue;
|
||||||
|
}
|
||||||
|
|
||||||
|
void DefaultAudioInterface::outputThreadFunc() {
|
||||||
|
while (!shouldStop_.load()) {
|
||||||
|
std::vector<char> audio;
|
||||||
|
{
|
||||||
|
std::unique_lock<std::mutex> lk(queueMutex_);
|
||||||
|
queueCv_.wait(lk, [this] { return shouldStop_.load() || !outputQueue_.empty(); });
|
||||||
|
if (shouldStop_.load()) break;
|
||||||
|
audio = std::move(outputQueue_.front());
|
||||||
|
outputQueue_.pop();
|
||||||
|
}
|
||||||
|
if (!audio.empty() && outputStream_) {
|
||||||
|
outputPlaying_.store(true);
|
||||||
|
Pa_WriteStream(outputStream_, audio.data(), audio.size() / sizeof(int16_t));
|
||||||
|
outputPlaying_.store(false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
31
CPP/elevenlabs-convai-cpp-main/src/main.cpp
Normal file
31
CPP/elevenlabs-convai-cpp-main/src/main.cpp
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
#include "Conversation.hpp"
|
||||||
|
#include "DefaultAudioInterface.hpp"
|
||||||
|
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <iostream>
|
||||||
|
#include <memory>
|
||||||
|
|
||||||
|
int main() {
|
||||||
|
const char* agentIdEnv = std::getenv("AGENT_ID");
|
||||||
|
if (!agentIdEnv) {
|
||||||
|
std::cerr << "AGENT_ID environment variable must be set" << std::endl;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
std::string agentId(agentIdEnv);
|
||||||
|
|
||||||
|
auto audioInterface = std::make_shared<DefaultAudioInterface>();
|
||||||
|
Conversation conv(agentId, /*requiresAuth*/ false, audioInterface,
|
||||||
|
[](const std::string& resp) { std::cout << "Agent: " << resp << std::endl; },
|
||||||
|
[](const std::string& orig, const std::string& corrected) {
|
||||||
|
std::cout << "Agent correction: " << orig << " -> " << corrected << std::endl; },
|
||||||
|
[](const std::string& transcript) { std::cout << "User: " << transcript << std::endl; });
|
||||||
|
|
||||||
|
conv.startSession();
|
||||||
|
|
||||||
|
std::cout << "Press Enter to quit..." << std::endl;
|
||||||
|
std::cin.get();
|
||||||
|
conv.endSession();
|
||||||
|
auto convId = conv.waitForSessionEnd();
|
||||||
|
std::cout << "Conversation ID: " << convId << std::endl;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
Binary file not shown.
@ -604,9 +604,15 @@ void UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured(const TAr
|
|||||||
{
|
{
|
||||||
if (!IsConnected() || !bIsListening) return;
|
if (!IsConnected() || !bIsListening) return;
|
||||||
|
|
||||||
|
// Echo suppression: skip sending mic audio while the agent is speaking.
|
||||||
|
// This prevents the agent from hearing its own voice through the speakers,
|
||||||
|
// which would confuse the server's VAD and STT. Matches the approach used
|
||||||
|
// in the official ElevenLabs C++ SDK (outputPlaying_ flag).
|
||||||
|
if (bAgentSpeaking) return;
|
||||||
|
|
||||||
// Convert this callback's samples to int16 bytes and accumulate.
|
// Convert this callback's samples to int16 bytes and accumulate.
|
||||||
// WASAPI fires every ~5ms (158 bytes at 16kHz). ElevenLabs needs ≥100ms
|
// WASAPI fires every ~5ms (158 bytes at 16kHz). ElevenLabs needs ≥250ms
|
||||||
// (3200 bytes) per chunk for reliable VAD and STT. We hold bytes here
|
// (8000 bytes) per chunk for reliable VAD and STT. We hold bytes here
|
||||||
// until we have enough, then send the whole batch in one WebSocket frame.
|
// until we have enough, then send the whole batch in one WebSocket frame.
|
||||||
TArray<uint8> PCMBytes = FloatPCMToInt16Bytes(FloatPCM);
|
TArray<uint8> PCMBytes = FloatPCMToInt16Bytes(FloatPCM);
|
||||||
MicAccumulationBuffer.Append(PCMBytes);
|
MicAccumulationBuffer.Append(PCMBytes);
|
||||||
|
|||||||
@ -491,6 +491,17 @@ void UElevenLabsWebSocketProxy::HandleAudioResponse(const TSharedPtr<FJsonObject
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Discard audio belonging to an interrupted generation (event_id approach).
|
||||||
|
// Matches the official ElevenLabs C++ and Python SDKs: only AUDIO is filtered
|
||||||
|
// by event_id — transcripts, agent_response, etc. are always processed.
|
||||||
|
int32 EventId = 0;
|
||||||
|
(*AudioEvent)->TryGetNumberField(TEXT("event_id"), EventId);
|
||||||
|
if (EventId > 0 && EventId <= LastInterruptEventId)
|
||||||
|
{
|
||||||
|
UE_LOG(LogElevenLabsWS, Verbose, TEXT("Discarding audio event_id=%d (interrupted at %d)."), EventId, LastInterruptEventId);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
FString Base64Audio;
|
FString Base64Audio;
|
||||||
if (!(*AudioEvent)->TryGetStringField(TEXT("audio_base_64"), Base64Audio))
|
if (!(*AudioEvent)->TryGetStringField(TEXT("audio_base_64"), Base64Audio))
|
||||||
{
|
{
|
||||||
@ -591,7 +602,20 @@ void UElevenLabsWebSocketProxy::HandleAgentChatResponsePart(const TSharedPtr<FJs
|
|||||||
|
|
||||||
// Handles the server's "interruption" acknowledgement.
// Records the interrupt watermark (LastInterruptEventId) used elsewhere to
// discard audio frames belonging to the cancelled generation, then notifies
// Blueprint/game listeners via OnInterrupted.
void UElevenLabsWebSocketProxy::HandleInterruption(const TSharedPtr<FJsonObject>& Root)
{
    // Extract the interrupt event_id so we can filter stale audio frames.
    // Expected payload shape:
    // { "type": "interruption", "interruption_event": { "event_id": 42 } }
    const TSharedPtr<FJsonObject>* InterruptEvent = nullptr;
    if (Root->TryGetObjectField(TEXT("interruption_event"), InterruptEvent) && InterruptEvent)
    {
        int32 EventId = 0;
        // If event_id is absent, EventId stays 0 and the watermark is untouched.
        (*InterruptEvent)->TryGetNumberField(TEXT("event_id"), EventId);
        // Only move the watermark forward; a late/out-of-order ack must not
        // regress it and resurrect already-discarded audio.
        if (EventId > LastInterruptEventId)
        {
            LastInterruptEventId = EventId;
        }
    }

    UE_LOG(LogElevenLabsWS, Log, TEXT("Agent interrupted (server ack, LastInterruptEventId=%d)."), LastInterruptEventId);
    OnInterrupted.Broadcast();
}
|
||||||
|
|
||||||
|
|||||||
@ -400,5 +400,5 @@ private:
|
|||||||
// ElevenLabs needs at least ~100ms (3200 bytes) per chunk for reliable VAD/STT.
|
// ElevenLabs needs at least ~100ms (3200 bytes) per chunk for reliable VAD/STT.
|
||||||
// We accumulate here and only call SendAudioChunk once enough bytes are ready.
|
// We accumulate here and only call SendAudioChunk once enough bytes are ready.
|
||||||
TArray<uint8> MicAccumulationBuffer;
|
TArray<uint8> MicAccumulationBuffer;
|
||||||
static constexpr int32 MicChunkMinBytes = 3200; // 100ms @ 16kHz 16-bit mono
|
static constexpr int32 MicChunkMinBytes = 8000; // 250ms @ 16kHz 16-bit mono (4000 samples, matches ElevenLabs SDK recommendation)
|
||||||
};
|
};
|
||||||
|
|||||||
@ -226,6 +226,13 @@ private:
|
|||||||
// Used to compute [T+Xs] session-relative timestamps in all log messages.
|
// Used to compute [T+Xs] session-relative timestamps in all log messages.
|
||||||
double SessionStartTime = 0.0;
|
double SessionStartTime = 0.0;
|
||||||
|
|
||||||
|
// ── Interrupt filtering (event_id approach, matching official SDK) ────────
|
||||||
|
// When the server sends an "interruption" event it includes an event_id.
|
||||||
|
// Audio events whose event_id <= LastInterruptEventId belong to the cancelled
|
||||||
|
// generation and must be discarded. Only AUDIO is filtered — transcripts,
|
||||||
|
// agent_response, agent_chat_response_part etc. are always processed.
|
||||||
|
int32 LastInterruptEventId = 0;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
// Set by UElevenLabsConversationalAgentComponent before calling Connect().
|
// Set by UElevenLabsConversationalAgentComponent before calling Connect().
|
||||||
// Controls turn_timeout in conversation_initiation_client_data.
|
// Controls turn_timeout in conversation_initiation_client_data.
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user