Compare commits
No commits in common. "f23acc8c1ce02d84b4984453782bc9b7b7ac730f" and "d63c1776b5c72ce5a60cd93b25abbea50b676f58" have entirely different histories.
f23acc8c1c
...
d63c1776b5
152
CPP/elevenlabs-convai-cpp-main/.gitignore
vendored
152
CPP/elevenlabs-convai-cpp-main/.gitignore
vendored
@ -1,152 +0,0 @@
|
|||||||
# Build directories
|
|
||||||
build/
|
|
||||||
cmake-build-*/
|
|
||||||
out/
|
|
||||||
|
|
||||||
# Compiled Object files
|
|
||||||
*.slo
|
|
||||||
*.lo
|
|
||||||
*.o
|
|
||||||
*.obj
|
|
||||||
|
|
||||||
# Precompiled Headers
|
|
||||||
*.gch
|
|
||||||
*.pch
|
|
||||||
|
|
||||||
# Compiled Dynamic libraries
|
|
||||||
*.so
|
|
||||||
*.dylib
|
|
||||||
*.dll
|
|
||||||
|
|
||||||
# Fortran module files
|
|
||||||
*.mod
|
|
||||||
*.smod
|
|
||||||
|
|
||||||
# Compiled Static libraries
|
|
||||||
*.lai
|
|
||||||
*.la
|
|
||||||
*.a
|
|
||||||
*.lib
|
|
||||||
|
|
||||||
# Executables
|
|
||||||
*.exe
|
|
||||||
*.out
|
|
||||||
*.app
|
|
||||||
convai_cpp
|
|
||||||
|
|
||||||
# CMake
|
|
||||||
CMakeCache.txt
|
|
||||||
CMakeFiles/
|
|
||||||
CMakeScripts/
|
|
||||||
Testing/
|
|
||||||
Makefile
|
|
||||||
cmake_install.cmake
|
|
||||||
install_manifest.txt
|
|
||||||
compile_commands.json
|
|
||||||
CTestTestfile.cmake
|
|
||||||
_deps/
|
|
||||||
|
|
||||||
# IDE files
|
|
||||||
.vscode/
|
|
||||||
.idea/
|
|
||||||
*.swp
|
|
||||||
*.swo
|
|
||||||
*~
|
|
||||||
|
|
||||||
# macOS
|
|
||||||
.DS_Store
|
|
||||||
.AppleDouble
|
|
||||||
.LSOverride
|
|
||||||
|
|
||||||
# Thumbnails
|
|
||||||
._*
|
|
||||||
|
|
||||||
# Files that might appear in the root of a volume
|
|
||||||
.DocumentRevisions-V100
|
|
||||||
.fseventsd
|
|
||||||
.Spotlight-V100
|
|
||||||
.TemporaryItems
|
|
||||||
.Trashes
|
|
||||||
.VolumeIcon.icns
|
|
||||||
.com.apple.timemachine.donotpresent
|
|
||||||
|
|
||||||
# Directories potentially created on remote AFP share
|
|
||||||
.AppleDB
|
|
||||||
.AppleDesktop
|
|
||||||
Network Trash Folder
|
|
||||||
Temporary Items
|
|
||||||
.apdisk
|
|
||||||
|
|
||||||
# Windows
|
|
||||||
Thumbs.db
|
|
||||||
ehthumbs.db
|
|
||||||
Desktop.ini
|
|
||||||
$RECYCLE.BIN/
|
|
||||||
*.cab
|
|
||||||
*.msi
|
|
||||||
*.msm
|
|
||||||
*.msp
|
|
||||||
*.lnk
|
|
||||||
|
|
||||||
# Linux
|
|
||||||
*~
|
|
||||||
.fuse_hidden*
|
|
||||||
.directory
|
|
||||||
.Trash-*
|
|
||||||
.nfs*
|
|
||||||
|
|
||||||
# Logs
|
|
||||||
*.log
|
|
||||||
|
|
||||||
# Runtime data
|
|
||||||
pids
|
|
||||||
*.pid
|
|
||||||
*.seed
|
|
||||||
*.pid.lock
|
|
||||||
|
|
||||||
# Coverage directory used by tools like istanbul
|
|
||||||
coverage/
|
|
||||||
|
|
||||||
# nyc test coverage
|
|
||||||
.nyc_output
|
|
||||||
|
|
||||||
# Dependency directories
|
|
||||||
node_modules/
|
|
||||||
|
|
||||||
# Optional npm cache directory
|
|
||||||
.npm
|
|
||||||
|
|
||||||
# Optional REPL history
|
|
||||||
.node_repl_history
|
|
||||||
|
|
||||||
# Output of 'npm pack'
|
|
||||||
*.tgz
|
|
||||||
|
|
||||||
# Yarn Integrity file
|
|
||||||
.yarn-integrity
|
|
||||||
|
|
||||||
# dotenv environment variables file
|
|
||||||
.env
|
|
||||||
.env.test
|
|
||||||
|
|
||||||
# parcel-bundler cache (https://parceljs.org/)
|
|
||||||
.cache
|
|
||||||
.parcel-cache
|
|
||||||
|
|
||||||
# next.js build output
|
|
||||||
.next
|
|
||||||
|
|
||||||
# nuxt.js build output
|
|
||||||
.nuxt
|
|
||||||
|
|
||||||
# vuepress build output
|
|
||||||
.vuepress/dist
|
|
||||||
|
|
||||||
# Serverless directories
|
|
||||||
.serverless
|
|
||||||
|
|
||||||
# FuseBox cache
|
|
||||||
.fusebox/
|
|
||||||
|
|
||||||
# DynamoDB Local files
|
|
||||||
.dynamodb/
|
|
||||||
@ -1,42 +0,0 @@
|
|||||||
cmake_minimum_required(VERSION 3.14)
|
|
||||||
|
|
||||||
project(elevenlabs_convai_cpp LANGUAGES CXX)
|
|
||||||
|
|
||||||
set(CMAKE_CXX_STANDARD 17)
|
|
||||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
|
||||||
|
|
||||||
# Find dependencies
|
|
||||||
find_package(Boost REQUIRED COMPONENTS system thread)
|
|
||||||
find_package(OpenSSL REQUIRED)
|
|
||||||
# PortAudio via vcpkg CMake config
|
|
||||||
find_package(portaudio CONFIG REQUIRED)
|
|
||||||
|
|
||||||
# Find nlohmann_json
|
|
||||||
find_package(nlohmann_json 3.11 QUIET)
|
|
||||||
|
|
||||||
if(NOT nlohmann_json_FOUND)
|
|
||||||
include(FetchContent)
|
|
||||||
# Fallback: header-only fetch to avoid old CMake policies in upstream CMakeLists
|
|
||||||
FetchContent_Declare(
|
|
||||||
nlohmann_json_src
|
|
||||||
URL https://raw.githubusercontent.com/nlohmann/json/v3.11.2/single_include/nlohmann/json.hpp
|
|
||||||
)
|
|
||||||
FetchContent_MakeAvailable(nlohmann_json_src)
|
|
||||||
add_library(nlohmann_json::nlohmann_json INTERFACE IMPORTED)
|
|
||||||
target_include_directories(nlohmann_json::nlohmann_json INTERFACE ${nlohmann_json_src_SOURCE_DIR}/single_include)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
add_executable(convai_cpp
|
|
||||||
src/main.cpp
|
|
||||||
src/Conversation.cpp
|
|
||||||
src/DefaultAudioInterface.cpp
|
|
||||||
)
|
|
||||||
|
|
||||||
target_include_directories(convai_cpp PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
|
|
||||||
|
|
||||||
# MSVC: set Windows target version and suppress getenv deprecation warning
|
|
||||||
if(MSVC)
|
|
||||||
target_compile_definitions(convai_cpp PRIVATE _WIN32_WINNT=0x0A00 _CRT_SECURE_NO_WARNINGS)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
target_link_libraries(convai_cpp PRIVATE Boost::system Boost::thread OpenSSL::SSL OpenSSL::Crypto portaudio nlohmann_json::nlohmann_json)
|
|
||||||
@ -1,21 +0,0 @@
|
|||||||
MIT License
|
|
||||||
|
|
||||||
Copyright (c) 2024 Jitendra
|
|
||||||
|
|
||||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
of this software and associated documentation files (the "Software"), to deal
|
|
||||||
in the Software without restriction, including without limitation the rights
|
|
||||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
copies of the Software, and to permit persons to whom the Software is
|
|
||||||
furnished to do so, subject to the following conditions:
|
|
||||||
|
|
||||||
The above copyright notice and this permission notice shall be included in all
|
|
||||||
copies or substantial portions of the Software.
|
|
||||||
|
|
||||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
SOFTWARE.
|
|
||||||
@ -1,197 +0,0 @@
|
|||||||
# ElevenLabs Conversational AI - C++ Implementation
|
|
||||||
|
|
||||||
[](https://opensource.org/licenses/MIT)
|
|
||||||
[](https://en.wikipedia.org/wiki/C%2B%2B17)
|
|
||||||
[](https://cmake.org/)
|
|
||||||
|
|
||||||
C++ implementation of ElevenLabs Conversational AI client
|
|
||||||
|
|
||||||
## Features
|
|
||||||
|
|
||||||
- **Real-time Audio Processing**: Full-duplex audio streaming with low-latency playback
|
|
||||||
- **WebSocket Integration**: Secure WSS connection to ElevenLabs Conversational AI platform
|
|
||||||
- **Cross-platform Audio**: PortAudio-based implementation supporting Windows, macOS, and Linux
|
|
||||||
- **Echo Suppression**: Built-in acoustic feedback prevention
|
|
||||||
- **Modern C++**: Clean, maintainable C++17 codebase with proper RAII and exception handling
|
|
||||||
- **Flexible Architecture**: Modular design allowing easy customization and extension
|
|
||||||
|
|
||||||
## Architecture
|
|
||||||
|
|
||||||
```mermaid
|
|
||||||
graph TB
|
|
||||||
subgraph "User Interface"
|
|
||||||
A[main.cpp] --> B[Conversation]
|
|
||||||
end
|
|
||||||
|
|
||||||
subgraph "Core Components"
|
|
||||||
B --> C[DefaultAudioInterface]
|
|
||||||
B --> D[WebSocket Client]
|
|
||||||
C --> E[PortAudio]
|
|
||||||
D --> F[Boost.Beast + OpenSSL]
|
|
||||||
end
|
|
||||||
|
|
||||||
subgraph "ElevenLabs Platform"
|
|
||||||
F --> G[WSS API Endpoint]
|
|
||||||
G --> H[Conversational AI Agent]
|
|
||||||
end
|
|
||||||
|
|
||||||
subgraph "Audio Flow"
|
|
||||||
I[Microphone] --> C
|
|
||||||
C --> J[Base64 Encoding]
|
|
||||||
J --> D
|
|
||||||
D --> K[Audio Events]
|
|
||||||
K --> L[Base64 Decoding]
|
|
||||||
L --> C
|
|
||||||
C --> M[Speakers]
|
|
||||||
end
|
|
||||||
|
|
||||||
subgraph "Message Types"
|
|
||||||
N[user_audio_chunk]
|
|
||||||
O[agent_response]
|
|
||||||
P[user_transcript]
|
|
||||||
Q[audio_event]
|
|
||||||
R[ping/pong]
|
|
||||||
end
|
|
||||||
|
|
||||||
style B fill:#e1f5fe
|
|
||||||
style C fill:#f3e5f5
|
|
||||||
style D fill:#e8f5e8
|
|
||||||
style H fill:#fff3e0
|
|
||||||
```
|
|
||||||
|
|
||||||
## Quick Start
|
|
||||||
|
|
||||||
### Prerequisites
|
|
||||||
|
|
||||||
- **C++17 compatible compiler**: GCC 11+, Clang 14+, or MSVC 2022+
|
|
||||||
- **CMake** 3.14 or higher
|
|
||||||
- **Dependencies** (install via package manager):
|
|
||||||
|
|
||||||
#### macOS (Homebrew)
|
|
||||||
```bash
|
|
||||||
brew install boost openssl portaudio nlohmann-json cmake pkg-config
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Ubuntu/Debian
|
|
||||||
```bash
|
|
||||||
sudo apt update
|
|
||||||
sudo apt install build-essential cmake pkg-config
|
|
||||||
sudo apt install libboost-system-dev libboost-thread-dev
|
|
||||||
sudo apt install libssl-dev libportaudio2-dev nlohmann-json3-dev
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Windows (vcpkg)
|
|
||||||
```bash
|
|
||||||
vcpkg install boost-system boost-thread openssl portaudio nlohmann-json
|
|
||||||
```
|
|
||||||
|
|
||||||
### Building
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Clone the repository
|
|
||||||
git clone https://github.com/Jitendra2603/elevenlabs-convai-cpp.git
|
|
||||||
cd elevenlabs-convai-cpp
|
|
||||||
|
|
||||||
# Build the project
|
|
||||||
mkdir build && cd build
|
|
||||||
cmake ..
|
|
||||||
cmake --build . --config Release
|
|
||||||
```
|
|
||||||
|
|
||||||
### Running
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Set your agent ID (get this from ElevenLabs dashboard)
|
|
||||||
export AGENT_ID="your-agent-id-here"
|
|
||||||
|
|
||||||
# Run the demo
|
|
||||||
./convai_cpp
|
|
||||||
```
|
|
||||||
|
|
||||||
The application will:
|
|
||||||
1. Connect to your ElevenLabs Conversational AI agent
|
|
||||||
2. Start capturing audio from your default microphone
|
|
||||||
3. Stream audio to the agent and play responses through speakers
|
|
||||||
4. Display conversation transcripts in the terminal
|
|
||||||
5. Continue until you press Enter to quit
|
|
||||||
|
|
||||||
## 📋 Usage Examples
|
|
||||||
|
|
||||||
### Basic Conversation
|
|
||||||
```bash
|
|
||||||
export AGENT_ID="agent_"
|
|
||||||
./convai_cpp
|
|
||||||
# Speak into your microphone and hear the AI agent respond
|
|
||||||
```
|
|
||||||
|
|
||||||
|
|
||||||
## Configuration
|
|
||||||
|
|
||||||
### Audio Settings
|
|
||||||
|
|
||||||
The audio interface is configured for optimal real-time performance:
|
|
||||||
|
|
||||||
- **Sample Rate**: 16 kHz
|
|
||||||
- **Format**: 16-bit PCM mono
|
|
||||||
- **Input Buffer**: 250ms (4000 frames)
|
|
||||||
- **Output Buffer**: 62.5ms (1000 frames)
|
|
||||||
|
|
||||||
### WebSocket Connection
|
|
||||||
|
|
||||||
- **Endpoint**: `wss://api.elevenlabs.io/v1/convai/conversation`
|
|
||||||
- **Protocol**: WebSocket Secure (WSS) with TLS 1.2+
|
|
||||||
- **Authentication**: Optional (required for private agents)
|
|
||||||
|
|
||||||
## Project Structure
|
|
||||||
|
|
||||||
```
|
|
||||||
elevenlabs-convai-cpp/
|
|
||||||
├── CMakeLists.txt # Build configuration
|
|
||||||
├── README.md # This file
|
|
||||||
├── LICENSE # MIT license
|
|
||||||
├── CONTRIBUTING.md # Contribution guidelines
|
|
||||||
├── .gitignore # Git ignore rules
|
|
||||||
├── include/ # Header files
|
|
||||||
│ ├── AudioInterface.hpp # Abstract audio interface
|
|
||||||
│ ├── DefaultAudioInterface.hpp # PortAudio implementation
|
|
||||||
│ └── Conversation.hpp # Main conversation handler
|
|
||||||
└── src/ # Source files
|
|
||||||
├── main.cpp # Demo application
|
|
||||||
├── Conversation.cpp # WebSocket and message handling
|
|
||||||
└── DefaultAudioInterface.cpp # Audio I/O implementation
|
|
||||||
```
|
|
||||||
|
|
||||||
## Technical Details
|
|
||||||
|
|
||||||
### Audio Processing Pipeline
|
|
||||||
|
|
||||||
1. **Capture**: PortAudio captures 16-bit PCM audio at 16kHz
|
|
||||||
2. **Encoding**: Raw audio is base64-encoded for WebSocket transmission
|
|
||||||
3. **Streaming**: Audio chunks sent as `user_audio_chunk` messages
|
|
||||||
4. **Reception**: Server sends `audio_event` messages with agent responses
|
|
||||||
5. **Decoding**: Base64 audio data decoded back to PCM
|
|
||||||
6. **Playback**: Audio queued and played through PortAudio output stream
|
|
||||||
|
|
||||||
### Echo Suppression
|
|
||||||
|
|
||||||
The implementation includes a simple, effective echo suppression mechanism:
|
|
||||||
|
|
||||||
- Microphone input is suppressed during agent speech playback
|
|
||||||
- Prevents acoustic feedback loops that cause the agent to respond to itself
|
|
||||||
- Uses atomic flags for thread-safe coordination between input/output
|
|
||||||
|
|
||||||
### WebSocket Message Handling
|
|
||||||
|
|
||||||
Supported message types:
|
|
||||||
- `conversation_initiation_client_data` - Session initialization
|
|
||||||
- `user_audio_chunk` - Microphone audio data
|
|
||||||
- `audio_event` - Agent speech audio
|
|
||||||
- `agent_response` - Agent text responses
|
|
||||||
- `user_transcript` - Speech-to-text results
|
|
||||||
- `ping`/`pong` - Connection keepalive
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## 📝 License
|
|
||||||
|
|
||||||
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
||||||
@ -1,23 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include <functional>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
class AudioInterface {
|
|
||||||
public:
|
|
||||||
using AudioCallback = std::function<void(const std::vector<char>&)>;
|
|
||||||
|
|
||||||
virtual ~AudioInterface() = default;
|
|
||||||
|
|
||||||
// Starts the audio interface. The callback will be invoked with raw 16-bit PCM mono samples at 16kHz.
|
|
||||||
virtual void start(AudioCallback inputCallback) = 0;
|
|
||||||
|
|
||||||
// Stops audio I/O and releases underlying resources.
|
|
||||||
virtual void stop() = 0;
|
|
||||||
|
|
||||||
// Play audio to the user; audio is 16-bit PCM mono 16kHz.
|
|
||||||
virtual void output(const std::vector<char>& audio) = 0;
|
|
||||||
|
|
||||||
// Immediately stop any buffered / ongoing output.
|
|
||||||
virtual void interrupt() = 0;
|
|
||||||
};
|
|
||||||
@ -1,72 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include "AudioInterface.hpp"
|
|
||||||
#include <boost/beast/core.hpp>
|
|
||||||
#include <boost/beast/websocket.hpp>
|
|
||||||
#include <boost/beast/ssl.hpp>
|
|
||||||
#include <boost/asio.hpp>
|
|
||||||
#include <boost/asio/ssl/stream.hpp>
|
|
||||||
#include <boost/asio/ip/tcp.hpp>
|
|
||||||
#include <nlohmann/json.hpp>
|
|
||||||
|
|
||||||
#include <thread>
|
|
||||||
#include <atomic>
|
|
||||||
#include <functional>
|
|
||||||
|
|
||||||
class Conversation {
|
|
||||||
public:
|
|
||||||
using CallbackAgentResponse = std::function<void(const std::string&)>;
|
|
||||||
using CallbackAgentResponseCorrection = std::function<void(const std::string&, const std::string&)>;
|
|
||||||
using CallbackUserTranscript = std::function<void(const std::string&)>;
|
|
||||||
using CallbackLatencyMeasurement = std::function<void(int)>;
|
|
||||||
|
|
||||||
Conversation(
|
|
||||||
const std::string& agentId,
|
|
||||||
bool requiresAuth,
|
|
||||||
std::shared_ptr<AudioInterface> audioInterface,
|
|
||||||
CallbackAgentResponse callbackAgentResponse = nullptr,
|
|
||||||
CallbackAgentResponseCorrection callbackAgentResponseCorrection = nullptr,
|
|
||||||
CallbackUserTranscript callbackUserTranscript = nullptr,
|
|
||||||
CallbackLatencyMeasurement callbackLatencyMeasurement = nullptr
|
|
||||||
);
|
|
||||||
|
|
||||||
~Conversation();
|
|
||||||
|
|
||||||
void startSession();
|
|
||||||
void endSession();
|
|
||||||
std::string waitForSessionEnd();
|
|
||||||
|
|
||||||
void sendUserMessage(const std::string& text);
|
|
||||||
void registerUserActivity();
|
|
||||||
void sendContextualUpdate(const std::string& content);
|
|
||||||
|
|
||||||
private:
|
|
||||||
void run();
|
|
||||||
void handleMessage(const nlohmann::json& message);
|
|
||||||
std::string getWssUrl() const;
|
|
||||||
|
|
||||||
// networking members
|
|
||||||
boost::asio::io_context ioc_;
|
|
||||||
boost::asio::ssl::context sslCtx_{boost::asio::ssl::context::tlsv12_client};
|
|
||||||
|
|
||||||
using tcp = boost::asio::ip::tcp;
|
|
||||||
using websocket_t = boost::beast::websocket::stream<
|
|
||||||
boost::beast::ssl_stream<tcp::socket>>;
|
|
||||||
std::unique_ptr<websocket_t> ws_;
|
|
||||||
|
|
||||||
// general state
|
|
||||||
std::string agentId_;
|
|
||||||
bool requiresAuth_;
|
|
||||||
std::shared_ptr<AudioInterface> audioInterface_;
|
|
||||||
|
|
||||||
CallbackAgentResponse callbackAgentResponse_;
|
|
||||||
CallbackAgentResponseCorrection callbackAgentResponseCorrection_;
|
|
||||||
CallbackUserTranscript callbackUserTranscript_;
|
|
||||||
CallbackLatencyMeasurement callbackLatencyMeasurement_;
|
|
||||||
|
|
||||||
std::thread workerThread_;
|
|
||||||
std::atomic<bool> shouldStop_{false};
|
|
||||||
std::string conversationId_;
|
|
||||||
|
|
||||||
std::atomic<int> lastInterruptId_{0};
|
|
||||||
};
|
|
||||||
@ -1,45 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include "AudioInterface.hpp"
|
|
||||||
#include <portaudio.h>
|
|
||||||
#include <mutex>
|
|
||||||
#include <condition_variable>
|
|
||||||
#include <queue>
|
|
||||||
#include <thread>
|
|
||||||
#include <atomic>
|
|
||||||
|
|
||||||
class DefaultAudioInterface : public AudioInterface {
|
|
||||||
public:
|
|
||||||
static constexpr int INPUT_FRAMES_PER_BUFFER = 4000; // 250ms @ 16kHz
|
|
||||||
static constexpr int OUTPUT_FRAMES_PER_BUFFER = 1000; // 62.5ms @ 16kHz
|
|
||||||
|
|
||||||
DefaultAudioInterface();
|
|
||||||
~DefaultAudioInterface() override;
|
|
||||||
|
|
||||||
void start(AudioCallback inputCallback) override;
|
|
||||||
void stop() override;
|
|
||||||
void output(const std::vector<char>& audio) override;
|
|
||||||
void interrupt() override;
|
|
||||||
|
|
||||||
private:
|
|
||||||
static int inputCallbackStatic(const void* input, void* output, unsigned long frameCount,
|
|
||||||
const PaStreamCallbackTimeInfo* timeInfo, PaStreamCallbackFlags statusFlags,
|
|
||||||
void* userData);
|
|
||||||
|
|
||||||
int inputCallbackInternal(const void* input, unsigned long frameCount);
|
|
||||||
|
|
||||||
void outputThreadFunc();
|
|
||||||
|
|
||||||
PaStream* inputStream_{};
|
|
||||||
PaStream* outputStream_{};
|
|
||||||
|
|
||||||
AudioCallback inputCallback_;
|
|
||||||
|
|
||||||
std::queue<std::vector<char>> outputQueue_;
|
|
||||||
std::mutex queueMutex_;
|
|
||||||
std::condition_variable queueCv_;
|
|
||||||
|
|
||||||
std::thread outputThread_;
|
|
||||||
std::atomic<bool> shouldStop_{false};
|
|
||||||
std::atomic<bool> outputPlaying_{false};
|
|
||||||
};
|
|
||||||
@ -1,230 +0,0 @@
|
|||||||
#include "Conversation.hpp"
|
|
||||||
|
|
||||||
#include <boost/beast/websocket/ssl.hpp>
|
|
||||||
#include <boost/beast/websocket.hpp>
|
|
||||||
#include <boost/beast/ssl.hpp>
|
|
||||||
#include <boost/beast/core/detail/base64.hpp>
|
|
||||||
#include <boost/asio/connect.hpp>
|
|
||||||
#include <boost/algorithm/string.hpp>
|
|
||||||
#include <iostream>
|
|
||||||
#include <sstream>
|
|
||||||
#include <openssl/ssl.h>
|
|
||||||
|
|
||||||
using tcp = boost::asio::ip::tcp;
|
|
||||||
namespace ssl = boost::asio::ssl;
|
|
||||||
namespace websocket = boost::beast::websocket;
|
|
||||||
namespace beast = boost::beast;
|
|
||||||
|
|
||||||
static std::string base64Encode(const std::vector<char>& data) {
|
|
||||||
auto encodedSize = beast::detail::base64::encoded_size(data.size());
|
|
||||||
std::string out(encodedSize, '\0');
|
|
||||||
beast::detail::base64::encode(&out[0], data.data(), data.size());
|
|
||||||
return out;
|
|
||||||
}
|
|
||||||
|
|
||||||
static std::vector<char> base64Decode(const std::string& str) {
|
|
||||||
auto decodedSize = beast::detail::base64::decoded_size(str.size());
|
|
||||||
std::vector<char> out(decodedSize);
|
|
||||||
auto result = beast::detail::base64::decode(out.data(), str.data(), str.size());
|
|
||||||
out.resize(result.first);
|
|
||||||
return out;
|
|
||||||
}
|
|
||||||
|
|
||||||
static std::string toString(const nlohmann::json& j){
|
|
||||||
if(j.is_string()) return j.get<std::string>();
|
|
||||||
if(j.is_number_integer()) return std::to_string(j.get<int64_t>());
|
|
||||||
return j.dump();
|
|
||||||
}
|
|
||||||
|
|
||||||
Conversation::Conversation(const std::string& agentId, bool requiresAuth,
|
|
||||||
std::shared_ptr<AudioInterface> audioInterface,
|
|
||||||
CallbackAgentResponse callbackAgentResponse,
|
|
||||||
CallbackAgentResponseCorrection callbackAgentResponseCorrection,
|
|
||||||
CallbackUserTranscript callbackUserTranscript,
|
|
||||||
CallbackLatencyMeasurement callbackLatencyMeasurement)
|
|
||||||
: agentId_(agentId),
|
|
||||||
requiresAuth_(requiresAuth),
|
|
||||||
audioInterface_(std::move(audioInterface)),
|
|
||||||
callbackAgentResponse_(std::move(callbackAgentResponse)),
|
|
||||||
callbackAgentResponseCorrection_(std::move(callbackAgentResponseCorrection)),
|
|
||||||
callbackUserTranscript_(std::move(callbackUserTranscript)),
|
|
||||||
callbackLatencyMeasurement_(std::move(callbackLatencyMeasurement)) {
|
|
||||||
|
|
||||||
sslCtx_.set_default_verify_paths();
|
|
||||||
}
|
|
||||||
|
|
||||||
Conversation::~Conversation() {
|
|
||||||
endSession();
|
|
||||||
}
|
|
||||||
|
|
||||||
void Conversation::startSession() {
|
|
||||||
shouldStop_.store(false);
|
|
||||||
workerThread_ = std::thread(&Conversation::run, this);
|
|
||||||
}
|
|
||||||
|
|
||||||
void Conversation::endSession() {
|
|
||||||
shouldStop_.store(true);
|
|
||||||
if (ws_) {
|
|
||||||
beast::error_code ec;
|
|
||||||
ws_->close(websocket::close_code::normal, ec);
|
|
||||||
}
|
|
||||||
if (audioInterface_) {
|
|
||||||
audioInterface_->stop();
|
|
||||||
}
|
|
||||||
if (workerThread_.joinable()) {
|
|
||||||
workerThread_.join();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string Conversation::waitForSessionEnd() {
|
|
||||||
if (workerThread_.joinable()) {
|
|
||||||
workerThread_.join();
|
|
||||||
}
|
|
||||||
return conversationId_;
|
|
||||||
}
|
|
||||||
|
|
||||||
void Conversation::sendUserMessage(const std::string& text) {
|
|
||||||
if (!ws_) {
|
|
||||||
throw std::runtime_error("Session not started");
|
|
||||||
}
|
|
||||||
nlohmann::json j = {
|
|
||||||
{"type", "user_message"},
|
|
||||||
{"text", text}
|
|
||||||
};
|
|
||||||
ws_->write(boost::asio::buffer(j.dump()));
|
|
||||||
}
|
|
||||||
|
|
||||||
void Conversation::registerUserActivity() {
|
|
||||||
if (!ws_) throw std::runtime_error("Session not started");
|
|
||||||
nlohmann::json j = {{"type", "user_activity"}};
|
|
||||||
ws_->write(boost::asio::buffer(j.dump()));
|
|
||||||
}
|
|
||||||
|
|
||||||
void Conversation::sendContextualUpdate(const std::string& content) {
|
|
||||||
if (!ws_) throw std::runtime_error("Session not started");
|
|
||||||
nlohmann::json j = {{"type", "contextual_update"}, {"content", content}};
|
|
||||||
ws_->write(boost::asio::buffer(j.dump()));
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string Conversation::getWssUrl() const {
|
|
||||||
// Hard-coded base env for demo; in production you'd call ElevenLabs env endpoint.
|
|
||||||
std::ostringstream oss;
|
|
||||||
oss << "wss://api.elevenlabs.io/v1/convai/conversation?agent_id=" << agentId_;
|
|
||||||
return oss.str();
|
|
||||||
}
|
|
||||||
|
|
||||||
void Conversation::run() {
|
|
||||||
try {
|
|
||||||
auto url = getWssUrl();
|
|
||||||
std::string protocol, host, target;
|
|
||||||
unsigned short port = 443;
|
|
||||||
|
|
||||||
// Very naive parse: wss://host[:port]/path?query
|
|
||||||
if (boost::starts_with(url, "wss://")) {
|
|
||||||
protocol = "wss";
|
|
||||||
host = url.substr(6);
|
|
||||||
} else {
|
|
||||||
throw std::runtime_error("Only wss:// URLs supported in this demo");
|
|
||||||
}
|
|
||||||
auto slashPos = host.find('/');
|
|
||||||
if (slashPos == std::string::npos) {
|
|
||||||
target = "/";
|
|
||||||
} else {
|
|
||||||
target = host.substr(slashPos);
|
|
||||||
host = host.substr(0, slashPos);
|
|
||||||
}
|
|
||||||
auto colonPos = host.find(':');
|
|
||||||
if (colonPos != std::string::npos) {
|
|
||||||
port = static_cast<unsigned short>(std::stoi(host.substr(colonPos + 1)));
|
|
||||||
host = host.substr(0, colonPos);
|
|
||||||
}
|
|
||||||
|
|
||||||
tcp::resolver resolver(ioc_);
|
|
||||||
auto const results = resolver.resolve(host, std::to_string(port));
|
|
||||||
|
|
||||||
beast::ssl_stream<tcp::socket> stream(ioc_, sslCtx_);
|
|
||||||
boost::asio::connect(beast::get_lowest_layer(stream), results);
|
|
||||||
if (!SSL_set_tlsext_host_name(stream.native_handle(), host.c_str())) {
|
|
||||||
throw std::runtime_error("Failed to set SNI hostname on SSL stream");
|
|
||||||
}
|
|
||||||
stream.handshake(ssl::stream_base::client);
|
|
||||||
|
|
||||||
ws_ = std::make_unique<websocket_t>(std::move(stream));
|
|
||||||
ws_->set_option(websocket::stream_base::timeout::suggested(beast::role_type::client));
|
|
||||||
ws_->handshake(host, target);
|
|
||||||
|
|
||||||
// send initiation data
|
|
||||||
nlohmann::json init = {
|
|
||||||
{"type", "conversation_initiation_client_data"},
|
|
||||||
{"custom_llm_extra_body", nlohmann::json::object()},
|
|
||||||
{"conversation_config_override", nlohmann::json::object()},
|
|
||||||
{"dynamic_variables", nlohmann::json::object()}
|
|
||||||
};
|
|
||||||
ws_->write(boost::asio::buffer(init.dump()));
|
|
||||||
|
|
||||||
// Prepare audio callback
|
|
||||||
auto inputCb = [this](const std::vector<char>& audio) {
|
|
||||||
nlohmann::json msg = {
|
|
||||||
{"user_audio_chunk", base64Encode(audio)}
|
|
||||||
};
|
|
||||||
ws_->write(boost::asio::buffer(msg.dump()));
|
|
||||||
};
|
|
||||||
audioInterface_->start(inputCb);
|
|
||||||
|
|
||||||
beast::flat_buffer buffer;
|
|
||||||
while (!shouldStop_.load()) {
|
|
||||||
beast::error_code ec;
|
|
||||||
ws_->read(buffer, ec);
|
|
||||||
if (ec) {
|
|
||||||
std::cerr << "Websocket read error: " << ec.message() << std::endl;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
auto text = beast::buffers_to_string(buffer.data());
|
|
||||||
buffer.consume(buffer.size());
|
|
||||||
try {
|
|
||||||
auto message = nlohmann::json::parse(text);
|
|
||||||
handleMessage(message);
|
|
||||||
} catch (const std::exception& ex) {
|
|
||||||
std::cerr << "JSON parse error: " << ex.what() << std::endl;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (const std::exception& ex) {
|
|
||||||
std::cerr << "Conversation error: " << ex.what() << std::endl;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void Conversation::handleMessage(const nlohmann::json& message) {
|
|
||||||
std::string type = message.value("type", "");
|
|
||||||
if (type == "conversation_initiation_metadata") {
|
|
||||||
conversationId_ = message["conversation_initiation_metadata_event"]["conversation_id"].get<std::string>();
|
|
||||||
} else if (type == "audio") {
|
|
||||||
auto event = message["audio_event"];
|
|
||||||
int eventId = std::stoi(toString(event["event_id"]));
|
|
||||||
if (eventId <= lastInterruptId_.load()) return;
|
|
||||||
auto audioBytes = base64Decode(event["audio_base_64"].get<std::string>());
|
|
||||||
audioInterface_->output(audioBytes);
|
|
||||||
} else if (type == "agent_response" && callbackAgentResponse_) {
|
|
||||||
auto event = message["agent_response_event"];
|
|
||||||
callbackAgentResponse_(event["agent_response"].get<std::string>());
|
|
||||||
} else if (type == "agent_response_correction" && callbackAgentResponseCorrection_) {
|
|
||||||
auto event = message["agent_response_correction_event"];
|
|
||||||
callbackAgentResponseCorrection_(event["original_agent_response"].get<std::string>(),
|
|
||||||
event["corrected_agent_response"].get<std::string>());
|
|
||||||
} else if (type == "user_transcript" && callbackUserTranscript_) {
|
|
||||||
auto event = message["user_transcription_event"];
|
|
||||||
callbackUserTranscript_(event["user_transcript"].get<std::string>());
|
|
||||||
} else if (type == "interruption") {
|
|
||||||
auto event = message["interruption_event"];
|
|
||||||
lastInterruptId_.store(std::stoi(toString(event["event_id"])));
|
|
||||||
audioInterface_->interrupt();
|
|
||||||
} else if (type == "ping") {
|
|
||||||
auto event = message["ping_event"];
|
|
||||||
nlohmann::json pong = {{"type", "pong"}, {"event_id", event["event_id"]}};
|
|
||||||
ws_->write(boost::asio::buffer(pong.dump()));
|
|
||||||
if (callbackLatencyMeasurement_ && event.contains("ping_ms")) {
|
|
||||||
int latency = event["ping_ms"].is_number() ? event["ping_ms"].get<int>() : std::stoi(event["ping_ms"].get<std::string>());
|
|
||||||
callbackLatencyMeasurement_(latency);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Note: client tool call handling omitted for brevity.
|
|
||||||
}
|
|
||||||
@ -1,131 +0,0 @@
|
|||||||
#include "DefaultAudioInterface.hpp"
|
|
||||||
|
|
||||||
#include <cstring>
|
|
||||||
#include <iostream>
|
|
||||||
|
|
||||||
DefaultAudioInterface::DefaultAudioInterface() {
|
|
||||||
PaError err = Pa_Initialize();
|
|
||||||
if (err != paNoError) {
|
|
||||||
throw std::runtime_error("PortAudio initialization failed");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
DefaultAudioInterface::~DefaultAudioInterface() {
|
|
||||||
if (!shouldStop_.load()) {
|
|
||||||
stop();
|
|
||||||
}
|
|
||||||
Pa_Terminate();
|
|
||||||
}
|
|
||||||
|
|
||||||
void DefaultAudioInterface::start(AudioCallback inputCallback) {
|
|
||||||
inputCallback_ = std::move(inputCallback);
|
|
||||||
PaStreamParameters inputParams;
|
|
||||||
std::memset(&inputParams, 0, sizeof(inputParams));
|
|
||||||
inputParams.channelCount = 1;
|
|
||||||
inputParams.device = Pa_GetDefaultInputDevice();
|
|
||||||
inputParams.sampleFormat = paInt16;
|
|
||||||
inputParams.suggestedLatency = Pa_GetDeviceInfo(inputParams.device)->defaultLowInputLatency;
|
|
||||||
inputParams.hostApiSpecificStreamInfo = nullptr;
|
|
||||||
|
|
||||||
PaStreamParameters outputParams;
|
|
||||||
std::memset(&outputParams, 0, sizeof(outputParams));
|
|
||||||
outputParams.channelCount = 1;
|
|
||||||
outputParams.device = Pa_GetDefaultOutputDevice();
|
|
||||||
outputParams.sampleFormat = paInt16;
|
|
||||||
outputParams.suggestedLatency = Pa_GetDeviceInfo(outputParams.device)->defaultLowOutputLatency;
|
|
||||||
outputParams.hostApiSpecificStreamInfo = nullptr;
|
|
||||||
|
|
||||||
PaError err = Pa_OpenStream(&inputStream_, &inputParams, nullptr, 16000, INPUT_FRAMES_PER_BUFFER, paClipOff,
|
|
||||||
&DefaultAudioInterface::inputCallbackStatic, this);
|
|
||||||
if (err != paNoError) {
|
|
||||||
throw std::runtime_error("Failed to open input stream");
|
|
||||||
}
|
|
||||||
|
|
||||||
err = Pa_OpenStream(&outputStream_, nullptr, &outputParams, 16000, OUTPUT_FRAMES_PER_BUFFER, paClipOff, nullptr, nullptr);
|
|
||||||
if (err != paNoError) {
|
|
||||||
throw std::runtime_error("Failed to open output stream");
|
|
||||||
}
|
|
||||||
|
|
||||||
if ((err = Pa_StartStream(inputStream_)) != paNoError) {
|
|
||||||
throw std::runtime_error("Failed to start input stream");
|
|
||||||
}
|
|
||||||
if ((err = Pa_StartStream(outputStream_)) != paNoError) {
|
|
||||||
throw std::runtime_error("Failed to start output stream");
|
|
||||||
}
|
|
||||||
|
|
||||||
shouldStop_.store(false);
|
|
||||||
outputThread_ = std::thread(&DefaultAudioInterface::outputThreadFunc, this);
|
|
||||||
}
|
|
||||||
|
|
||||||
void DefaultAudioInterface::stop() {
|
|
||||||
shouldStop_.store(true);
|
|
||||||
queueCv_.notify_all();
|
|
||||||
if (outputThread_.joinable()) {
|
|
||||||
outputThread_.join();
|
|
||||||
}
|
|
||||||
|
|
||||||
if (inputStream_) {
|
|
||||||
Pa_StopStream(inputStream_);
|
|
||||||
Pa_CloseStream(inputStream_);
|
|
||||||
inputStream_ = nullptr;
|
|
||||||
}
|
|
||||||
if (outputStream_) {
|
|
||||||
Pa_StopStream(outputStream_);
|
|
||||||
Pa_CloseStream(outputStream_);
|
|
||||||
outputStream_ = nullptr;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void DefaultAudioInterface::output(const std::vector<char>& audio) {
|
|
||||||
{
|
|
||||||
std::lock_guard<std::mutex> lg(queueMutex_);
|
|
||||||
outputQueue_.emplace(audio);
|
|
||||||
}
|
|
||||||
queueCv_.notify_one();
|
|
||||||
}
|
|
||||||
|
|
||||||
void DefaultAudioInterface::interrupt() {
|
|
||||||
std::lock_guard<std::mutex> lg(queueMutex_);
|
|
||||||
std::queue<std::vector<char>> empty;
|
|
||||||
std::swap(outputQueue_, empty);
|
|
||||||
}
|
|
||||||
|
|
||||||
int DefaultAudioInterface::inputCallbackStatic(const void* input, void* /*output*/, unsigned long frameCount,
|
|
||||||
const PaStreamCallbackTimeInfo* /*timeInfo*/, PaStreamCallbackFlags /*statusFlags*/,
|
|
||||||
void* userData) {
|
|
||||||
auto* self = static_cast<DefaultAudioInterface*>(userData);
|
|
||||||
return self->inputCallbackInternal(input, frameCount);
|
|
||||||
}
|
|
||||||
|
|
||||||
int DefaultAudioInterface::inputCallbackInternal(const void* input, unsigned long frameCount) {
|
|
||||||
if (!input || !inputCallback_) {
|
|
||||||
return paContinue;
|
|
||||||
}
|
|
||||||
if (outputPlaying_.load()) {
|
|
||||||
// Suppress microphone input while playing output to avoid echo feedback.
|
|
||||||
return paContinue;
|
|
||||||
}
|
|
||||||
const size_t bytes = frameCount * sizeof(int16_t);
|
|
||||||
std::vector<char> buffer(bytes);
|
|
||||||
std::memcpy(buffer.data(), input, bytes);
|
|
||||||
inputCallback_(buffer);
|
|
||||||
return paContinue;
|
|
||||||
}
|
|
||||||
|
|
||||||
void DefaultAudioInterface::outputThreadFunc() {
|
|
||||||
while (!shouldStop_.load()) {
|
|
||||||
std::vector<char> audio;
|
|
||||||
{
|
|
||||||
std::unique_lock<std::mutex> lk(queueMutex_);
|
|
||||||
queueCv_.wait(lk, [this] { return shouldStop_.load() || !outputQueue_.empty(); });
|
|
||||||
if (shouldStop_.load()) break;
|
|
||||||
audio = std::move(outputQueue_.front());
|
|
||||||
outputQueue_.pop();
|
|
||||||
}
|
|
||||||
if (!audio.empty() && outputStream_) {
|
|
||||||
outputPlaying_.store(true);
|
|
||||||
Pa_WriteStream(outputStream_, audio.data(), audio.size() / sizeof(int16_t));
|
|
||||||
outputPlaying_.store(false);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@ -1,31 +0,0 @@
|
|||||||
#include "Conversation.hpp"
|
|
||||||
#include "DefaultAudioInterface.hpp"
|
|
||||||
|
|
||||||
#include <cstdlib>
|
|
||||||
#include <iostream>
|
|
||||||
#include <memory>
|
|
||||||
|
|
||||||
int main() {
|
|
||||||
const char* agentIdEnv = std::getenv("AGENT_ID");
|
|
||||||
if (!agentIdEnv) {
|
|
||||||
std::cerr << "AGENT_ID environment variable must be set" << std::endl;
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
std::string agentId(agentIdEnv);
|
|
||||||
|
|
||||||
auto audioInterface = std::make_shared<DefaultAudioInterface>();
|
|
||||||
Conversation conv(agentId, /*requiresAuth*/ false, audioInterface,
|
|
||||||
[](const std::string& resp) { std::cout << "Agent: " << resp << std::endl; },
|
|
||||||
[](const std::string& orig, const std::string& corrected) {
|
|
||||||
std::cout << "Agent correction: " << orig << " -> " << corrected << std::endl; },
|
|
||||||
[](const std::string& transcript) { std::cout << "User: " << transcript << std::endl; });
|
|
||||||
|
|
||||||
conv.startSession();
|
|
||||||
|
|
||||||
std::cout << "Press Enter to quit..." << std::endl;
|
|
||||||
std::cin.get();
|
|
||||||
conv.endSession();
|
|
||||||
auto convId = conv.waitForSessionEnd();
|
|
||||||
std::cout << "Conversation ID: " << convId << std::endl;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
Binary file not shown.
@ -158,8 +158,6 @@ void UElevenLabsConversationalAgentComponent::StartConversation()
|
|||||||
&UElevenLabsConversationalAgentComponent::HandleInterrupted);
|
&UElevenLabsConversationalAgentComponent::HandleInterrupted);
|
||||||
WebSocketProxy->OnAgentResponseStarted.AddDynamic(this,
|
WebSocketProxy->OnAgentResponseStarted.AddDynamic(this,
|
||||||
&UElevenLabsConversationalAgentComponent::HandleAgentResponseStarted);
|
&UElevenLabsConversationalAgentComponent::HandleAgentResponseStarted);
|
||||||
WebSocketProxy->OnAgentResponsePart.AddDynamic(this,
|
|
||||||
&UElevenLabsConversationalAgentComponent::HandleAgentResponsePart);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Pass configuration to the proxy before connecting.
|
// Pass configuration to the proxy before connecting.
|
||||||
@ -268,23 +266,7 @@ void UElevenLabsConversationalAgentComponent::StopListening()
|
|||||||
// Flush any partially-accumulated mic audio before signalling end-of-turn.
|
// Flush any partially-accumulated mic audio before signalling end-of-turn.
|
||||||
// This ensures the final words aren't discarded just because the last callback
|
// This ensures the final words aren't discarded just because the last callback
|
||||||
// didn't push the buffer over the MicChunkMinBytes threshold.
|
// didn't push the buffer over the MicChunkMinBytes threshold.
|
||||||
//
|
if (MicAccumulationBuffer.Num() > 0 && WebSocketProxy && IsConnected())
|
||||||
// EXCEPT during collision avoidance: bAgentGenerating is already true when
|
|
||||||
// HandleAgentResponseStarted calls StopListening (it sets the flag before calling us).
|
|
||||||
// Flushing audio to a server that is mid-generation can cause it to re-enter
|
|
||||||
// "user speaking" state and stall waiting for more audio that never arrives,
|
|
||||||
// leaving both sides stuck — no audio for the collision response and no response
|
|
||||||
// for subsequent turns.
|
|
||||||
if (bAgentGenerating)
|
|
||||||
{
|
|
||||||
if (MicAccumulationBuffer.Num() > 0)
|
|
||||||
{
|
|
||||||
UE_LOG(LogElevenLabsAgent, Log,
|
|
||||||
TEXT("StopListening: discarding %d bytes of accumulated mic audio (collision — server is mid-generation)."),
|
|
||||||
MicAccumulationBuffer.Num());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else if (MicAccumulationBuffer.Num() > 0 && WebSocketProxy && IsConnected())
|
|
||||||
{
|
{
|
||||||
WebSocketProxy->SendAudioChunk(MicAccumulationBuffer);
|
WebSocketProxy->SendAudioChunk(MicAccumulationBuffer);
|
||||||
}
|
}
|
||||||
@ -441,8 +423,7 @@ void UElevenLabsConversationalAgentComponent::HandleAgentResponseStarted()
|
|||||||
{
|
{
|
||||||
// The server has started generating a response (first agent_chat_response_part).
|
// The server has started generating a response (first agent_chat_response_part).
|
||||||
// Set bAgentGenerating BEFORE StopListening so that any StartListening call
|
// Set bAgentGenerating BEFORE StopListening so that any StartListening call
|
||||||
// triggered by the Blueprint's OnAgentStartedGenerating handler is blocked,
|
// triggered by the Blueprint's OnAgentStartedGenerating handler is blocked.
|
||||||
// and so that StopListening knows to skip the mic buffer flush (collision path).
|
|
||||||
bAgentGenerating = true;
|
bAgentGenerating = true;
|
||||||
bWaitingForAgentResponse = false; // Server is generating — response timeout cancelled.
|
bWaitingForAgentResponse = false; // Server is generating — response timeout cancelled.
|
||||||
|
|
||||||
@ -452,37 +433,21 @@ void UElevenLabsConversationalAgentComponent::HandleAgentResponseStarted()
|
|||||||
if (bIsListening)
|
if (bIsListening)
|
||||||
{
|
{
|
||||||
// Collision: server started generating Turn N's response while Turn M (M>N) mic was open.
|
// Collision: server started generating Turn N's response while Turn M (M>N) mic was open.
|
||||||
// The server's VAD detected a pause in the user's speech and started generating
|
// Log both turn indices so the timeline is unambiguous.
|
||||||
// prematurely — the user hasn't finished speaking yet.
|
|
||||||
//
|
|
||||||
// Stop the mic WITHOUT flushing the accumulated audio buffer (see StopListening's
|
|
||||||
// bAgentGenerating guard). Flushing would send audio to a server that is mid-generation,
|
|
||||||
// causing it to re-enter "user speaking" state and stall — both sides stuck.
|
|
||||||
//
|
|
||||||
// Do NOT send an interrupt here — just let the server's response play out:
|
|
||||||
// - If audio arrives → EnqueueAgentAudio sets bAgentSpeaking, response plays normally.
|
|
||||||
// - If audio never arrives → generating timeout (10s) clears bAgentGenerating.
|
|
||||||
// Either way the state machine recovers and Blueprint can reopen the mic.
|
|
||||||
UE_LOG(LogElevenLabsAgent, Log,
|
UE_LOG(LogElevenLabsAgent, Log,
|
||||||
TEXT("[T+%.2fs] [Turn %d → Turn %d collision] Agent generating Turn %d response — mic (Turn %d) was open, stopping. (%.2fs after turn end)"),
|
TEXT("[T+%.2fs] [Turn %d → Turn %d collision] Agent generating Turn %d response — mic (Turn %d) was open, stopping. (%.2fs after turn end)"),
|
||||||
T, LastClosedTurnIndex, TurnIndex, LastClosedTurnIndex, TurnIndex, LatencyFromTurnEnd);
|
T, LastClosedTurnIndex, TurnIndex, LastClosedTurnIndex, TurnIndex, LatencyFromTurnEnd);
|
||||||
StopListening();
|
StopListening();
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
UE_LOG(LogElevenLabsAgent, Log,
|
UE_LOG(LogElevenLabsAgent, Log,
|
||||||
TEXT("[T+%.2fs] [Turn %d] Agent generating. (%.2fs after turn end)"),
|
TEXT("[T+%.2fs] [Turn %d] Agent generating. (%.2fs after turn end)"),
|
||||||
T, LastClosedTurnIndex, LatencyFromTurnEnd);
|
T, LastClosedTurnIndex, LatencyFromTurnEnd);
|
||||||
|
}
|
||||||
OnAgentStartedGenerating.Broadcast();
|
OnAgentStartedGenerating.Broadcast();
|
||||||
}
|
}
|
||||||
|
|
||||||
void UElevenLabsConversationalAgentComponent::HandleAgentResponsePart(const FString& PartialText)
|
|
||||||
{
|
|
||||||
if (bEnableAgentPartialResponse)
|
|
||||||
{
|
|
||||||
OnAgentPartialResponse.Broadcast(PartialText);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ─────────────────────────────────────────────────────────────────────────────
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
// Audio playback
|
// Audio playback
|
||||||
// ─────────────────────────────────────────────────────────────────────────────
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
@ -604,15 +569,9 @@ void UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured(const TAr
|
|||||||
{
|
{
|
||||||
if (!IsConnected() || !bIsListening) return;
|
if (!IsConnected() || !bIsListening) return;
|
||||||
|
|
||||||
// Echo suppression: skip sending mic audio while the agent is speaking.
|
|
||||||
// This prevents the agent from hearing its own voice through the speakers,
|
|
||||||
// which would confuse the server's VAD and STT. Matches the approach used
|
|
||||||
// in the official ElevenLabs C++ SDK (outputPlaying_ flag).
|
|
||||||
if (bAgentSpeaking) return;
|
|
||||||
|
|
||||||
// Convert this callback's samples to int16 bytes and accumulate.
|
// Convert this callback's samples to int16 bytes and accumulate.
|
||||||
// WASAPI fires every ~5ms (158 bytes at 16kHz). ElevenLabs needs ≥250ms
|
// WASAPI fires every ~5ms (158 bytes at 16kHz). ElevenLabs needs ≥100ms
|
||||||
// (8000 bytes) per chunk for reliable VAD and STT. We hold bytes here
|
// (3200 bytes) per chunk for reliable VAD and STT. We hold bytes here
|
||||||
// until we have enough, then send the whole batch in one WebSocket frame.
|
// until we have enough, then send the whole batch in one WebSocket frame.
|
||||||
TArray<uint8> PCMBytes = FloatPCMToInt16Bytes(FloatPCM);
|
TArray<uint8> PCMBytes = FloatPCMToInt16Bytes(FloatPCM);
|
||||||
MicAccumulationBuffer.Append(PCMBytes);
|
MicAccumulationBuffer.Append(PCMBytes);
|
||||||
|
|||||||
@ -158,6 +158,20 @@ void UElevenLabsWebSocketProxy::SendUserTurnEnd()
|
|||||||
// in a loop: part arrives → event → StopListening → SendUserTurnEnd → flag reset → part arrives → loop.
|
// in a loop: part arrives → event → StopListening → SendUserTurnEnd → flag reset → part arrives → loop.
|
||||||
// The flag is only reset in SendUserTurnStart() at the beginning of a new user turn.
|
// The flag is only reset in SendUserTurnStart() at the beginning of a new user turn.
|
||||||
|
|
||||||
|
// Clear the interrupt-ignore flag if it was never cleared by an "interruption" server ack.
|
||||||
|
// The ElevenLabs server does not always send the "interruption" acknowledgement reliably.
|
||||||
|
// By the time the user has spoken a full new turn (seconds of audio), any in-flight content
|
||||||
|
// from the previously interrupted generation has long since arrived — it is safe to resume
|
||||||
|
// normal content processing so the server's response to this new turn is not silently discarded.
|
||||||
|
if (bIgnoreIncomingContent)
|
||||||
|
{
|
||||||
|
bIgnoreIncomingContent = false;
|
||||||
|
const double T = UserTurnEndTime - SessionStartTime;
|
||||||
|
UE_LOG(LogElevenLabsWS, Log,
|
||||||
|
TEXT("[T+%.2fs] Cleared interrupt-ignore flag at turn end (server 'interruption' ack was not received — resuming content processing)."),
|
||||||
|
T);
|
||||||
|
}
|
||||||
|
|
||||||
const double T = UserTurnEndTime - SessionStartTime;
|
const double T = UserTurnEndTime - SessionStartTime;
|
||||||
UE_LOG(LogElevenLabsWS, Log, TEXT("[T+%.2fs] User turn ended — server VAD silence detection started (turn_timeout=1s)."), T);
|
UE_LOG(LogElevenLabsWS, Log, TEXT("[T+%.2fs] User turn ended — server VAD silence detection started (turn_timeout=1s)."), T);
|
||||||
}
|
}
|
||||||
@ -182,7 +196,12 @@ void UElevenLabsWebSocketProxy::SendInterrupt()
|
|||||||
{
|
{
|
||||||
if (!IsConnected()) return;
|
if (!IsConnected()) return;
|
||||||
|
|
||||||
UE_LOG(LogElevenLabsWS, Log, TEXT("Sending interrupt."));
|
// Immediately start discarding in-flight audio and chat response parts from
|
||||||
|
// the generation we are about to interrupt. The server may still send several
|
||||||
|
// frames before it processes our interrupt. We stop ignoring once the server
|
||||||
|
// sends its "interruption" acknowledgement (HandleInterruption).
|
||||||
|
bIgnoreIncomingContent = true;
|
||||||
|
UE_LOG(LogElevenLabsWS, Log, TEXT("Sending interrupt — ignoring incoming content until server acks."));
|
||||||
|
|
||||||
TSharedPtr<FJsonObject> Msg = MakeShareable(new FJsonObject());
|
TSharedPtr<FJsonObject> Msg = MakeShareable(new FJsonObject());
|
||||||
Msg->SetStringField(TEXT("type"), ElevenLabsMessageType::Interrupt);
|
Msg->SetStringField(TEXT("type"), ElevenLabsMessageType::Interrupt);
|
||||||
@ -381,7 +400,7 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message)
|
|||||||
}
|
}
|
||||||
else if (MsgType == ElevenLabsMessageType::AgentChatResponsePart)
|
else if (MsgType == ElevenLabsMessageType::AgentChatResponsePart)
|
||||||
{
|
{
|
||||||
HandleAgentChatResponsePart(Root);
|
HandleAgentChatResponsePart();
|
||||||
}
|
}
|
||||||
else if (MsgType == ElevenLabsMessageType::AgentResponseCorrection)
|
else if (MsgType == ElevenLabsMessageType::AgentResponseCorrection)
|
||||||
{
|
{
|
||||||
@ -448,10 +467,18 @@ void UElevenLabsWebSocketProxy::OnWsBinaryMessage(const void* Data, SIZE_T Size,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Broadcast raw PCM bytes directly to the audio queue.
|
// Broadcast raw PCM bytes directly to the audio queue.
|
||||||
|
// Discard if we are waiting for an interruption ack (same logic as HandleAudioResponse).
|
||||||
TArray<uint8> PCMData = MoveTemp(BinaryFrameBuffer);
|
TArray<uint8> PCMData = MoveTemp(BinaryFrameBuffer);
|
||||||
BinaryFrameBuffer.Reset();
|
BinaryFrameBuffer.Reset();
|
||||||
|
if (!bIgnoreIncomingContent)
|
||||||
|
{
|
||||||
OnAudioReceived.Broadcast(PCMData);
|
OnAudioReceived.Broadcast(PCMData);
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
UE_LOG(LogElevenLabsWS, Verbose, TEXT("Discarding binary audio frame (interrupt pending server ack)."));
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ─────────────────────────────────────────────────────────────────────────────
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
@ -480,6 +507,15 @@ void UElevenLabsWebSocketProxy::HandleConversationInitiation(const TSharedPtr<FJ
|
|||||||
|
|
||||||
void UElevenLabsWebSocketProxy::HandleAudioResponse(const TSharedPtr<FJsonObject>& Root)
|
void UElevenLabsWebSocketProxy::HandleAudioResponse(const TSharedPtr<FJsonObject>& Root)
|
||||||
{
|
{
|
||||||
|
// Discard audio that belongs to an interrupted generation.
|
||||||
|
// The server may send several more audio frames after we sent "interrupt" —
|
||||||
|
// they must not restart the speaking state on the client side.
|
||||||
|
if (bIgnoreIncomingContent)
|
||||||
|
{
|
||||||
|
UE_LOG(LogElevenLabsWS, Verbose, TEXT("Discarding audio frame (interrupt pending server ack)."));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
// Expected structure:
|
// Expected structure:
|
||||||
// { "type": "audio",
|
// { "type": "audio",
|
||||||
// "audio_event": { "audio_base_64": "<base64 PCM>", "event_id": 1 }
|
// "audio_event": { "audio_base_64": "<base64 PCM>", "event_id": 1 }
|
||||||
@ -491,17 +527,6 @@ void UElevenLabsWebSocketProxy::HandleAudioResponse(const TSharedPtr<FJsonObject
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Discard audio belonging to an interrupted generation (event_id approach).
|
|
||||||
// Matches the official ElevenLabs C++ and Python SDKs: only AUDIO is filtered
|
|
||||||
// by event_id — transcripts, agent_response, etc. are always processed.
|
|
||||||
int32 EventId = 0;
|
|
||||||
(*AudioEvent)->TryGetNumberField(TEXT("event_id"), EventId);
|
|
||||||
if (EventId > 0 && EventId <= LastInterruptEventId)
|
|
||||||
{
|
|
||||||
UE_LOG(LogElevenLabsWS, Verbose, TEXT("Discarding audio event_id=%d (interrupted at %d)."), EventId, LastInterruptEventId);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
FString Base64Audio;
|
FString Base64Audio;
|
||||||
if (!(*AudioEvent)->TryGetStringField(TEXT("audio_base_64"), Base64Audio))
|
if (!(*AudioEvent)->TryGetStringField(TEXT("audio_base_64"), Base64Audio))
|
||||||
{
|
{
|
||||||
@ -544,6 +569,16 @@ void UElevenLabsWebSocketProxy::HandleTranscript(const TSharedPtr<FJsonObject>&
|
|||||||
|
|
||||||
void UElevenLabsWebSocketProxy::HandleAgentResponse(const TSharedPtr<FJsonObject>& Root)
|
void UElevenLabsWebSocketProxy::HandleAgentResponse(const TSharedPtr<FJsonObject>& Root)
|
||||||
{
|
{
|
||||||
|
// ISSUE-19: discard agent_response that belongs to an interrupted generation.
|
||||||
|
// A stale agent_response from the cancelled turn would set bAgentResponseReceived=true
|
||||||
|
// on the component, allowing the silence-detection Tick to fire OnAgentStoppedSpeaking
|
||||||
|
// at the wrong time (no audio is currently playing for the new turn yet).
|
||||||
|
if (bIgnoreIncomingContent)
|
||||||
|
{
|
||||||
|
UE_LOG(LogElevenLabsWS, Verbose, TEXT("Discarding agent_response (interrupt pending server ack)."));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
// ISSUE-22: reset bAgentResponseStartedFired so OnAgentResponseStarted fires again on
|
// ISSUE-22: reset bAgentResponseStartedFired so OnAgentResponseStarted fires again on
|
||||||
// the next turn. In Server VAD mode SendUserTurnStart() is never called — it is the only
|
// the next turn. In Server VAD mode SendUserTurnStart() is never called — it is the only
|
||||||
// other place that resets this flag — so without this reset, OnAgentResponseStarted fires
|
// other place that resets this flag — so without this reset, OnAgentResponseStarted fires
|
||||||
@ -567,8 +602,18 @@ void UElevenLabsWebSocketProxy::HandleAgentResponse(const TSharedPtr<FJsonObject
|
|||||||
OnAgentResponse.Broadcast(ResponseText);
|
OnAgentResponse.Broadcast(ResponseText);
|
||||||
}
|
}
|
||||||
|
|
||||||
void UElevenLabsWebSocketProxy::HandleAgentChatResponsePart(const TSharedPtr<FJsonObject>& Root)
|
void UElevenLabsWebSocketProxy::HandleAgentChatResponsePart()
|
||||||
{
|
{
|
||||||
|
// Ignore response parts that belong to a generation we have already interrupted.
|
||||||
|
// Without this guard, old parts arriving after SendInterrupt() would re-trigger
|
||||||
|
// OnAgentResponseStarted (bAgentResponseStartedFired was reset in SendUserTurnStart),
|
||||||
|
// causing the component to stop the newly-opened microphone — creating an infinite loop.
|
||||||
|
if (bIgnoreIncomingContent)
|
||||||
|
{
|
||||||
|
UE_LOG(LogElevenLabsWS, Verbose, TEXT("Discarding agent_chat_response_part (interrupt pending server ack)."));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
// agent_chat_response_part = the server is actively generating a response (LLM token stream).
|
// agent_chat_response_part = the server is actively generating a response (LLM token stream).
|
||||||
// Fire OnAgentResponseStarted once per turn so the component can auto-stop the microphone
|
// Fire OnAgentResponseStarted once per turn so the component can auto-stop the microphone
|
||||||
// if the Blueprint restarted listening before the server finished processing the previous turn.
|
// if the Blueprint restarted listening before the server finished processing the previous turn.
|
||||||
@ -583,39 +628,15 @@ void UElevenLabsWebSocketProxy::HandleAgentChatResponsePart(const TSharedPtr<FJs
|
|||||||
T, LatencyFromTurnEnd);
|
T, LatencyFromTurnEnd);
|
||||||
OnAgentResponseStarted.Broadcast();
|
OnAgentResponseStarted.Broadcast();
|
||||||
}
|
}
|
||||||
|
// Subsequent parts logged at Verbose only (can be dozens per response).
|
||||||
// Extract the streaming text fragment and broadcast it.
|
|
||||||
// API structure:
|
|
||||||
// { "type": "agent_chat_response_part",
|
|
||||||
// "agent_chat_response_part_event": { "agent_response_part": "partial text" }
|
|
||||||
// }
|
|
||||||
const TSharedPtr<FJsonObject>* PartEvent = nullptr;
|
|
||||||
if (Root->TryGetObjectField(TEXT("agent_chat_response_part_event"), PartEvent) && PartEvent)
|
|
||||||
{
|
|
||||||
FString PartText;
|
|
||||||
if ((*PartEvent)->TryGetStringField(TEXT("agent_response_part"), PartText) && !PartText.IsEmpty())
|
|
||||||
{
|
|
||||||
OnAgentResponsePart.Broadcast(PartText);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void UElevenLabsWebSocketProxy::HandleInterruption(const TSharedPtr<FJsonObject>& Root)
|
void UElevenLabsWebSocketProxy::HandleInterruption(const TSharedPtr<FJsonObject>& Root)
|
||||||
{
|
{
|
||||||
// Extract the interrupt event_id so we can filter stale audio frames.
|
// Server has acknowledged the interruption — the old generation is fully stopped.
|
||||||
// { "type": "interruption", "interruption_event": { "event_id": 42 } }
|
// Resume accepting incoming audio and chat response parts (for the next turn).
|
||||||
const TSharedPtr<FJsonObject>* InterruptEvent = nullptr;
|
bIgnoreIncomingContent = false;
|
||||||
if (Root->TryGetObjectField(TEXT("interruption_event"), InterruptEvent) && InterruptEvent)
|
UE_LOG(LogElevenLabsWS, Log, TEXT("Agent interrupted (server ack received — resuming content processing)."));
|
||||||
{
|
|
||||||
int32 EventId = 0;
|
|
||||||
(*InterruptEvent)->TryGetNumberField(TEXT("event_id"), EventId);
|
|
||||||
if (EventId > LastInterruptEventId)
|
|
||||||
{
|
|
||||||
LastInterruptEventId = EventId;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
UE_LOG(LogElevenLabsWS, Log, TEXT("Agent interrupted (server ack, LastInterruptEventId=%d)."), LastInterruptEventId);
|
|
||||||
OnInterrupted.Broadcast();
|
OnInterrupted.Broadcast();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -43,15 +43,6 @@ DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnAgentInterrupted);
|
|||||||
*/
|
*/
|
||||||
DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnAgentStartedGenerating);
|
DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnAgentStartedGenerating);
|
||||||
|
|
||||||
/**
|
|
||||||
* Fired for every agent_chat_response_part — streams the agent's text as the LLM
|
|
||||||
* generates it, token by token. Use this for real-time subtitles / text display.
|
|
||||||
* Each call provides the text fragment from that individual part (NOT accumulated).
|
|
||||||
* The final complete text is still available via OnAgentTextResponse (agent_response).
|
|
||||||
*/
|
|
||||||
DECLARE_DYNAMIC_MULTICAST_DELEGATE_OneParam(FOnAgentPartialResponse,
|
|
||||||
const FString&, PartialText);
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Fired when the server has not started generating a response within ResponseTimeoutSeconds
|
* Fired when the server has not started generating a response within ResponseTimeoutSeconds
|
||||||
* after the user stopped speaking (StopListening was called).
|
* after the user stopped speaking (StopListening was called).
|
||||||
@ -147,15 +138,6 @@ public:
|
|||||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events")
|
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events")
|
||||||
bool bEnableAgentTextResponse = true;
|
bool bEnableAgentTextResponse = true;
|
||||||
|
|
||||||
/**
|
|
||||||
* Forward streaming text parts (agent_chat_response_part events) to the
|
|
||||||
* OnAgentPartialResponse delegate. Each part is a text fragment as the LLM
|
|
||||||
* generates it — use this for real-time subtitles that appear while the agent
|
|
||||||
* speaks, instead of waiting for the full text (OnAgentTextResponse).
|
|
||||||
*/
|
|
||||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events")
|
|
||||||
bool bEnableAgentPartialResponse = false;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* How many seconds to wait for the server to start generating a response
|
* How many seconds to wait for the server to start generating a response
|
||||||
* after the user stops speaking (StopListening) before firing OnAgentResponseTimeout.
|
* after the user stops speaking (StopListening) before firing OnAgentResponseTimeout.
|
||||||
@ -186,14 +168,6 @@ public:
|
|||||||
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
|
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
|
||||||
FOnAgentTextResponse OnAgentTextResponse;
|
FOnAgentTextResponse OnAgentTextResponse;
|
||||||
|
|
||||||
/**
|
|
||||||
* Streaming text fragments as the LLM generates them.
|
|
||||||
* Fires for every agent_chat_response_part — each call gives one text chunk.
|
|
||||||
* Enable with bEnableAgentPartialResponse.
|
|
||||||
*/
|
|
||||||
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
|
|
||||||
FOnAgentPartialResponse OnAgentPartialResponse;
|
|
||||||
|
|
||||||
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
|
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
|
||||||
FOnAgentStartedSpeaking OnAgentStartedSpeaking;
|
FOnAgentStartedSpeaking OnAgentStartedSpeaking;
|
||||||
|
|
||||||
@ -311,9 +285,6 @@ private:
|
|||||||
UFUNCTION()
|
UFUNCTION()
|
||||||
void HandleAgentResponseStarted();
|
void HandleAgentResponseStarted();
|
||||||
|
|
||||||
UFUNCTION()
|
|
||||||
void HandleAgentResponsePart(const FString& PartialText);
|
|
||||||
|
|
||||||
// ── Audio playback ────────────────────────────────────────────────────────
|
// ── Audio playback ────────────────────────────────────────────────────────
|
||||||
void InitAudioPlayback();
|
void InitAudioPlayback();
|
||||||
void EnqueueAgentAudio(const TArray<uint8>& PCMData);
|
void EnqueueAgentAudio(const TArray<uint8>& PCMData);
|
||||||
@ -400,5 +371,5 @@ private:
|
|||||||
// ElevenLabs needs at least ~100ms (3200 bytes) per chunk for reliable VAD/STT.
|
// ElevenLabs needs at least ~100ms (3200 bytes) per chunk for reliable VAD/STT.
|
||||||
// We accumulate here and only call SendAudioChunk once enough bytes are ready.
|
// We accumulate here and only call SendAudioChunk once enough bytes are ready.
|
||||||
TArray<uint8> MicAccumulationBuffer;
|
TArray<uint8> MicAccumulationBuffer;
|
||||||
static constexpr int32 MicChunkMinBytes = 8000; // 250ms @ 16kHz 16-bit mono (4000 samples, matches ElevenLabs SDK recommendation)
|
static constexpr int32 MicChunkMinBytes = 3200; // 100ms @ 16kHz 16-bit mono
|
||||||
};
|
};
|
||||||
|
|||||||
@ -43,11 +43,6 @@ DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnElevenLabsInterrupted);
|
|||||||
*/
|
*/
|
||||||
DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnElevenLabsAgentResponseStarted);
|
DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnElevenLabsAgentResponseStarted);
|
||||||
|
|
||||||
/** Fired for every agent_chat_response_part — streams the LLM text as it is generated.
|
|
||||||
* PartialText is the text fragment from this individual part (NOT accumulated). */
|
|
||||||
DECLARE_DYNAMIC_MULTICAST_DELEGATE_OneParam(FOnElevenLabsAgentResponsePart,
|
|
||||||
const FString&, PartialText);
|
|
||||||
|
|
||||||
|
|
||||||
// ─────────────────────────────────────────────────────────────────────────────
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
// WebSocket Proxy
|
// WebSocket Proxy
|
||||||
@ -99,10 +94,6 @@ public:
|
|||||||
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
|
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
|
||||||
FOnElevenLabsAgentResponseStarted OnAgentResponseStarted;
|
FOnElevenLabsAgentResponseStarted OnAgentResponseStarted;
|
||||||
|
|
||||||
/** Fired for every agent_chat_response_part with the streaming text fragment. */
|
|
||||||
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
|
|
||||||
FOnElevenLabsAgentResponsePart OnAgentResponsePart;
|
|
||||||
|
|
||||||
// ── Lifecycle ─────────────────────────────────────────────────────────────
|
// ── Lifecycle ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -191,7 +182,7 @@ private:
|
|||||||
void HandleAudioResponse(const TSharedPtr<FJsonObject>& Payload);
|
void HandleAudioResponse(const TSharedPtr<FJsonObject>& Payload);
|
||||||
void HandleTranscript(const TSharedPtr<FJsonObject>& Payload);
|
void HandleTranscript(const TSharedPtr<FJsonObject>& Payload);
|
||||||
void HandleAgentResponse(const TSharedPtr<FJsonObject>& Payload);
|
void HandleAgentResponse(const TSharedPtr<FJsonObject>& Payload);
|
||||||
void HandleAgentChatResponsePart(const TSharedPtr<FJsonObject>& Payload);
|
void HandleAgentChatResponsePart();
|
||||||
void HandleInterruption(const TSharedPtr<FJsonObject>& Payload);
|
void HandleInterruption(const TSharedPtr<FJsonObject>& Payload);
|
||||||
void HandlePing(const TSharedPtr<FJsonObject>& Payload);
|
void HandlePing(const TSharedPtr<FJsonObject>& Payload);
|
||||||
|
|
||||||
@ -226,12 +217,11 @@ private:
|
|||||||
// Used to compute [T+Xs] session-relative timestamps in all log messages.
|
// Used to compute [T+Xs] session-relative timestamps in all log messages.
|
||||||
double SessionStartTime = 0.0;
|
double SessionStartTime = 0.0;
|
||||||
|
|
||||||
// ── Interrupt filtering (event_id approach, matching official SDK) ────────
|
// Set to true in SendInterrupt() so that in-flight audio frames and
|
||||||
// When the server sends an "interruption" event it includes an event_id.
|
// agent_chat_response_part messages from the interrupted generation are silently
|
||||||
// Audio events whose event_id <= LastInterruptEventId belong to the cancelled
|
// discarded instead of re-triggering the speaking/generating state.
|
||||||
// generation and must be discarded. Only AUDIO is filtered — transcripts,
|
// Cleared when the server sends its "interruption" acknowledgement.
|
||||||
// agent_response, agent_chat_response_part etc. are always processed.
|
bool bIgnoreIncomingContent = false;
|
||||||
int32 LastInterruptEventId = 0;
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
// Set by UElevenLabsConversationalAgentComponent before calling Connect().
|
// Set by UElevenLabsConversationalAgentComponent before calling Connect().
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user