Compare commits
2 Commits
33ec54150f
...
8175375c28
| Author | SHA1 | Date | |
|---|---|---|---|
| 8175375c28 | |||
| 275065f5aa |
152
CPP/elevenlabs-convai-cpp-main/.gitignore
vendored
152
CPP/elevenlabs-convai-cpp-main/.gitignore
vendored
@ -1,152 +0,0 @@
|
||||
# Build directories
|
||||
build/
|
||||
cmake-build-*/
|
||||
out/
|
||||
|
||||
# Compiled Object files
|
||||
*.slo
|
||||
*.lo
|
||||
*.o
|
||||
*.obj
|
||||
|
||||
# Precompiled Headers
|
||||
*.gch
|
||||
*.pch
|
||||
|
||||
# Compiled Dynamic libraries
|
||||
*.so
|
||||
*.dylib
|
||||
*.dll
|
||||
|
||||
# Fortran module files
|
||||
*.mod
|
||||
*.smod
|
||||
|
||||
# Compiled Static libraries
|
||||
*.lai
|
||||
*.la
|
||||
*.a
|
||||
*.lib
|
||||
|
||||
# Executables
|
||||
*.exe
|
||||
*.out
|
||||
*.app
|
||||
convai_cpp
|
||||
|
||||
# CMake
|
||||
CMakeCache.txt
|
||||
CMakeFiles/
|
||||
CMakeScripts/
|
||||
Testing/
|
||||
Makefile
|
||||
cmake_install.cmake
|
||||
install_manifest.txt
|
||||
compile_commands.json
|
||||
CTestTestfile.cmake
|
||||
_deps/
|
||||
|
||||
# IDE files
|
||||
.vscode/
|
||||
.idea/
|
||||
*.swp
|
||||
*.swo
|
||||
*~
|
||||
|
||||
# macOS
|
||||
.DS_Store
|
||||
.AppleDouble
|
||||
.LSOverride
|
||||
|
||||
# Thumbnails
|
||||
._*
|
||||
|
||||
# Files that might appear in the root of a volume
|
||||
.DocumentRevisions-V100
|
||||
.fseventsd
|
||||
.Spotlight-V100
|
||||
.TemporaryItems
|
||||
.Trashes
|
||||
.VolumeIcon.icns
|
||||
.com.apple.timemachine.donotpresent
|
||||
|
||||
# Directories potentially created on remote AFP share
|
||||
.AppleDB
|
||||
.AppleDesktop
|
||||
Network Trash Folder
|
||||
Temporary Items
|
||||
.apdisk
|
||||
|
||||
# Windows
|
||||
Thumbs.db
|
||||
ehthumbs.db
|
||||
Desktop.ini
|
||||
$RECYCLE.BIN/
|
||||
*.cab
|
||||
*.msi
|
||||
*.msm
|
||||
*.msp
|
||||
*.lnk
|
||||
|
||||
# Linux
|
||||
*~
|
||||
.fuse_hidden*
|
||||
.directory
|
||||
.Trash-*
|
||||
.nfs*
|
||||
|
||||
# Logs
|
||||
*.log
|
||||
|
||||
# Runtime data
|
||||
pids
|
||||
*.pid
|
||||
*.seed
|
||||
*.pid.lock
|
||||
|
||||
# Coverage directory used by tools like istanbul
|
||||
coverage/
|
||||
|
||||
# nyc test coverage
|
||||
.nyc_output
|
||||
|
||||
# Dependency directories
|
||||
node_modules/
|
||||
|
||||
# Optional npm cache directory
|
||||
.npm
|
||||
|
||||
# Optional REPL history
|
||||
.node_repl_history
|
||||
|
||||
# Output of 'npm pack'
|
||||
*.tgz
|
||||
|
||||
# Yarn Integrity file
|
||||
.yarn-integrity
|
||||
|
||||
# dotenv environment variables file
|
||||
.env
|
||||
.env.test
|
||||
|
||||
# parcel-bundler cache (https://parceljs.org/)
|
||||
.cache
|
||||
.parcel-cache
|
||||
|
||||
# next.js build output
|
||||
.next
|
||||
|
||||
# nuxt.js build output
|
||||
.nuxt
|
||||
|
||||
# vuepress build output
|
||||
.vuepress/dist
|
||||
|
||||
# Serverless directories
|
||||
.serverless
|
||||
|
||||
# FuseBox cache
|
||||
.fusebox/
|
||||
|
||||
# DynamoDB Local files
|
||||
.dynamodb/
|
||||
@ -1,42 +0,0 @@
|
||||
# Build script for the convai_cpp demo executable.
cmake_minimum_required(VERSION 3.14)

project(elevenlabs_convai_cpp LANGUAGES CXX)

# The sources are written against C++17; fail instead of silently decaying.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Find dependencies
find_package(Boost REQUIRED COMPONENTS system thread)
find_package(OpenSSL REQUIRED)
# The sources use std::thread directly, so link the platform thread library.
find_package(Threads REQUIRED)
# PortAudio via vcpkg CMake config
find_package(portaudio CONFIG REQUIRED)

# vcpkg exports `portaudio` for dynamic triplets and `portaudio_static` for
# static ones; fall back to the plain name when neither target is defined.
if(TARGET portaudio_static AND NOT TARGET portaudio)
  set(convai_portaudio_target portaudio_static)
else()
  set(convai_portaudio_target portaudio)
endif()

# Find nlohmann_json
find_package(nlohmann_json 3.11 QUIET)

if(NOT nlohmann_json_FOUND)
  include(FetchContent)
  # Fallback: header-only fetch to avoid old CMake policies in upstream CMakeLists.
  # The release archive is used (not the raw json.hpp) so that the extracted
  # tree really contains <dir>/nlohmann/json.hpp for `#include <nlohmann/json.hpp>`.
  FetchContent_Declare(
    nlohmann_json_src
    URL https://github.com/nlohmann/json/releases/download/v3.11.2/include.zip
  )
  FetchContent_MakeAvailable(nlohmann_json_src)

  # Prefer the amalgamated header, fall back to the multi-header layout.
  set(convai_json_include_dir "${nlohmann_json_src_SOURCE_DIR}/single_include")
  if(NOT EXISTS "${convai_json_include_dir}/nlohmann/json.hpp")
    set(convai_json_include_dir "${nlohmann_json_src_SOURCE_DIR}/include")
  endif()
  if(NOT EXISTS "${convai_json_include_dir}/nlohmann/json.hpp")
    message(FATAL_ERROR
      "nlohmann/json.hpp was not found in the fetched archive at "
      "${nlohmann_json_src_SOURCE_DIR}")
  endif()

  add_library(nlohmann_json::nlohmann_json INTERFACE IMPORTED)
  target_include_directories(nlohmann_json::nlohmann_json
    INTERFACE "${convai_json_include_dir}")
endif()

add_executable(convai_cpp
  src/main.cpp
  src/Conversation.cpp
  src/DefaultAudioInterface.cpp
)

# An executable has no consumers, so its include path is a private requirement.
target_include_directories(convai_cpp PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/include")

# MSVC: set Windows target version and suppress getenv deprecation warning
if(MSVC)
  target_compile_definitions(convai_cpp PRIVATE _WIN32_WINNT=0x0A00 _CRT_SECURE_NO_WARNINGS)
endif()

target_link_libraries(convai_cpp
  PRIVATE
    Boost::system
    Boost::thread
    OpenSSL::SSL
    OpenSSL::Crypto
    Threads::Threads
    ${convai_portaudio_target}
    nlohmann_json::nlohmann_json
)
|
||||
@ -1,21 +0,0 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2024 Jitendra
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
@ -1,197 +0,0 @@
|
||||
# ElevenLabs Conversational AI - C++ Implementation
|
||||
|
||||
[License: MIT](https://opensource.org/licenses/MIT)
[C++17](https://en.wikipedia.org/wiki/C%2B%2B17)
[CMake](https://cmake.org/)
|
||||
|
||||
C++ implementation of ElevenLabs Conversational AI client
|
||||
|
||||
## Features
|
||||
|
||||
- **Real-time Audio Processing**: Full-duplex audio streaming with low-latency playback
|
||||
- **WebSocket Integration**: Secure WSS connection to ElevenLabs Conversational AI platform
|
||||
- **Cross-platform Audio**: PortAudio-based implementation supporting Windows, macOS, and Linux
|
||||
- **Echo Suppression**: Built-in acoustic feedback prevention
|
||||
- **Modern C++**: Clean, maintainable C++17 codebase with proper RAII and exception handling
|
||||
- **Flexible Architecture**: Modular design allowing easy customization and extension
|
||||
|
||||
## Architecture
|
||||
|
||||
```mermaid
|
||||
graph TB
|
||||
subgraph "User Interface"
|
||||
A[main.cpp] --> B[Conversation]
|
||||
end
|
||||
|
||||
subgraph "Core Components"
|
||||
B --> C[DefaultAudioInterface]
|
||||
B --> D[WebSocket Client]
|
||||
C --> E[PortAudio]
|
||||
D --> F[Boost.Beast + OpenSSL]
|
||||
end
|
||||
|
||||
subgraph "ElevenLabs Platform"
|
||||
F --> G[WSS API Endpoint]
|
||||
G --> H[Conversational AI Agent]
|
||||
end
|
||||
|
||||
subgraph "Audio Flow"
|
||||
I[Microphone] --> C
|
||||
C --> J[Base64 Encoding]
|
||||
J --> D
|
||||
D --> K[Audio Events]
|
||||
K --> L[Base64 Decoding]
|
||||
L --> C
|
||||
C --> M[Speakers]
|
||||
end
|
||||
|
||||
subgraph "Message Types"
|
||||
N[user_audio_chunk]
|
||||
O[agent_response]
|
||||
P[user_transcript]
|
||||
Q[audio_event]
|
||||
R[ping/pong]
|
||||
end
|
||||
|
||||
style B fill:#e1f5fe
|
||||
style C fill:#f3e5f5
|
||||
style D fill:#e8f5e8
|
||||
style H fill:#fff3e0
|
||||
```
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Prerequisites
|
||||
|
||||
- **C++17 compatible compiler**: GCC 11+, Clang 14+, or MSVC 2022+
|
||||
- **CMake** 3.14 or higher
|
||||
- **Dependencies** (install via package manager):
|
||||
|
||||
#### macOS (Homebrew)
|
||||
```bash
|
||||
brew install boost openssl portaudio nlohmann-json cmake pkg-config
|
||||
```
|
||||
|
||||
#### Ubuntu/Debian
|
||||
```bash
|
||||
sudo apt update
|
||||
sudo apt install build-essential cmake pkg-config
|
||||
sudo apt install libboost-system-dev libboost-thread-dev
|
||||
sudo apt install libssl-dev portaudio19-dev nlohmann-json3-dev
|
||||
```
|
||||
|
||||
#### Windows (vcpkg)
|
||||
```bash
|
||||
vcpkg install boost-system boost-thread openssl portaudio nlohmann-json
|
||||
```
|
||||
|
||||
### Building
|
||||
|
||||
```bash
|
||||
# Clone the repository
|
||||
git clone https://github.com/Jitendra2603/elevenlabs-convai-cpp.git
|
||||
cd elevenlabs-convai-cpp
|
||||
|
||||
# Build the project
|
||||
mkdir build && cd build
|
||||
cmake ..
|
||||
cmake --build . --config Release
|
||||
```
|
||||
|
||||
### Running
|
||||
|
||||
```bash
|
||||
# Set your agent ID (get this from ElevenLabs dashboard)
|
||||
export AGENT_ID="your-agent-id-here"
|
||||
|
||||
# Run the demo
|
||||
./convai_cpp
|
||||
```
|
||||
|
||||
The application will:
|
||||
1. Connect to your ElevenLabs Conversational AI agent
|
||||
2. Start capturing audio from your default microphone
|
||||
3. Stream audio to the agent and play responses through speakers
|
||||
4. Display conversation transcripts in the terminal
|
||||
5. Continue until you press Enter to quit
|
||||
|
||||
## 📋 Usage Examples
|
||||
|
||||
### Basic Conversation
|
||||
```bash
|
||||
export AGENT_ID="agent_"
|
||||
./convai_cpp
|
||||
# Speak into your microphone and hear the AI agent respond
|
||||
```
|
||||
|
||||
|
||||
## Configuration
|
||||
|
||||
### Audio Settings
|
||||
|
||||
The audio interface is configured for optimal real-time performance:
|
||||
|
||||
- **Sample Rate**: 16 kHz
|
||||
- **Format**: 16-bit PCM mono
|
||||
- **Input Buffer**: 250ms (4000 frames)
|
||||
- **Output Buffer**: 62.5ms (1000 frames)
|
||||
|
||||
### WebSocket Connection
|
||||
|
||||
- **Endpoint**: `wss://api.elevenlabs.io/v1/convai/conversation`
|
||||
- **Protocol**: WebSocket Secure (WSS) with TLS 1.2+
|
||||
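- **Authentication**: Not implemented in this demo — the `requiresAuth` constructor flag is stored but never used, so only public agents can be reached (private agents require authentication)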
|
||||
|
||||
## Project Structure
|
||||
|
||||
```
|
||||
elevenlabs-convai-cpp/
|
||||
├── CMakeLists.txt # Build configuration
|
||||
├── README.md # This file
|
||||
├── LICENSE # MIT license
|
||||
├── CONTRIBUTING.md # Contribution guidelines
|
||||
├── .gitignore # Git ignore rules
|
||||
├── include/ # Header files
|
||||
│ ├── AudioInterface.hpp # Abstract audio interface
|
||||
│ ├── DefaultAudioInterface.hpp # PortAudio implementation
|
||||
│ └── Conversation.hpp # Main conversation handler
|
||||
└── src/ # Source files
|
||||
├── main.cpp # Demo application
|
||||
├── Conversation.cpp # WebSocket and message handling
|
||||
└── DefaultAudioInterface.cpp # Audio I/O implementation
|
||||
```
|
||||
|
||||
## Technical Details
|
||||
|
||||
### Audio Processing Pipeline
|
||||
|
||||
1. **Capture**: PortAudio captures 16-bit PCM audio at 16kHz
|
||||
2. **Encoding**: Raw audio is base64-encoded for WebSocket transmission
|
||||
3. **Streaming**: Audio chunks sent as `user_audio_chunk` messages
|
||||
4. **Reception**: Server sends `audio` messages (carrying an `audio_event` payload) with agent responses
|
||||
5. **Decoding**: Base64 audio data decoded back to PCM
|
||||
6. **Playback**: Audio queued and played through PortAudio output stream
|
||||
|
||||
### Echo Suppression
|
||||
|
||||
The implementation includes a simple, effective echo suppression mechanism:
|
||||
|
||||
- Microphone input is suppressed during agent speech playback
|
||||
- Prevents acoustic feedback loops that cause the agent to respond to itself
|
||||
- Uses atomic flags for thread-safe coordination between input/output
|
||||
|
||||
### WebSocket Message Handling
|
||||
|
||||
Supported message types:
|
||||
- `conversation_initiation_client_data` - Session initialization
|
||||
- `user_audio_chunk` - Microphone audio data
|
||||
- `audio` - Agent speech audio (delivered in the message's `audio_event` field)
|
||||
- `agent_response` - Agent text responses
|
||||
- `user_transcript` - Speech-to-text results
|
||||
- `ping`/`pong` - Connection keepalive
|
||||
|
||||
|
||||
|
||||
## 📝 License
|
||||
|
||||
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
||||
@ -1,23 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <functional>
|
||||
#include <vector>
|
||||
|
||||
// Abstract audio device: hands captured audio to a callback and accepts
// audio chunks for playback. Concrete back-ends implement all four operations.
class AudioInterface {
public:
    // Callable invoked with one chunk of captured audio bytes.
    typedef std::function<void(const std::vector<char>&)> AudioCallback;

    virtual ~AudioInterface() = default;

    // Begin audio I/O. `inputCallback` is invoked with raw 16-bit PCM mono
    // samples at 16kHz.
    virtual void start(AudioCallback inputCallback) = 0;

    // End audio I/O and release the underlying resources.
    virtual void stop() = 0;

    // Queue `audio` (16-bit PCM mono, 16kHz) for playback to the user.
    virtual void output(const std::vector<char>& audio) = 0;

    // Immediately stop any buffered / ongoing output.
    virtual void interrupt() = 0;
};
|
||||
@ -1,72 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "AudioInterface.hpp"
|
||||
#include <boost/beast/core.hpp>
|
||||
#include <boost/beast/websocket.hpp>
|
||||
#include <boost/beast/ssl.hpp>
|
||||
#include <boost/asio.hpp>
|
||||
#include <boost/asio/ssl/stream.hpp>
|
||||
#include <boost/asio/ip/tcp.hpp>
|
||||
#include <nlohmann/json.hpp>
|
||||
|
||||
#include <thread>
|
||||
#include <atomic>
|
||||
#include <functional>
|
||||
|
||||
class Conversation {
|
||||
public:
|
||||
using CallbackAgentResponse = std::function<void(const std::string&)>;
|
||||
using CallbackAgentResponseCorrection = std::function<void(const std::string&, const std::string&)>;
|
||||
using CallbackUserTranscript = std::function<void(const std::string&)>;
|
||||
using CallbackLatencyMeasurement = std::function<void(int)>;
|
||||
|
||||
Conversation(
|
||||
const std::string& agentId,
|
||||
bool requiresAuth,
|
||||
std::shared_ptr<AudioInterface> audioInterface,
|
||||
CallbackAgentResponse callbackAgentResponse = nullptr,
|
||||
CallbackAgentResponseCorrection callbackAgentResponseCorrection = nullptr,
|
||||
CallbackUserTranscript callbackUserTranscript = nullptr,
|
||||
CallbackLatencyMeasurement callbackLatencyMeasurement = nullptr
|
||||
);
|
||||
|
||||
~Conversation();
|
||||
|
||||
void startSession();
|
||||
void endSession();
|
||||
std::string waitForSessionEnd();
|
||||
|
||||
void sendUserMessage(const std::string& text);
|
||||
void registerUserActivity();
|
||||
void sendContextualUpdate(const std::string& content);
|
||||
|
||||
private:
|
||||
void run();
|
||||
void handleMessage(const nlohmann::json& message);
|
||||
std::string getWssUrl() const;
|
||||
|
||||
// networking members
|
||||
boost::asio::io_context ioc_;
|
||||
boost::asio::ssl::context sslCtx_{boost::asio::ssl::context::tlsv12_client};
|
||||
|
||||
using tcp = boost::asio::ip::tcp;
|
||||
using websocket_t = boost::beast::websocket::stream<
|
||||
boost::beast::ssl_stream<tcp::socket>>;
|
||||
std::unique_ptr<websocket_t> ws_;
|
||||
|
||||
// general state
|
||||
std::string agentId_;
|
||||
bool requiresAuth_;
|
||||
std::shared_ptr<AudioInterface> audioInterface_;
|
||||
|
||||
CallbackAgentResponse callbackAgentResponse_;
|
||||
CallbackAgentResponseCorrection callbackAgentResponseCorrection_;
|
||||
CallbackUserTranscript callbackUserTranscript_;
|
||||
CallbackLatencyMeasurement callbackLatencyMeasurement_;
|
||||
|
||||
std::thread workerThread_;
|
||||
std::atomic<bool> shouldStop_{false};
|
||||
std::string conversationId_;
|
||||
|
||||
std::atomic<int> lastInterruptId_{0};
|
||||
};
|
||||
@ -1,45 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "AudioInterface.hpp"
|
||||
#include <portaudio.h>
|
||||
#include <mutex>
|
||||
#include <condition_variable>
|
||||
#include <queue>
|
||||
#include <thread>
|
||||
#include <atomic>
|
||||
|
||||
class DefaultAudioInterface : public AudioInterface {
|
||||
public:
|
||||
static constexpr int INPUT_FRAMES_PER_BUFFER = 4000; // 250ms @ 16kHz
|
||||
static constexpr int OUTPUT_FRAMES_PER_BUFFER = 1000; // 62.5ms @ 16kHz
|
||||
|
||||
DefaultAudioInterface();
|
||||
~DefaultAudioInterface() override;
|
||||
|
||||
void start(AudioCallback inputCallback) override;
|
||||
void stop() override;
|
||||
void output(const std::vector<char>& audio) override;
|
||||
void interrupt() override;
|
||||
|
||||
private:
|
||||
static int inputCallbackStatic(const void* input, void* output, unsigned long frameCount,
|
||||
const PaStreamCallbackTimeInfo* timeInfo, PaStreamCallbackFlags statusFlags,
|
||||
void* userData);
|
||||
|
||||
int inputCallbackInternal(const void* input, unsigned long frameCount);
|
||||
|
||||
void outputThreadFunc();
|
||||
|
||||
PaStream* inputStream_{};
|
||||
PaStream* outputStream_{};
|
||||
|
||||
AudioCallback inputCallback_;
|
||||
|
||||
std::queue<std::vector<char>> outputQueue_;
|
||||
std::mutex queueMutex_;
|
||||
std::condition_variable queueCv_;
|
||||
|
||||
std::thread outputThread_;
|
||||
std::atomic<bool> shouldStop_{false};
|
||||
std::atomic<bool> outputPlaying_{false};
|
||||
};
|
||||
@ -1,230 +0,0 @@
|
||||
#include "Conversation.hpp"
|
||||
|
||||
#include <boost/beast/websocket/ssl.hpp>
|
||||
#include <boost/beast/websocket.hpp>
|
||||
#include <boost/beast/ssl.hpp>
|
||||
#include <boost/beast/core/detail/base64.hpp>
|
||||
#include <boost/asio/connect.hpp>
|
||||
#include <boost/algorithm/string.hpp>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <openssl/ssl.h>
|
||||
|
||||
using tcp = boost::asio::ip::tcp;
|
||||
namespace ssl = boost::asio::ssl;
|
||||
namespace websocket = boost::beast::websocket;
|
||||
namespace beast = boost::beast;
|
||||
|
||||
static std::string base64Encode(const std::vector<char>& data) {
|
||||
auto encodedSize = beast::detail::base64::encoded_size(data.size());
|
||||
std::string out(encodedSize, '\0');
|
||||
beast::detail::base64::encode(&out[0], data.data(), data.size());
|
||||
return out;
|
||||
}
|
||||
|
||||
static std::vector<char> base64Decode(const std::string& str) {
|
||||
auto decodedSize = beast::detail::base64::decoded_size(str.size());
|
||||
std::vector<char> out(decodedSize);
|
||||
auto result = beast::detail::base64::decode(out.data(), str.data(), str.size());
|
||||
out.resize(result.first);
|
||||
return out;
|
||||
}
|
||||
|
||||
// Renders a JSON value as text: strings are returned unquoted, integers in
// decimal, and anything else as its serialized JSON form.
static std::string toString(const nlohmann::json& j) {
    if (j.is_string()) {
        return j.get<std::string>();
    }
    return j.is_number_integer() ? std::to_string(j.get<int64_t>()) : j.dump();
}
|
||||
|
||||
Conversation::Conversation(const std::string& agentId, bool requiresAuth,
|
||||
std::shared_ptr<AudioInterface> audioInterface,
|
||||
CallbackAgentResponse callbackAgentResponse,
|
||||
CallbackAgentResponseCorrection callbackAgentResponseCorrection,
|
||||
CallbackUserTranscript callbackUserTranscript,
|
||||
CallbackLatencyMeasurement callbackLatencyMeasurement)
|
||||
: agentId_(agentId),
|
||||
requiresAuth_(requiresAuth),
|
||||
audioInterface_(std::move(audioInterface)),
|
||||
callbackAgentResponse_(std::move(callbackAgentResponse)),
|
||||
callbackAgentResponseCorrection_(std::move(callbackAgentResponseCorrection)),
|
||||
callbackUserTranscript_(std::move(callbackUserTranscript)),
|
||||
callbackLatencyMeasurement_(std::move(callbackLatencyMeasurement)) {
|
||||
|
||||
sslCtx_.set_default_verify_paths();
|
||||
}
|
||||
|
||||
// Tears the session down so the worker thread is joined before members die.
Conversation::~Conversation()
{
    this->endSession();
}
|
||||
|
||||
// Starts the background worker that connects to the agent and processes
// incoming messages.
//
// Throws std::runtime_error if a worker thread is still attached (that is,
// startSession() was already called and the thread has not been joined via
// endSession()/waitForSessionEnd()). Assigning to a joinable std::thread
// would otherwise call std::terminate.
void Conversation::startSession() {
    if (workerThread_.joinable()) {
        throw std::runtime_error("Session already started");
    }
    shouldStop_.store(false);
    workerThread_ = std::thread(&Conversation::run, this);
}
|
||||
|
||||
void Conversation::endSession() {
|
||||
shouldStop_.store(true);
|
||||
if (ws_) {
|
||||
beast::error_code ec;
|
||||
ws_->close(websocket::close_code::normal, ec);
|
||||
}
|
||||
if (audioInterface_) {
|
||||
audioInterface_->stop();
|
||||
}
|
||||
if (workerThread_.joinable()) {
|
||||
workerThread_.join();
|
||||
}
|
||||
}
|
||||
|
||||
std::string Conversation::waitForSessionEnd() {
|
||||
if (workerThread_.joinable()) {
|
||||
workerThread_.join();
|
||||
}
|
||||
return conversationId_;
|
||||
}
|
||||
|
||||
void Conversation::sendUserMessage(const std::string& text) {
|
||||
if (!ws_) {
|
||||
throw std::runtime_error("Session not started");
|
||||
}
|
||||
nlohmann::json j = {
|
||||
{"type", "user_message"},
|
||||
{"text", text}
|
||||
};
|
||||
ws_->write(boost::asio::buffer(j.dump()));
|
||||
}
|
||||
|
||||
void Conversation::registerUserActivity() {
|
||||
if (!ws_) throw std::runtime_error("Session not started");
|
||||
nlohmann::json j = {{"type", "user_activity"}};
|
||||
ws_->write(boost::asio::buffer(j.dump()));
|
||||
}
|
||||
|
||||
void Conversation::sendContextualUpdate(const std::string& content) {
|
||||
if (!ws_) throw std::runtime_error("Session not started");
|
||||
nlohmann::json j = {{"type", "contextual_update"}, {"content", content}};
|
||||
ws_->write(boost::asio::buffer(j.dump()));
|
||||
}
|
||||
|
||||
// Builds the WebSocket endpoint URL for this agent. The agent id is appended
// verbatim as the `agent_id` query parameter.
std::string Conversation::getWssUrl() const {
    // Hard-coded base env for demo; in production you'd call ElevenLabs env endpoint.
    std::string url = "wss://api.elevenlabs.io/v1/convai/conversation?agent_id=";
    url += agentId_;
    return url;
}
|
||||
|
||||
void Conversation::run() {
|
||||
try {
|
||||
auto url = getWssUrl();
|
||||
std::string protocol, host, target;
|
||||
unsigned short port = 443;
|
||||
|
||||
// Very naive parse: wss://host[:port]/path?query
|
||||
if (boost::starts_with(url, "wss://")) {
|
||||
protocol = "wss";
|
||||
host = url.substr(6);
|
||||
} else {
|
||||
throw std::runtime_error("Only wss:// URLs supported in this demo");
|
||||
}
|
||||
auto slashPos = host.find('/');
|
||||
if (slashPos == std::string::npos) {
|
||||
target = "/";
|
||||
} else {
|
||||
target = host.substr(slashPos);
|
||||
host = host.substr(0, slashPos);
|
||||
}
|
||||
auto colonPos = host.find(':');
|
||||
if (colonPos != std::string::npos) {
|
||||
port = static_cast<unsigned short>(std::stoi(host.substr(colonPos + 1)));
|
||||
host = host.substr(0, colonPos);
|
||||
}
|
||||
|
||||
tcp::resolver resolver(ioc_);
|
||||
auto const results = resolver.resolve(host, std::to_string(port));
|
||||
|
||||
beast::ssl_stream<tcp::socket> stream(ioc_, sslCtx_);
|
||||
boost::asio::connect(beast::get_lowest_layer(stream), results);
|
||||
if (!SSL_set_tlsext_host_name(stream.native_handle(), host.c_str())) {
|
||||
throw std::runtime_error("Failed to set SNI hostname on SSL stream");
|
||||
}
|
||||
stream.handshake(ssl::stream_base::client);
|
||||
|
||||
ws_ = std::make_unique<websocket_t>(std::move(stream));
|
||||
ws_->set_option(websocket::stream_base::timeout::suggested(beast::role_type::client));
|
||||
ws_->handshake(host, target);
|
||||
|
||||
// send initiation data
|
||||
nlohmann::json init = {
|
||||
{"type", "conversation_initiation_client_data"},
|
||||
{"custom_llm_extra_body", nlohmann::json::object()},
|
||||
{"conversation_config_override", nlohmann::json::object()},
|
||||
{"dynamic_variables", nlohmann::json::object()}
|
||||
};
|
||||
ws_->write(boost::asio::buffer(init.dump()));
|
||||
|
||||
// Prepare audio callback
|
||||
auto inputCb = [this](const std::vector<char>& audio) {
|
||||
nlohmann::json msg = {
|
||||
{"user_audio_chunk", base64Encode(audio)}
|
||||
};
|
||||
ws_->write(boost::asio::buffer(msg.dump()));
|
||||
};
|
||||
audioInterface_->start(inputCb);
|
||||
|
||||
beast::flat_buffer buffer;
|
||||
while (!shouldStop_.load()) {
|
||||
beast::error_code ec;
|
||||
ws_->read(buffer, ec);
|
||||
if (ec) {
|
||||
std::cerr << "Websocket read error: " << ec.message() << std::endl;
|
||||
break;
|
||||
}
|
||||
auto text = beast::buffers_to_string(buffer.data());
|
||||
buffer.consume(buffer.size());
|
||||
try {
|
||||
auto message = nlohmann::json::parse(text);
|
||||
handleMessage(message);
|
||||
} catch (const std::exception& ex) {
|
||||
std::cerr << "JSON parse error: " << ex.what() << std::endl;
|
||||
}
|
||||
}
|
||||
} catch (const std::exception& ex) {
|
||||
std::cerr << "Conversation error: " << ex.what() << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
// Dispatches one decoded server message according to its "type" field.
//
// Required fields are read with at(), so a message that lacks them raises a
// nlohmann::json exception instead of invoking undefined behaviour; the read
// loop in run() catches and logs it. Event objects are accessed by reference
// so audio payloads are not copied. Unknown types are ignored.
void Conversation::handleMessage(const nlohmann::json& message) {
    const std::string type = message.value("type", "");

    if (type == "conversation_initiation_metadata") {
        conversationId_ = message.at("conversation_initiation_metadata_event")
                              .at("conversation_id")
                              .get<std::string>();
    } else if (type == "audio") {
        const auto& event = message.at("audio_event");
        const int eventId = std::stoi(toString(event.at("event_id")));
        // Audio generated before the latest interruption is stale; drop it.
        if (eventId <= lastInterruptId_.load()) return;
        audioInterface_->output(base64Decode(event.at("audio_base_64").get<std::string>()));
    } else if (type == "agent_response" && callbackAgentResponse_) {
        const auto& event = message.at("agent_response_event");
        callbackAgentResponse_(event.at("agent_response").get<std::string>());
    } else if (type == "agent_response_correction" && callbackAgentResponseCorrection_) {
        const auto& event = message.at("agent_response_correction_event");
        callbackAgentResponseCorrection_(event.at("original_agent_response").get<std::string>(),
                                         event.at("corrected_agent_response").get<std::string>());
    } else if (type == "user_transcript" && callbackUserTranscript_) {
        const auto& event = message.at("user_transcription_event");
        callbackUserTranscript_(event.at("user_transcript").get<std::string>());
    } else if (type == "interruption") {
        const auto& event = message.at("interruption_event");
        lastInterruptId_.store(std::stoi(toString(event.at("event_id"))));
        audioInterface_->interrupt();
    } else if (type == "ping") {
        const auto& event = message.at("ping_event");
        nlohmann::json pong = {{"type", "pong"}, {"event_id", event.at("event_id")}};
        ws_->write(boost::asio::buffer(pong.dump()));

        if (callbackLatencyMeasurement_ && event.contains("ping_ms")) {
            const auto& pingMs = event.at("ping_ms");
            if (pingMs.is_number()) {
                callbackLatencyMeasurement_(pingMs.get<int>());
            } else if (pingMs.is_string()) {
                callbackLatencyMeasurement_(std::stoi(pingMs.get<std::string>()));
            }
            // Any other value (e.g. null) carries no measurement; skip it.
        }
    }
    // Note: client tool call handling omitted for brevity.
}
|
||||
@ -1,131 +0,0 @@
|
||||
#include "DefaultAudioInterface.hpp"
|
||||
|
||||
#include <cstring>
|
||||
#include <iostream>
|
||||
|
||||
// Initializes the PortAudio library; throws std::runtime_error on failure.
DefaultAudioInterface::DefaultAudioInterface() {
    const bool initialized = (Pa_Initialize() == paNoError);
    if (!initialized) {
        throw std::runtime_error("PortAudio initialization failed");
    }
}
|
||||
|
||||
// Stops audio I/O unless a stop was already requested, then shuts PortAudio down.
DefaultAudioInterface::~DefaultAudioInterface() {
    const bool stopRequested = shouldStop_.load();
    if (!stopRequested) {
        stop();
    }
    Pa_Terminate();
}
|
||||
|
||||
// Opens and starts the default input and output devices (mono, 16-bit,
// 16 kHz) and launches the playback thread.
//
// inputCallback - invoked from the PortAudio capture callback with each
//                 captured chunk.
//
// Throws std::runtime_error when the interface is already running, when no
// default input/output device exists, or when a stream cannot be opened or
// started. On failure every stream opened by this call is closed again.
void DefaultAudioInterface::start(AudioCallback inputCallback) {
    // Assigning to a joinable std::thread would call std::terminate.
    if (outputThread_.joinable()) {
        throw std::runtime_error("Audio interface already started");
    }

    inputCallback_ = std::move(inputCallback);

    // Pa_GetDeviceInfo returns null for paNoDevice / invalid indices, so the
    // device info must be checked before reading the default latencies.
    const PaDeviceIndex inputDevice = Pa_GetDefaultInputDevice();
    const PaDeviceInfo* inputInfo =
        (inputDevice == paNoDevice) ? nullptr : Pa_GetDeviceInfo(inputDevice);
    if (!inputInfo) {
        throw std::runtime_error("No default input device available");
    }
    const PaDeviceIndex outputDevice = Pa_GetDefaultOutputDevice();
    const PaDeviceInfo* outputInfo =
        (outputDevice == paNoDevice) ? nullptr : Pa_GetDeviceInfo(outputDevice);
    if (!outputInfo) {
        throw std::runtime_error("No default output device available");
    }

    PaStreamParameters inputParams;
    std::memset(&inputParams, 0, sizeof(inputParams));
    inputParams.channelCount = 1;
    inputParams.device = inputDevice;
    inputParams.sampleFormat = paInt16;
    inputParams.suggestedLatency = inputInfo->defaultLowInputLatency;
    inputParams.hostApiSpecificStreamInfo = nullptr;

    PaStreamParameters outputParams;
    std::memset(&outputParams, 0, sizeof(outputParams));
    outputParams.channelCount = 1;
    outputParams.device = outputDevice;
    outputParams.sampleFormat = paInt16;
    outputParams.suggestedLatency = outputInfo->defaultLowOutputLatency;
    outputParams.hostApiSpecificStreamInfo = nullptr;

    // Releases whatever this call has opened so far, then reports `what`.
    auto fail = [this](const char* what) {
        if (inputStream_) {
            Pa_CloseStream(inputStream_);
            inputStream_ = nullptr;
        }
        if (outputStream_) {
            Pa_CloseStream(outputStream_);
            outputStream_ = nullptr;
        }
        throw std::runtime_error(what);
    };

    // Capture is callback-driven; playback uses blocking writes (no callback).
    PaError err = Pa_OpenStream(&inputStream_, &inputParams, nullptr, 16000, INPUT_FRAMES_PER_BUFFER, paClipOff,
                                &DefaultAudioInterface::inputCallbackStatic, this);
    if (err != paNoError) {
        inputStream_ = nullptr;
        fail("Failed to open input stream");
    }

    err = Pa_OpenStream(&outputStream_, nullptr, &outputParams, 16000, OUTPUT_FRAMES_PER_BUFFER, paClipOff, nullptr, nullptr);
    if (err != paNoError) {
        outputStream_ = nullptr;
        fail("Failed to open output stream");
    }

    if ((err = Pa_StartStream(inputStream_)) != paNoError) {
        fail("Failed to start input stream");
    }
    if ((err = Pa_StartStream(outputStream_)) != paNoError) {
        fail("Failed to start output stream");
    }

    shouldStop_.store(false);
    outputThread_ = std::thread(&DefaultAudioInterface::outputThreadFunc, this);
}
|
||||
|
||||
void DefaultAudioInterface::stop() {
|
||||
shouldStop_.store(true);
|
||||
queueCv_.notify_all();
|
||||
if (outputThread_.joinable()) {
|
||||
outputThread_.join();
|
||||
}
|
||||
|
||||
if (inputStream_) {
|
||||
Pa_StopStream(inputStream_);
|
||||
Pa_CloseStream(inputStream_);
|
||||
inputStream_ = nullptr;
|
||||
}
|
||||
if (outputStream_) {
|
||||
Pa_StopStream(outputStream_);
|
||||
Pa_CloseStream(outputStream_);
|
||||
outputStream_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
void DefaultAudioInterface::output(const std::vector<char>& audio) {
|
||||
{
|
||||
std::lock_guard<std::mutex> lg(queueMutex_);
|
||||
outputQueue_.emplace(audio);
|
||||
}
|
||||
queueCv_.notify_one();
|
||||
}
|
||||
|
||||
void DefaultAudioInterface::interrupt() {
|
||||
std::lock_guard<std::mutex> lg(queueMutex_);
|
||||
std::queue<std::vector<char>> empty;
|
||||
std::swap(outputQueue_, empty);
|
||||
}
|
||||
|
||||
int DefaultAudioInterface::inputCallbackStatic(const void* input, void* /*output*/, unsigned long frameCount,
|
||||
const PaStreamCallbackTimeInfo* /*timeInfo*/, PaStreamCallbackFlags /*statusFlags*/,
|
||||
void* userData) {
|
||||
auto* self = static_cast<DefaultAudioInterface*>(userData);
|
||||
return self->inputCallbackInternal(input, frameCount);
|
||||
}
|
||||
|
||||
int DefaultAudioInterface::inputCallbackInternal(const void* input, unsigned long frameCount) {
|
||||
if (!input || !inputCallback_) {
|
||||
return paContinue;
|
||||
}
|
||||
if (outputPlaying_.load()) {
|
||||
// Suppress microphone input while playing output to avoid echo feedback.
|
||||
return paContinue;
|
||||
}
|
||||
const size_t bytes = frameCount * sizeof(int16_t);
|
||||
std::vector<char> buffer(bytes);
|
||||
std::memcpy(buffer.data(), input, bytes);
|
||||
inputCallback_(buffer);
|
||||
return paContinue;
|
||||
}
|
||||
|
||||
void DefaultAudioInterface::outputThreadFunc() {
|
||||
while (!shouldStop_.load()) {
|
||||
std::vector<char> audio;
|
||||
{
|
||||
std::unique_lock<std::mutex> lk(queueMutex_);
|
||||
queueCv_.wait(lk, [this] { return shouldStop_.load() || !outputQueue_.empty(); });
|
||||
if (shouldStop_.load()) break;
|
||||
audio = std::move(outputQueue_.front());
|
||||
outputQueue_.pop();
|
||||
}
|
||||
if (!audio.empty() && outputStream_) {
|
||||
outputPlaying_.store(true);
|
||||
Pa_WriteStream(outputStream_, audio.data(), audio.size() / sizeof(int16_t));
|
||||
outputPlaying_.store(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1,31 +0,0 @@
|
||||
#include "Conversation.hpp"
|
||||
#include "DefaultAudioInterface.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
|
||||
int main() {
|
||||
const char* agentIdEnv = std::getenv("AGENT_ID");
|
||||
if (!agentIdEnv) {
|
||||
std::cerr << "AGENT_ID environment variable must be set" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
std::string agentId(agentIdEnv);
|
||||
|
||||
auto audioInterface = std::make_shared<DefaultAudioInterface>();
|
||||
Conversation conv(agentId, /*requiresAuth*/ false, audioInterface,
|
||||
[](const std::string& resp) { std::cout << "Agent: " << resp << std::endl; },
|
||||
[](const std::string& orig, const std::string& corrected) {
|
||||
std::cout << "Agent correction: " << orig << " -> " << corrected << std::endl; },
|
||||
[](const std::string& transcript) { std::cout << "User: " << transcript << std::endl; });
|
||||
|
||||
conv.startSession();
|
||||
|
||||
std::cout << "Press Enter to quit..." << std::endl;
|
||||
std::cin.get();
|
||||
conv.endSession();
|
||||
auto convId = conv.waitForSessionEnd();
|
||||
std::cout << "Conversation ID: " << convId << std::endl;
|
||||
return 0;
|
||||
}
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user