Compare commits

..

2 Commits

Author SHA1 Message Date
8175375c28 remove unwanted old plugins 2026-03-01 17:18:30 +01:00
275065f5aa Revert SSL cert path to Content/Certificates for packaged build staging
Saved/ is not staged in packaged builds, so Content/Certificates/ is the
only reliable location. Simplified code by removing Android-specific
writable fallback (Content/ works on all platforms with NonUFS staging).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-01 16:42:16 +01:00
3444 changed files with 7 additions and 956028 deletions

View File

@ -1,152 +0,0 @@
# Build directories
build/
cmake-build-*/
out/
# Compiled Object files
*.slo
*.lo
*.o
*.obj
# Precompiled Headers
*.gch
*.pch
# Compiled Dynamic libraries
*.so
*.dylib
*.dll
# Fortran module files
*.mod
*.smod
# Compiled Static libraries
*.lai
*.la
*.a
*.lib
# Executables
*.exe
*.out
*.app
convai_cpp
# CMake
CMakeCache.txt
CMakeFiles/
CMakeScripts/
Testing/
Makefile
cmake_install.cmake
install_manifest.txt
compile_commands.json
CTestTestfile.cmake
_deps/
# IDE files
.vscode/
.idea/
*.swp
*.swo
*~
# macOS
.DS_Store
.AppleDouble
.LSOverride
# Thumbnails
._*
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
# Windows
Thumbs.db
ehthumbs.db
Desktop.ini
$RECYCLE.BIN/
*.cab
*.msi
*.msm
*.msp
*.lnk
# Linux
*~
.fuse_hidden*
.directory
.Trash-*
.nfs*
# Logs
*.log
# Runtime data
pids
*.pid
*.seed
*.pid.lock
# Coverage directory used by tools like istanbul
coverage/
# nyc test coverage
.nyc_output
# Dependency directories
node_modules/
# Optional npm cache directory
.npm
# Optional REPL history
.node_repl_history
# Output of 'npm pack'
*.tgz
# Yarn Integrity file
.yarn-integrity
# dotenv environment variables file
.env
.env.test
# parcel-bundler cache (https://parceljs.org/)
.cache
.parcel-cache
# next.js build output
.next
# nuxt.js build output
.nuxt
# vuepress build output
.vuepress/dist
# Serverless directories
.serverless
# FuseBox cache
.fusebox/
# DynamoDB Local files
.dynamodb/

View File

@ -1,42 +0,0 @@
cmake_minimum_required(VERSION 3.14)
project(elevenlabs_convai_cpp LANGUAGES CXX)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
# Find dependencies
find_package(Boost REQUIRED COMPONENTS system thread)
find_package(OpenSSL REQUIRED)
# PortAudio via vcpkg CMake config
find_package(portaudio CONFIG REQUIRED)
# Find nlohmann_json
find_package(nlohmann_json 3.11 QUIET)
if(NOT nlohmann_json_FOUND)
include(FetchContent)
# Fallback: fetch the official release include archive (avoids old CMake
# policies in the upstream CMakeLists). NOTE: the previous raw single-header
# URL downloaded only json.hpp, which never produces the
# single_include/nlohmann/json.hpp layout that `#include <nlohmann/json.hpp>`
# requires; include.zip contains that directory structure.
FetchContent_Declare(
nlohmann_json_src
URL https://github.com/nlohmann/json/releases/download/v3.11.2/include.zip
)
FetchContent_MakeAvailable(nlohmann_json_src)
add_library(nlohmann_json::nlohmann_json INTERFACE IMPORTED)
target_include_directories(nlohmann_json::nlohmann_json INTERFACE ${nlohmann_json_src_SOURCE_DIR}/single_include)
endif()
add_executable(convai_cpp
src/main.cpp
src/Conversation.cpp
src/DefaultAudioInterface.cpp
)
target_include_directories(convai_cpp PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
# MSVC: set Windows target version and suppress getenv deprecation warning
if(MSVC)
target_compile_definitions(convai_cpp PRIVATE _WIN32_WINNT=0x0A00 _CRT_SECURE_NO_WARNINGS)
endif()
target_link_libraries(convai_cpp PRIVATE Boost::system Boost::thread OpenSSL::SSL OpenSSL::Crypto portaudio nlohmann_json::nlohmann_json)

View File

@ -1,21 +0,0 @@
MIT License
Copyright (c) 2024 Jitendra
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -1,197 +0,0 @@
# ElevenLabs Conversational AI - C++ Implementation
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
[![C++17](https://img.shields.io/badge/C%2B%2B-17-blue.svg)](https://en.wikipedia.org/wiki/C%2B%2B17)
[![CMake](https://img.shields.io/badge/CMake-3.14+-green.svg)](https://cmake.org/)
C++ implementation of ElevenLabs Conversational AI client
## Features
- **Real-time Audio Processing**: Full-duplex audio streaming with low-latency playback
- **WebSocket Integration**: Secure WSS connection to ElevenLabs Conversational AI platform
- **Cross-platform Audio**: PortAudio-based implementation supporting Windows, macOS, and Linux
- **Echo Suppression**: Built-in acoustic feedback prevention
- **Modern C++**: Clean, maintainable C++17 codebase with proper RAII and exception handling
- **Flexible Architecture**: Modular design allowing easy customization and extension
## Architecture
```mermaid
graph TB
subgraph "User Interface"
A[main.cpp] --> B[Conversation]
end
subgraph "Core Components"
B --> C[DefaultAudioInterface]
B --> D[WebSocket Client]
C --> E[PortAudio]
D --> F[Boost.Beast + OpenSSL]
end
subgraph "ElevenLabs Platform"
F --> G[WSS API Endpoint]
G --> H[Conversational AI Agent]
end
subgraph "Audio Flow"
I[Microphone] --> C
C --> J[Base64 Encoding]
J --> D
D --> K[Audio Events]
K --> L[Base64 Decoding]
L --> C
C --> M[Speakers]
end
subgraph "Message Types"
N[user_audio_chunk]
O[agent_response]
P[user_transcript]
Q[audio_event]
R[ping/pong]
end
style B fill:#e1f5fe
style C fill:#f3e5f5
style D fill:#e8f5e8
style H fill:#fff3e0
```
## Quick Start
### Prerequisites
- **C++17 compatible compiler**: GCC 11+, Clang 14+, or MSVC 2022+
- **CMake** 3.14 or higher
- **Dependencies** (install via package manager):
#### macOS (Homebrew)
```bash
brew install boost openssl portaudio nlohmann-json cmake pkg-config
```
#### Ubuntu/Debian
```bash
sudo apt update
sudo apt install build-essential cmake pkg-config
sudo apt install libboost-system-dev libboost-thread-dev
sudo apt install libssl-dev portaudio19-dev nlohmann-json3-dev
```
#### Windows (vcpkg)
```bash
vcpkg install boost-system boost-thread openssl portaudio nlohmann-json
```
### Building
```bash
# Clone the repository
git clone https://github.com/Jitendra2603/elevenlabs-convai-cpp.git
cd elevenlabs-convai-cpp
# Build the project
mkdir build && cd build
cmake ..
cmake --build . --config Release
```
### Running
```bash
# Set your agent ID (get this from ElevenLabs dashboard)
export AGENT_ID="your-agent-id-here"
# Run the demo
./convai_cpp
```
The application will:
1. Connect to your ElevenLabs Conversational AI agent
2. Start capturing audio from your default microphone
3. Stream audio to the agent and play responses through speakers
4. Display conversation transcripts in the terminal
5. Continue until you press Enter to quit
## 📋 Usage Examples
### Basic Conversation
```bash
export AGENT_ID="agent_<your-agent-id>"
./convai_cpp
# Speak into your microphone and hear the AI agent respond
```
## Configuration
### Audio Settings
The audio interface is configured for optimal real-time performance:
- **Sample Rate**: 16 kHz
- **Format**: 16-bit PCM mono
- **Input Buffer**: 250ms (4000 frames)
- **Output Buffer**: 62.5ms (1000 frames)
### WebSocket Connection
- **Endpoint**: `wss://api.elevenlabs.io/v1/convai/conversation`
- **Protocol**: WebSocket Secure (WSS) with TLS 1.2+
- **Authentication**: Optional (required for private agents)
## Project Structure
```
elevenlabs-convai-cpp/
├── CMakeLists.txt # Build configuration
├── README.md # This file
├── LICENSE # MIT license
├── CONTRIBUTING.md # Contribution guidelines
├── .gitignore # Git ignore rules
├── include/ # Header files
│ ├── AudioInterface.hpp # Abstract audio interface
│ ├── DefaultAudioInterface.hpp # PortAudio implementation
│ └── Conversation.hpp # Main conversation handler
└── src/ # Source files
├── main.cpp # Demo application
├── Conversation.cpp # WebSocket and message handling
└── DefaultAudioInterface.cpp # Audio I/O implementation
```
## Technical Details
### Audio Processing Pipeline
1. **Capture**: PortAudio captures 16-bit PCM audio at 16kHz
2. **Encoding**: Raw audio is base64-encoded for WebSocket transmission
3. **Streaming**: Audio chunks sent as `user_audio_chunk` messages
4. **Reception**: Server sends `audio_event` messages with agent responses
5. **Decoding**: Base64 audio data decoded back to PCM
6. **Playback**: Audio queued and played through PortAudio output stream
### Echo Suppression
The implementation includes a simple, effective echo suppression mechanism:
- Microphone input is suppressed during agent speech playback
- Prevents acoustic feedback loops that cause the agent to respond to itself
- Uses atomic flags for thread-safe coordination between input/output
### WebSocket Message Handling
Supported message types:
- `conversation_initiation_client_data` - Session initialization
- `user_audio_chunk` - Microphone audio data
- `audio_event` - Agent speech audio
- `agent_response` - Agent text responses
- `user_transcript` - Speech-to-text results
- `ping`/`pong` - Connection keepalive
## 📝 License
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.

View File

@ -1,23 +0,0 @@
#pragma once
#include <functional>
#include <vector>
// Abstract audio I/O boundary used by the conversation layer.
// Implementations capture microphone audio (delivered via the callback passed
// to start()) and play agent audio handed to output(). All audio crossing
// this interface is raw 16-bit PCM, mono, 16 kHz.
class AudioInterface {
public:
// Receives a buffer of captured audio bytes (16-bit PCM mono @ 16kHz).
using AudioCallback = std::function<void(const std::vector<char>&)>;
virtual ~AudioInterface() = default;
// Starts the audio interface. The callback will be invoked with raw 16-bit PCM mono samples at 16kHz.
virtual void start(AudioCallback inputCallback) = 0;
// Stops audio I/O and releases underlying resources.
virtual void stop() = 0;
// Play audio to the user; audio is 16-bit PCM mono 16kHz.
virtual void output(const std::vector<char>& audio) = 0;
// Immediately stop any buffered / ongoing output.
virtual void interrupt() = 0;
};

View File

@ -1,72 +0,0 @@
#pragma once
#include "AudioInterface.hpp"
#include <boost/beast/core.hpp>
#include <boost/beast/websocket.hpp>
#include <boost/beast/ssl.hpp>
#include <boost/asio.hpp>
#include <boost/asio/ssl/stream.hpp>
#include <boost/asio/ip/tcp.hpp>
#include <nlohmann/json.hpp>
#include <thread>
#include <atomic>
#include <functional>
// Manages one realtime conversation with an ElevenLabs Conversational AI
// agent over a secure websocket. Owns the TLS websocket connection, a
// background reader thread, and the AudioInterface used for microphone
// capture and playback.
class Conversation {
public:
// Invoked with the agent's text response.
using CallbackAgentResponse = std::function<void(const std::string&)>;
// Invoked with (original, corrected) agent response text.
using CallbackAgentResponseCorrection = std::function<void(const std::string&, const std::string&)>;
// Invoked with the user's speech-to-text transcript.
using CallbackUserTranscript = std::function<void(const std::string&)>;
// Invoked with a measured round-trip latency in milliseconds.
using CallbackLatencyMeasurement = std::function<void(int)>;
// All callbacks are optional; pass nullptr to ignore that event type.
// No network activity happens until startSession().
Conversation(
const std::string& agentId,
bool requiresAuth,
std::shared_ptr<AudioInterface> audioInterface,
CallbackAgentResponse callbackAgentResponse = nullptr,
CallbackAgentResponseCorrection callbackAgentResponseCorrection = nullptr,
CallbackUserTranscript callbackUserTranscript = nullptr,
CallbackLatencyMeasurement callbackLatencyMeasurement = nullptr
);
~Conversation();
// Spawns the background worker thread that connects and streams audio.
void startSession();
// Signals shutdown, closes the websocket, stops audio, joins the worker.
void endSession();
// Blocks until the worker exits; returns the server-assigned conversation
// id (empty if initiation metadata was never received).
std::string waitForSessionEnd();
// Sends a typed user text message. Throws std::runtime_error if no session is active.
void sendUserMessage(const std::string& text);
// Notifies the agent of user activity. Throws std::runtime_error if no session is active.
void registerUserActivity();
// Sends out-of-band context to the agent. Throws std::runtime_error if no session is active.
void sendContextualUpdate(const std::string& content);
private:
// Worker-thread body: connect, handshake, then read/dispatch messages.
void run();
// Dispatches one parsed server message to the matching handler/callback.
void handleMessage(const nlohmann::json& message);
// Builds the wss endpoint URL for this agent.
std::string getWssUrl() const;
// networking members
boost::asio::io_context ioc_;
boost::asio::ssl::context sslCtx_{boost::asio::ssl::context::tlsv12_client};
using tcp = boost::asio::ip::tcp;
using websocket_t = boost::beast::websocket::stream<
boost::beast::ssl_stream<tcp::socket>>;
std::unique_ptr<websocket_t> ws_;
// general state
std::string agentId_;
bool requiresAuth_;
std::shared_ptr<AudioInterface> audioInterface_;
CallbackAgentResponse callbackAgentResponse_;
CallbackAgentResponseCorrection callbackAgentResponseCorrection_;
CallbackUserTranscript callbackUserTranscript_;
CallbackLatencyMeasurement callbackLatencyMeasurement_;
std::thread workerThread_;
// Set to ask the read loop to exit.
std::atomic<bool> shouldStop_{false};
std::string conversationId_;
// Highest interruption event id seen; audio events at or below it are dropped.
std::atomic<int> lastInterruptId_{0};
};

View File

@ -1,45 +0,0 @@
#pragma once
#include "AudioInterface.hpp"
#include <portaudio.h>
#include <mutex>
#include <condition_variable>
#include <queue>
#include <thread>
#include <atomic>
// PortAudio-backed AudioInterface: captures 16-bit PCM mono microphone audio
// at 16 kHz via a PortAudio callback, and plays queued agent audio on a
// dedicated output thread.
class DefaultAudioInterface : public AudioInterface {
public:
static constexpr int INPUT_FRAMES_PER_BUFFER = 4000; // 250ms @ 16kHz
static constexpr int OUTPUT_FRAMES_PER_BUFFER = 1000; // 62.5ms @ 16kHz
// Initializes the PortAudio library; throws std::runtime_error on failure.
DefaultAudioInterface();
~DefaultAudioInterface() override;
void start(AudioCallback inputCallback) override;
void stop() override;
void output(const std::vector<char>& audio) override;
void interrupt() override;
private:
// PortAudio C-callback trampoline; userData is the DefaultAudioInterface*.
static int inputCallbackStatic(const void* input, void* output, unsigned long frameCount,
const PaStreamCallbackTimeInfo* timeInfo, PaStreamCallbackFlags statusFlags,
void* userData);
// Member-side input handler, invoked on the PortAudio audio thread.
int inputCallbackInternal(const void* input, unsigned long frameCount);
// Body of the playback thread: drains outputQueue_ into the output stream.
void outputThreadFunc();
PaStream* inputStream_{};
PaStream* outputStream_{};
AudioCallback inputCallback_;
// Pending agent audio chunks, guarded by queueMutex_ / signalled via queueCv_.
std::queue<std::vector<char>> outputQueue_;
std::mutex queueMutex_;
std::condition_variable queueCv_;
std::thread outputThread_;
std::atomic<bool> shouldStop_{false};
// True while a chunk is being written to the speakers; the input callback
// checks it to suppress the microphone and avoid echo feedback.
std::atomic<bool> outputPlaying_{false};
};

View File

@ -1,230 +0,0 @@
#include "Conversation.hpp"
#include <boost/beast/websocket/ssl.hpp>
#include <boost/beast/websocket.hpp>
#include <boost/beast/ssl.hpp>
#include <boost/beast/core/detail/base64.hpp>
#include <boost/asio/connect.hpp>
#include <boost/algorithm/string.hpp>
#include <iostream>
#include <sstream>
#include <openssl/ssl.h>
using tcp = boost::asio::ip::tcp;
namespace ssl = boost::asio::ssl;
namespace websocket = boost::beast::websocket;
namespace beast = boost::beast;
// Encode a raw byte buffer as a base64 string using Beast's detail helpers.
static std::string base64Encode(const std::vector<char>& data) {
    std::string encoded(beast::detail::base64::encoded_size(data.size()), '\0');
    beast::detail::base64::encode(&encoded[0], data.data(), data.size());
    return encoded;
}
// Decode a base64 string to raw bytes, trimming the buffer to the number of
// bytes actually produced.
static std::vector<char> base64Decode(const std::string& str) {
    std::vector<char> decoded(beast::detail::base64::decoded_size(str.size()));
    const auto written = beast::detail::base64::decode(decoded.data(), str.data(), str.size()).first;
    decoded.resize(written);
    return decoded;
}
// Render a JSON scalar as text: strings pass through unchanged, integers are
// printed in decimal, and anything else falls back to its JSON serialization.
static std::string toString(const nlohmann::json& j) {
    if (j.is_string()) {
        return j.get<std::string>();
    }
    if (j.is_number_integer()) {
        return std::to_string(j.get<int64_t>());
    }
    return j.dump();
}
// Stores the agent configuration and optional callbacks. Purely local setup:
// no network activity happens until startSession() is called.
Conversation::Conversation(const std::string& agentId, bool requiresAuth,
std::shared_ptr<AudioInterface> audioInterface,
CallbackAgentResponse callbackAgentResponse,
CallbackAgentResponseCorrection callbackAgentResponseCorrection,
CallbackUserTranscript callbackUserTranscript,
CallbackLatencyMeasurement callbackLatencyMeasurement)
: agentId_(agentId),
requiresAuth_(requiresAuth),
audioInterface_(std::move(audioInterface)),
callbackAgentResponse_(std::move(callbackAgentResponse)),
callbackAgentResponseCorrection_(std::move(callbackAgentResponseCorrection)),
callbackUserTranscript_(std::move(callbackUserTranscript)),
callbackLatencyMeasurement_(std::move(callbackLatencyMeasurement)) {
// Trust the system CA certificate store for TLS verification.
sslCtx_.set_default_verify_paths();
}
// Ensure the session is fully torn down (websocket closed, audio stopped,
// worker joined) before members are destroyed.
Conversation::~Conversation() {
endSession();
}
// Launches the session on a background worker thread; returns immediately.
void Conversation::startSession() {
shouldStop_.store(false);
workerThread_ = std::thread(&Conversation::run, this);
}
// Requests shutdown: flags the read loop to stop, closes the websocket,
// stops audio I/O, and joins the worker thread. Safe to call repeatedly
// (join is guarded by joinable()).
// NOTE(review): close() runs on the caller's thread while run() may be
// blocked in read() on the worker thread; Beast streams are not documented
// as thread-safe for concurrent operations — confirm this overlap is
// acceptable or route the close through the worker.
void Conversation::endSession() {
shouldStop_.store(true);
if (ws_) {
beast::error_code ec;
// Error-code overload: best-effort close that never throws.
ws_->close(websocket::close_code::normal, ec);
}
if (audioInterface_) {
audioInterface_->stop();
}
if (workerThread_.joinable()) {
workerThread_.join();
}
}
// Blocks until the worker thread finishes (e.g. the server closed the
// socket), then returns the conversation id assigned by the server.
// Returns an empty string if initiation metadata was never received.
std::string Conversation::waitForSessionEnd() {
if (workerThread_.joinable()) {
workerThread_.join();
}
return conversationId_;
}
void Conversation::sendUserMessage(const std::string& text) {
if (!ws_) {
throw std::runtime_error("Session not started");
}
nlohmann::json j = {
{"type", "user_message"},
{"text", text}
};
ws_->write(boost::asio::buffer(j.dump()));
}
void Conversation::registerUserActivity() {
if (!ws_) throw std::runtime_error("Session not started");
nlohmann::json j = {{"type", "user_activity"}};
ws_->write(boost::asio::buffer(j.dump()));
}
void Conversation::sendContextualUpdate(const std::string& content) {
if (!ws_) throw std::runtime_error("Session not started");
nlohmann::json j = {{"type", "contextual_update"}, {"content", content}};
ws_->write(boost::asio::buffer(j.dump()));
}
// Build the websocket endpoint URL for this agent.
// Hard-coded base for the demo; production code would query the ElevenLabs
// environment endpoint instead.
std::string Conversation::getWssUrl() const {
    return "wss://api.elevenlabs.io/v1/convai/conversation?agent_id=" + agentId_;
}
// Worker-thread body: resolve + connect a TCP socket, layer TLS with SNI,
// perform the websocket handshake, send session-initiation data, start
// microphone capture, then loop reading and dispatching server messages
// until shouldStop_ is set or the connection errors. All failures are
// reported to stderr; the thread exits cleanly either way.
void Conversation::run() {
try {
auto url = getWssUrl();
std::string protocol, host, target;
unsigned short port = 443;
// Very naive parse: wss://host[:port]/path?query
if (boost::starts_with(url, "wss://")) {
protocol = "wss";
host = url.substr(6);
} else {
throw std::runtime_error("Only wss:// URLs supported in this demo");
}
// Split off the request target (path + query) from the authority.
auto slashPos = host.find('/');
if (slashPos == std::string::npos) {
target = "/";
} else {
target = host.substr(slashPos);
host = host.substr(0, slashPos);
}
// Optional explicit port; defaults to 443.
auto colonPos = host.find(':');
if (colonPos != std::string::npos) {
port = static_cast<unsigned short>(std::stoi(host.substr(colonPos + 1)));
host = host.substr(0, colonPos);
}
tcp::resolver resolver(ioc_);
auto const results = resolver.resolve(host, std::to_string(port));
beast::ssl_stream<tcp::socket> stream(ioc_, sslCtx_);
boost::asio::connect(beast::get_lowest_layer(stream), results);
// SNI is required so the server presents the correct certificate.
if (!SSL_set_tlsext_host_name(stream.native_handle(), host.c_str())) {
throw std::runtime_error("Failed to set SNI hostname on SSL stream");
}
stream.handshake(ssl::stream_base::client);
ws_ = std::make_unique<websocket_t>(std::move(stream));
ws_->set_option(websocket::stream_base::timeout::suggested(beast::role_type::client));
ws_->handshake(host, target);
// send initiation data
nlohmann::json init = {
{"type", "conversation_initiation_client_data"},
{"custom_llm_extra_body", nlohmann::json::object()},
{"conversation_config_override", nlohmann::json::object()},
{"dynamic_variables", nlohmann::json::object()}
};
ws_->write(boost::asio::buffer(init.dump()));
// Prepare audio callback
// NOTE(review): this callback writes from the audio thread while this
// thread reads below (and handleMessage writes pongs) — confirm the
// websocket is never written concurrently from two threads.
auto inputCb = [this](const std::vector<char>& audio) {
nlohmann::json msg = {
{"user_audio_chunk", base64Encode(audio)}
};
ws_->write(boost::asio::buffer(msg.dump()));
};
audioInterface_->start(inputCb);
// Read loop: one server message per iteration.
beast::flat_buffer buffer;
while (!shouldStop_.load()) {
beast::error_code ec;
ws_->read(buffer, ec);
if (ec) {
std::cerr << "Websocket read error: " << ec.message() << std::endl;
break;
}
auto text = beast::buffers_to_string(buffer.data());
buffer.consume(buffer.size());
try {
auto message = nlohmann::json::parse(text);
handleMessage(message);
} catch (const std::exception& ex) {
std::cerr << "JSON parse error: " << ex.what() << std::endl;
}
}
} catch (const std::exception& ex) {
std::cerr << "Conversation error: " << ex.what() << std::endl;
}
}
// Dispatch one parsed server message by its "type" field to the matching
// handler or user callback. Messages with unknown types are ignored.
// Fix: sub-objects are now bound by const reference — `auto event = ...`
// deep-copied each nlohmann::json subtree (including base64 audio payloads)
// on every message.
void Conversation::handleMessage(const nlohmann::json& message) {
    const std::string type = message.value("type", "");
    if (type == "conversation_initiation_metadata") {
        // Remember the server-assigned conversation id for waitForSessionEnd().
        conversationId_ = message["conversation_initiation_metadata_event"]["conversation_id"].get<std::string>();
    } else if (type == "audio") {
        const auto& event = message["audio_event"];
        const int eventId = std::stoi(toString(event["event_id"]));
        // Drop audio belonging to an agent turn the user already interrupted.
        if (eventId <= lastInterruptId_.load()) return;
        auto audioBytes = base64Decode(event["audio_base_64"].get<std::string>());
        audioInterface_->output(audioBytes);
    } else if (type == "agent_response" && callbackAgentResponse_) {
        const auto& event = message["agent_response_event"];
        callbackAgentResponse_(event["agent_response"].get<std::string>());
    } else if (type == "agent_response_correction" && callbackAgentResponseCorrection_) {
        const auto& event = message["agent_response_correction_event"];
        callbackAgentResponseCorrection_(event["original_agent_response"].get<std::string>(),
                                         event["corrected_agent_response"].get<std::string>());
    } else if (type == "user_transcript" && callbackUserTranscript_) {
        const auto& event = message["user_transcription_event"];
        callbackUserTranscript_(event["user_transcript"].get<std::string>());
    } else if (type == "interruption") {
        const auto& event = message["interruption_event"];
        lastInterruptId_.store(std::stoi(toString(event["event_id"])));
        audioInterface_->interrupt();
    } else if (type == "ping") {
        // Echo the event id back as a pong; optionally report measured latency.
        const auto& event = message["ping_event"];
        nlohmann::json pong = {{"type", "pong"}, {"event_id", event["event_id"]}};
        ws_->write(boost::asio::buffer(pong.dump()));
        if (callbackLatencyMeasurement_ && event.contains("ping_ms")) {
            const int latency = event["ping_ms"].is_number() ? event["ping_ms"].get<int>() : std::stoi(event["ping_ms"].get<std::string>());
            callbackLatencyMeasurement_(latency);
        }
    }
    // Note: client tool call handling omitted for brevity.
}

View File

@ -1,131 +0,0 @@
#include "DefaultAudioInterface.hpp"

#include <cstring>
#include <iostream>
#include <stdexcept>
#include <string>
// Initialize the PortAudio library; a successful Pa_Initialize() is paired
// with the Pa_Terminate() in the destructor.
// Fix: include PortAudio's own diagnostic text so failures are actionable
// instead of the previous bare "initialization failed" message.
DefaultAudioInterface::DefaultAudioInterface() {
    const PaError err = Pa_Initialize();
    if (err != paNoError) {
        throw std::runtime_error(std::string("PortAudio initialization failed: ") + Pa_GetErrorText(err));
    }
}
// Stop audio if still running, then shut PortAudio down; pairs with the
// Pa_Initialize() in the constructor.
DefaultAudioInterface::~DefaultAudioInterface() {
// shouldStop_ is only set by stop(), so false here means stop() was never
// called (or start() was never called — stop() is harmless in that case).
if (!shouldStop_.load()) {
stop();
}
Pa_Terminate();
}
// Open and start full-duplex 16 kHz / 16-bit mono PortAudio streams and spawn
// the playback thread. Captured audio is delivered through inputCallback on
// the PortAudio audio thread.
// Throws std::runtime_error if no default device exists or any PortAudio call
// fails. Fixes over the previous version:
//  - Pa_GetDefaultInput/OutputDevice() can return paNoDevice, which made
//    Pa_GetDeviceInfo(...) return nullptr and deref crash; now checked.
//  - A failure after the first Pa_OpenStream leaked the opened stream(s);
//    now every stream opened so far is closed before throwing.
//  - Error messages include Pa_GetErrorText() diagnostics.
void DefaultAudioInterface::start(AudioCallback inputCallback) {
    inputCallback_ = std::move(inputCallback);

    // Release any streams opened before a later step failed.
    auto cleanupOnFailure = [this]() {
        if (inputStream_) {
            Pa_CloseStream(inputStream_);  // Pa_CloseStream stops a running stream first
            inputStream_ = nullptr;
        }
        if (outputStream_) {
            Pa_CloseStream(outputStream_);
            outputStream_ = nullptr;
        }
    };

    PaStreamParameters inputParams;
    std::memset(&inputParams, 0, sizeof(inputParams));
    inputParams.channelCount = 1;
    inputParams.device = Pa_GetDefaultInputDevice();
    inputParams.sampleFormat = paInt16;
    inputParams.hostApiSpecificStreamInfo = nullptr;
    const PaDeviceInfo* inputInfo =
        (inputParams.device != paNoDevice) ? Pa_GetDeviceInfo(inputParams.device) : nullptr;
    if (!inputInfo) {
        throw std::runtime_error("No default audio input device available");
    }
    inputParams.suggestedLatency = inputInfo->defaultLowInputLatency;

    PaStreamParameters outputParams;
    std::memset(&outputParams, 0, sizeof(outputParams));
    outputParams.channelCount = 1;
    outputParams.device = Pa_GetDefaultOutputDevice();
    outputParams.sampleFormat = paInt16;
    outputParams.hostApiSpecificStreamInfo = nullptr;
    const PaDeviceInfo* outputInfo =
        (outputParams.device != paNoDevice) ? Pa_GetDeviceInfo(outputParams.device) : nullptr;
    if (!outputInfo) {
        throw std::runtime_error("No default audio output device available");
    }
    outputParams.suggestedLatency = outputInfo->defaultLowOutputLatency;

    PaError err = Pa_OpenStream(&inputStream_, &inputParams, nullptr, 16000, INPUT_FRAMES_PER_BUFFER, paClipOff,
                                &DefaultAudioInterface::inputCallbackStatic, this);
    if (err != paNoError) {
        throw std::runtime_error(std::string("Failed to open input stream: ") + Pa_GetErrorText(err));
    }
    err = Pa_OpenStream(&outputStream_, nullptr, &outputParams, 16000, OUTPUT_FRAMES_PER_BUFFER, paClipOff, nullptr, nullptr);
    if (err != paNoError) {
        cleanupOnFailure();
        throw std::runtime_error(std::string("Failed to open output stream: ") + Pa_GetErrorText(err));
    }
    if ((err = Pa_StartStream(inputStream_)) != paNoError) {
        cleanupOnFailure();
        throw std::runtime_error(std::string("Failed to start input stream: ") + Pa_GetErrorText(err));
    }
    if ((err = Pa_StartStream(outputStream_)) != paNoError) {
        cleanupOnFailure();
        throw std::runtime_error(std::string("Failed to start output stream: ") + Pa_GetErrorText(err));
    }

    // Launch the playback thread that drains outputQueue_.
    shouldStop_.store(false);
    outputThread_ = std::thread(&DefaultAudioInterface::outputThreadFunc, this);
}
// Stops the playback thread and closes both PortAudio streams. Idempotent:
// a second call finds the stream pointers null and only re-joins/notifies.
void DefaultAudioInterface::stop() {
shouldStop_.store(true);
// Wake the output thread so it observes shouldStop_ and exits.
queueCv_.notify_all();
if (outputThread_.joinable()) {
outputThread_.join();
}
if (inputStream_) {
Pa_StopStream(inputStream_);
Pa_CloseStream(inputStream_);
inputStream_ = nullptr;
}
if (outputStream_) {
Pa_StopStream(outputStream_);
Pa_CloseStream(outputStream_);
outputStream_ = nullptr;
}
}
// Enqueue an audio chunk for the playback thread; wakes it if it is waiting.
void DefaultAudioInterface::output(const std::vector<char>& audio) {
    {
        std::lock_guard<std::mutex> lock(queueMutex_);
        outputQueue_.push(audio);
    }
    // Notify outside the lock so the woken thread can acquire it immediately.
    queueCv_.notify_one();
}
// Discard all queued-but-unplayed audio. A chunk already handed to
// Pa_WriteStream keeps playing to completion.
void DefaultAudioInterface::interrupt() {
    std::lock_guard<std::mutex> lock(queueMutex_);
    outputQueue_ = std::queue<std::vector<char>>{};
}
int DefaultAudioInterface::inputCallbackStatic(const void* input, void* /*output*/, unsigned long frameCount,
const PaStreamCallbackTimeInfo* /*timeInfo*/, PaStreamCallbackFlags /*statusFlags*/,
void* userData) {
auto* self = static_cast<DefaultAudioInterface*>(userData);
return self->inputCallbackInternal(input, frameCount);
}
// Forward one captured buffer (frameCount 16-bit mono samples) to the user
// callback. Runs on the PortAudio audio thread.
int DefaultAudioInterface::inputCallbackInternal(const void* input, unsigned long frameCount) {
    if (input == nullptr || !inputCallback_) {
        return paContinue;
    }
    // Suppress microphone input while playing output to avoid echo feedback.
    if (outputPlaying_.load()) {
        return paContinue;
    }
    const auto* raw = static_cast<const char*>(input);
    const std::vector<char> chunk(raw, raw + frameCount * sizeof(int16_t));
    inputCallback_(chunk);
    return paContinue;
}
// Playback loop: waits for audio chunks on outputQueue_ and writes them to
// the PortAudio output stream. Runs until stop() sets shouldStop_.
void DefaultAudioInterface::outputThreadFunc() {
while (!shouldStop_.load()) {
std::vector<char> audio;
{
std::unique_lock<std::mutex> lk(queueMutex_);
// Sleep until there is audio to play or we are asked to stop.
queueCv_.wait(lk, [this] { return shouldStop_.load() || !outputQueue_.empty(); });
if (shouldStop_.load()) break;
audio = std::move(outputQueue_.front());
outputQueue_.pop();
}
if (!audio.empty() && outputStream_) {
// Flag playback so the input callback drops microphone data (simple
// echo suppression); Pa_WriteStream blocks until the chunk is written.
outputPlaying_.store(true);
Pa_WriteStream(outputStream_, audio.data(), audio.size() / sizeof(int16_t));
outputPlaying_.store(false);
}
}
}

View File

@ -1,31 +0,0 @@
#include "Conversation.hpp"
#include "DefaultAudioInterface.hpp"
#include <cstdlib>
#include <iostream>
#include <memory>
// Demo entry point: reads AGENT_ID from the environment, wires the default
// audio device to a Conversation, and runs until the user presses Enter.
int main() {
    const char* agentIdEnv = std::getenv("AGENT_ID");
    if (agentIdEnv == nullptr) {
        std::cerr << "AGENT_ID environment variable must be set" << std::endl;
        return 1;
    }
    const std::string agentId{agentIdEnv};

    auto audioInterface = std::make_shared<DefaultAudioInterface>();

    // Console-printing callbacks for the three transcript event types.
    auto onAgentResponse = [](const std::string& resp) {
        std::cout << "Agent: " << resp << std::endl;
    };
    auto onAgentCorrection = [](const std::string& orig, const std::string& corrected) {
        std::cout << "Agent correction: " << orig << " -> " << corrected << std::endl;
    };
    auto onUserTranscript = [](const std::string& transcript) {
        std::cout << "User: " << transcript << std::endl;
    };

    Conversation conv(agentId, /*requiresAuth*/ false, audioInterface,
                      onAgentResponse, onAgentCorrection, onUserTranscript);

    conv.startSession();
    std::cout << "Press Enter to quit..." << std::endl;
    std::cin.get();
    conv.endSession();

    auto convId = conv.waitForSessionEnd();
    std::cout << "Conversation ID: " << convId << std::endl;
    return 0;
}

Some files were not shown because too many files have changed in this diff Show More