From bbeb4294a8fbda069ba0fb2c220622e820b46a42 Mon Sep 17 00:00:00 2001 From: "j.foucher" Date: Thu, 19 Feb 2026 13:35:35 +0100 Subject: [PATCH] Add ElevenLabs API reference doc for future Claude sessions Covers: WebSocket protocol (all message types), Agent ID location, Signed URL auth, REST agents API, audio format, UE5 integration notes. Co-Authored-By: Claude Opus 4.6 --- .claude/elevenlabs_api_reference.md | 463 ++++++++++++++++++++++++++++ 1 file changed, 463 insertions(+) create mode 100644 .claude/elevenlabs_api_reference.md diff --git a/.claude/elevenlabs_api_reference.md b/.claude/elevenlabs_api_reference.md new file mode 100644 index 0000000..0195ba7 --- /dev/null +++ b/.claude/elevenlabs_api_reference.md @@ -0,0 +1,463 @@ +# ElevenLabs Conversational AI – API Reference +> Saved for Claude Code sessions. Auto-loaded via `.claude/` directory. +> Last updated: 2026-02-19

---

## 1. Agent ID — Where to Find It

### In the Dashboard (UI)
1. Go to **https://elevenlabs.io/app/conversational-ai**
2. Click on your agent to open it
3. The **Agent ID** is shown in the agent settings page — typically in the URL bar and/or in the agent's "General" settings tab
   - URL pattern: `https://elevenlabs.io/app/conversational-ai/agents/{agent_id}`
   - Also visible in the "API" or "Overview" tab of the agent editor (copy button available)

### Via API
```http
GET https://api.elevenlabs.io/v1/convai/agents
xi-api-key: YOUR_API_KEY
```
Returns a list of all agents with their `agent_id` strings.

### Via API (single agent)
```http
GET https://api.elevenlabs.io/v1/convai/agents/{agent_id}
xi-api-key: YOUR_API_KEY
```

### Agent ID Format
- Type: `string`
- Returned on agent creation via `POST /v1/convai/agents/create`
- Used as URL path param and WebSocket query param throughout the API

---

## 2. 
WebSocket Conversational AI

### Connection URL
```
wss://api.elevenlabs.io/v1/convai/conversation?agent_id={agent_id}
```

Regional alternatives:
| Region | URL |
|--------|-----|
| Default (Global) | `wss://api.elevenlabs.io/` |
| US | `wss://api.us.elevenlabs.io/` |
| EU | `wss://api.eu.residency.elevenlabs.io/` |
| India | `wss://api.in.residency.elevenlabs.io/` |

### Authentication
- **Public agents**: No key required, just `agent_id` query param
- **Private agents**: Use a **Signed URL** (see Section 4) instead of direct `agent_id`
- **Server-side** (backend): Pass `xi-api-key` as an HTTP upgrade header

```
Headers:
  xi-api-key: YOUR_API_KEY
```

> ⚠️ Never expose your API key client-side. For browser/mobile apps, use Signed URLs.

---

## 3. WebSocket Protocol — Message Reference

### Audio Format
- **Input (mic → server)**: PCM 16-bit signed, **16000 Hz**, mono, little-endian, Base64-encoded
- **Output (server → client)**: Base64-encoded audio (format specified in `conversation_initiation_metadata`)

---

### Messages FROM Server (Subscribe / Receive)

#### `conversation_initiation_metadata`
Sent immediately after connection. Contains conversation ID and audio format specs.
```json
{
  "type": "conversation_initiation_metadata",
  "conversation_initiation_metadata_event": {
    "conversation_id": "string",
    "agent_output_audio_format": "pcm_16000 | mp3_44100 | ...",
    "user_input_audio_format": "pcm_16000"
  }
}
```

#### `audio`
Agent speech audio chunk.
```json
{
  "type": "audio",
  "audio_event": {
    "audio_base_64": "BASE64_PCM_BYTES",
    "event_id": 42
  }
}
```

#### `user_transcript`
Transcribed text of what the user said.
```json
{
  "type": "user_transcript",
  "user_transcription_event": {
    "user_transcript": "Hello, how are you?"
  }
}
```

#### `agent_response`
The text the agent is saying (arrives in parallel with audio). 
+```json +{ + "type": "agent_response", + "agent_response_event": { + "agent_response": "I'm doing great, thanks!" + } +} +``` + +#### `agent_response_correction` +Sent after an interruption — shows what was truncated. +```json +{ + "type": "agent_response_correction", + "agent_response_correction_event": { + "original_agent_response": "string", + "corrected_agent_response": "string" + } +} +``` + +#### `interruption` +Signals that a specific audio event was interrupted. +```json +{ + "type": "interruption", + "interruption_event": { + "event_id": 42 + } +} +``` + +#### `ping` +Keepalive ping from server. Client must reply with `pong`. +```json +{ + "type": "ping", + "ping_event": { + "event_id": 1, + "ping_ms": 150 + } +} +``` + +#### `client_tool_call` +Requests the client execute a tool (custom tools integration). +```json +{ + "type": "client_tool_call", + "client_tool_call": { + "tool_name": "string", + "tool_call_id": "string", + "parameters": {} + } +} +``` + +#### `contextual_update` +Text context added to conversation state (non-interrupting). +```json +{ + "type": "contextual_update", + "contextual_update_event": { + "text": "string" + } +} +``` + +#### `vad_score` +Voice Activity Detection confidence score (0.0–1.0). +```json +{ + "type": "vad_score", + "vad_score_event": { + "vad_score": 0.85 + } +} +``` + +#### `internal_tentative_agent_response` +Preliminary agent text during LLM generation (not final). +```json +{ + "type": "internal_tentative_agent_response", + "tentative_agent_response_internal_event": { + "tentative_agent_response": "string" + } +} +``` + +--- + +### Messages TO Server (Publish / Send) + +#### `user_audio_chunk` +Microphone audio data. Send continuously during user speech. +```json +{ + "user_audio_chunk": "BASE64_PCM_16BIT_16KHZ_MONO" +} +``` +Audio must be: **PCM 16-bit signed, 16000 Hz, mono, little-endian**, then Base64-encoded. + +#### `pong` +Reply to server `ping` to keep connection alive. 
+```json +{ + "type": "pong", + "event_id": 1 +} +``` + +#### `conversation_initiation_client_data` +Override agent configuration at connection time. Send before or just after connecting. +```json +{ + "type": "conversation_initiation_client_data", + "conversation_config_override": { + "agent": { + "prompt": { "prompt": "Custom system prompt override" }, + "first_message": "Hello! How can I help?", + "language": "en" + }, + "tts": { + "voice_id": "string", + "speed": 1.0, + "stability": 0.5, + "similarity_boost": 0.75 + } + }, + "dynamic_variables": { + "user_name": "Alice", + "session_id": 12345 + } +} +``` + +Config override ranges: +- `tts.speed`: 0.7 – 1.2 +- `tts.stability`: 0.0 – 1.0 +- `tts.similarity_boost`: 0.0 – 1.0 + +#### `client_tool_result` +Response to a `client_tool_call` from the server. +```json +{ + "type": "client_tool_result", + "tool_call_id": "string", + "result": "tool output string", + "is_error": false +} +``` + +#### `contextual_update` +Inject context without interrupting the conversation. +```json +{ + "type": "contextual_update", + "text": "User just entered room 4B" +} +``` + +#### `user_message` +Send a text message (no mic audio needed). +```json +{ + "type": "user_message", + "text": "What is the weather like?" +} +``` + +#### `user_activity` +Signal that user is active (for turn detection in client mode). +```json +{ + "type": "user_activity" +} +``` + +--- + +## 4. Signed URL (Private Agents) + +Used for browser/mobile clients to authenticate without exposing the API key. + +### Flow +1. **Backend** calls ElevenLabs API to get a temporary signed URL +2. Backend returns signed URL to client +3. 
**Client** opens WebSocket to the signed URL (no API key needed)

### Get Signed URL
```http
GET https://api.elevenlabs.io/v1/convai/conversation/get-signed-url?agent_id={agent_id}
xi-api-key: YOUR_API_KEY
```

Optional query params:
- `include_conversation_id=true` — generates unique conversation ID, prevents URL reuse
- `branch_id` — specific agent branch

Response:
```json
{
  "signed_url": "wss://api.elevenlabs.io/v1/convai/conversation?agent_id=...&token=..."
}
```

Client connects to `signed_url` directly — no headers needed.

---

## 5. Agents REST API

Base URL: `https://api.elevenlabs.io`
Auth header: `xi-api-key: YOUR_API_KEY`

### Create Agent
```http
POST /v1/convai/agents/create
Content-Type: application/json

{
  "name": "My NPC Agent",
  "conversation_config": {
    "agent": {
      "first_message": "Hello adventurer!",
      "prompt": { "prompt": "You are a wise tavern keeper in a fantasy world." },
      "language": "en"
    }
  }
}
```
Response includes `agent_id`.

### List Agents
```http
GET /v1/convai/agents?page_size=30&search=&sort_by=created_at&sort_direction=desc
```
Response:
```json
{
  "agents": [
    {
      "agent_id": "abc123xyz",
      "name": "My NPC Agent",
      "created_at_unix_secs": 1708300000,
      "last_call_time_unix_secs": null,
      "archived": false,
      "tags": []
    }
  ],
  "has_more": false,
  "next_cursor": null
}
```

### Get Agent
```http
GET /v1/convai/agents/{agent_id}
```

### Update Agent
```http
PATCH /v1/convai/agents/{agent_id}
Content-Type: application/json
{ "name": "Updated Name", "conversation_config": { ... } }
```

### Delete Agent
```http
DELETE /v1/convai/agents/{agent_id}
```

---

## 6. 
Turn Modes

### Server VAD (Default / Recommended)
- ElevenLabs server detects when user stops speaking
- Client streams audio continuously
- Server handles all turn-taking automatically

### Client Turn Mode
- Client explicitly signals turn boundaries
- Send `user_activity` to indicate user is speaking
- Use when you have your own VAD or push-to-talk UI

---

## 7. Audio Pipeline (UE5 Implementation Notes)

```
Microphone (FAudioCapture)
  → float32 samples at device rate (e.g. 44100 Hz stereo)
  → Resample to 16000 Hz mono
  → Convert float32 → int16 little-endian
  → Base64-encode
  → Send as {"user_audio_chunk": "BASE64"}

Server → {"type":"audio","audio_event":{"audio_base_64":"BASE64"}}
  → Base64-decode
  → Raw PCM bytes
  → Push to USoundWaveProcedural
  → UAudioComponent plays back
```

### Float32 → Int16 Conversion (C++)
```cpp
static TArray<uint8> FloatPCMToInt16Bytes(const TArray<float>& FloatSamples)
{
    TArray<uint8> Bytes;
    Bytes.SetNumUninitialized(FloatSamples.Num() * 2);
    for (int32 i = 0; i < FloatSamples.Num(); i++)
    {
        float Clamped = FMath::Clamp(FloatSamples[i], -1.f, 1.f);
        int16 Sample = (int16)(Clamped * 32767.f);
        Bytes[i * 2] = (uint8)(Sample & 0xFF);            // Low byte
        Bytes[i * 2 + 1] = (uint8)((Sample >> 8) & 0xFF); // High byte
    }
    return Bytes;
}
```

---

## 8. Quick Integration Checklist (UE5 Plugin)

- [ ] Set `AgentID` in `UElevenLabsSettings` (Project Settings → ElevenLabs AI Agent)
  - Or override per-component via `UElevenLabsConversationalAgentComponent::AgentID`
- [ ] Set `API_Key` in settings (or leave empty for public agents)
- [ ] Add `UElevenLabsConversationalAgentComponent` to your NPC actor
- [ ] Set `TurnMode` (default: `Server` — recommended)
- [ ] Bind to events: `OnAgentConnected`, `OnAgentTranscript`, `OnAgentTextResponse`, `OnAgentStartedSpeaking`, `OnAgentStoppedSpeaking`
- [ ] Call `StartConversation()` to begin
- [ ] Call `EndConversation()` when done

---

## 9. 
Key API URLs Reference + +| Purpose | URL | +|---------|-----| +| Dashboard | https://elevenlabs.io/app/conversational-ai | +| API Keys | https://elevenlabs.io/app/settings/api-keys | +| WebSocket endpoint | wss://api.elevenlabs.io/v1/convai/conversation | +| Agents list | GET https://api.elevenlabs.io/v1/convai/agents | +| Agent by ID | GET https://api.elevenlabs.io/v1/convai/agents/{agent_id} | +| Create agent | POST https://api.elevenlabs.io/v1/convai/agents/create | +| Signed URL | GET https://api.elevenlabs.io/v1/convai/conversation/get-signed-url | +| WS protocol docs | https://elevenlabs.io/docs/eleven-agents/api-reference/eleven-agents/websocket | +| Quickstart | https://elevenlabs.io/docs/eleven-agents/quickstart |