Add ez-assistant and kerberos service folders
This commit is contained in:
@@ -0,0 +1,73 @@
|
||||
# Changelog
|
||||
|
||||
## 2026.1.26
|
||||
|
||||
### Changes
|
||||
- Breaking: voice-call TTS now uses core `messages.tts` (plugin TTS config deep‑merges with core).
|
||||
- Telephony TTS supports OpenAI + ElevenLabs; Edge TTS is ignored for calls.
|
||||
- Removed legacy `tts.model`/`tts.voice`/`tts.instructions` plugin fields.
|
||||
- Ngrok free-tier bypass renamed to `tunnel.allowNgrokFreeTierLoopbackBypass` and gated to loopback + `tunnel.provider="ngrok"`.
|
||||
|
||||
## 2026.1.23
|
||||
|
||||
### Changes
|
||||
- Version alignment with core Moltbot release numbers.
|
||||
|
||||
## 2026.1.22
|
||||
|
||||
### Changes
|
||||
- Version alignment with core Moltbot release numbers.
|
||||
|
||||
## 2026.1.21
|
||||
|
||||
### Changes
|
||||
- Version alignment with core Moltbot release numbers.
|
||||
|
||||
## 2026.1.20
|
||||
|
||||
### Changes
|
||||
- Version alignment with core Moltbot release numbers.
|
||||
|
||||
## 2026.1.17-1
|
||||
|
||||
### Changes
|
||||
- Version alignment with core Moltbot release numbers.
|
||||
|
||||
## 2026.1.17
|
||||
|
||||
### Changes
|
||||
- Version alignment with core Moltbot release numbers.
|
||||
|
||||
## 2026.1.16
|
||||
|
||||
### Changes
|
||||
- Version alignment with core Moltbot release numbers.
|
||||
|
||||
## 2026.1.15
|
||||
|
||||
### Changes
|
||||
- Version alignment with core Moltbot release numbers.
|
||||
|
||||
## 2026.1.14
|
||||
|
||||
### Changes
|
||||
- Version alignment with core Moltbot release numbers.
|
||||
|
||||
## 0.1.0
|
||||
|
||||
### Highlights
|
||||
- First public release of the @moltbot/voice-call plugin.
|
||||
|
||||
### Features
|
||||
- Providers: Twilio (Programmable Voice + Media Streams), Telnyx (Call Control v2), and mock provider for local dev.
|
||||
- Call flows: outbound notify vs. conversation modes, configurable auto‑hangup, and multi‑turn continuation.
|
||||
- Inbound handling: policy controls (disabled/allowlist/open), allowlist matching, and inbound greeting.
|
||||
- Webhooks: built‑in server with configurable bind/port/path plus `publicUrl` override.
|
||||
- Exposure helpers: ngrok + Tailscale serve/funnel; dev‑only signature bypass for ngrok free tier.
|
||||
- Streaming: OpenAI Realtime STT over media WebSocket with partial + final transcripts.
|
||||
- Speech: OpenAI TTS (model/voice/instructions) with Twilio `<Say>` fallback.
|
||||
- Tooling: `voice_call` tool actions for initiate/continue/speak/end/status.
|
||||
- Gateway RPC: `voicecall.initiate|continue|speak|end|status` (+ legacy `voicecall.start`).
|
||||
- CLI: `moltbot voicecall` commands (call/start/continue/speak/end/status/tail/expose).
|
||||
- Observability: JSONL call logs and `voicecall tail` for live inspection.
|
||||
- Response controls: `responseModel`, `responseSystemPrompt`, and `responseTimeoutMs` for auto‑responses.
|
||||
135
docker-compose/ez-assistant/extensions/voice-call/README.md
Normal file
135
docker-compose/ez-assistant/extensions/voice-call/README.md
Normal file
@@ -0,0 +1,135 @@
|
||||
# @clawdbot/voice-call
|
||||
|
||||
Official Voice Call plugin for **Clawdbot**.
|
||||
|
||||
Providers:
|
||||
- **Twilio** (Programmable Voice + Media Streams)
|
||||
- **Telnyx** (Call Control v2)
|
||||
- **Plivo** (Voice API + XML transfer + GetInput speech)
|
||||
- **Mock** (dev/no network)
|
||||
|
||||
Docs: `https://docs.molt.bot/plugins/voice-call`
|
||||
Plugin system: `https://docs.molt.bot/plugin`
|
||||
|
||||
## Install (local dev)
|
||||
|
||||
### Option A: install via Clawdbot (recommended)
|
||||
|
||||
```bash
|
||||
clawdbot plugins install @clawdbot/voice-call
|
||||
```
|
||||
|
||||
Restart the Gateway afterwards.
|
||||
|
||||
### Option B: copy into your global extensions folder (dev)
|
||||
|
||||
```bash
|
||||
mkdir -p ~/.clawdbot/extensions
|
||||
cp -R extensions/voice-call ~/.clawdbot/extensions/voice-call
|
||||
cd ~/.clawdbot/extensions/voice-call && pnpm install
|
||||
```
|
||||
|
||||
## Config
|
||||
|
||||
Put under `plugins.entries.voice-call.config`:
|
||||
|
||||
```json5
|
||||
{
|
||||
provider: "twilio", // or "telnyx" | "plivo" | "mock"
|
||||
fromNumber: "+15550001234",
|
||||
toNumber: "+15550005678",
|
||||
|
||||
twilio: {
|
||||
accountSid: "ACxxxxxxxx",
|
||||
authToken: "your_token"
|
||||
},
|
||||
|
||||
plivo: {
|
||||
authId: "MAxxxxxxxxxxxxxxxxxxxx",
|
||||
authToken: "your_token"
|
||||
},
|
||||
|
||||
// Webhook server
|
||||
serve: {
|
||||
port: 3334,
|
||||
path: "/voice/webhook"
|
||||
},
|
||||
|
||||
// Public exposure (pick one):
|
||||
// publicUrl: "https://example.ngrok.app/voice/webhook",
|
||||
// tunnel: { provider: "ngrok" },
|
||||
// tailscale: { mode: "funnel", path: "/voice/webhook" }
|
||||
|
||||
outbound: {
|
||||
defaultMode: "notify" // or "conversation"
|
||||
},
|
||||
|
||||
streaming: {
|
||||
enabled: true,
|
||||
streamPath: "/voice/stream"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Notes:
|
||||
- Twilio/Telnyx/Plivo require a **publicly reachable** webhook URL.
|
||||
- `mock` is a local dev provider (no network calls).
|
||||
- `tunnel.allowNgrokFreeTierLoopbackBypass: true` allows Twilio webhooks with invalid signatures **only** when `tunnel.provider="ngrok"` and `serve.bind` is loopback (ngrok local agent). Use for local dev only.
|
||||
|
||||
## TTS for calls
|
||||
|
||||
Voice Call uses the core `messages.tts` configuration (OpenAI or ElevenLabs) for
|
||||
streaming speech on calls. You can override it under the plugin config with the
|
||||
same shape — overrides deep-merge with `messages.tts`.
|
||||
|
||||
```json5
|
||||
{
|
||||
tts: {
|
||||
provider: "openai",
|
||||
openai: {
|
||||
voice: "alloy"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Notes:
|
||||
- Edge TTS is ignored for voice calls (telephony audio needs PCM; Edge output is unreliable).
|
||||
- Core TTS is used when Twilio media streaming is enabled; otherwise calls fall back to provider native voices.
|
||||
|
||||
## CLI
|
||||
|
||||
```bash
|
||||
clawdbot voicecall call --to "+15555550123" --message "Hello from Clawdbot"
|
||||
clawdbot voicecall continue --call-id <id> --message "Any questions?"
|
||||
clawdbot voicecall speak --call-id <id> --message "One moment"
|
||||
clawdbot voicecall end --call-id <id>
|
||||
clawdbot voicecall status --call-id <id>
|
||||
clawdbot voicecall tail
|
||||
clawdbot voicecall expose --mode funnel
|
||||
```
|
||||
|
||||
## Tool
|
||||
|
||||
Tool name: `voice_call`
|
||||
|
||||
Actions:
|
||||
- `initiate_call` (message, to?, mode?)
|
||||
- `continue_call` (callId, message)
|
||||
- `speak_to_user` (callId, message)
|
||||
- `end_call` (callId)
|
||||
- `get_status` (callId)
|
||||
|
||||
## Gateway RPC
|
||||
|
||||
- `voicecall.initiate` (to?, message, mode?)
|
||||
- `voicecall.continue` (callId, message)
|
||||
- `voicecall.speak` (callId, message)
|
||||
- `voicecall.end` (callId)
|
||||
- `voicecall.status` (callId)
|
||||
|
||||
## Notes
|
||||
|
||||
- Uses webhook signature verification for Twilio/Telnyx/Plivo.
|
||||
- `responseModel` / `responseSystemPrompt` control AI auto-responses.
|
||||
- Media streaming requires `ws` and OpenAI Realtime API key.
|
||||
@@ -0,0 +1,601 @@
|
||||
{
|
||||
"id": "voice-call",
|
||||
"uiHints": {
|
||||
"provider": {
|
||||
"label": "Provider",
|
||||
"help": "Use twilio, telnyx, or mock for dev/no-network."
|
||||
},
|
||||
"fromNumber": {
|
||||
"label": "From Number",
|
||||
"placeholder": "+15550001234"
|
||||
},
|
||||
"toNumber": {
|
||||
"label": "Default To Number",
|
||||
"placeholder": "+15550001234"
|
||||
},
|
||||
"inboundPolicy": {
|
||||
"label": "Inbound Policy"
|
||||
},
|
||||
"allowFrom": {
|
||||
"label": "Inbound Allowlist"
|
||||
},
|
||||
"inboundGreeting": {
|
||||
"label": "Inbound Greeting",
|
||||
"advanced": true
|
||||
},
|
||||
"telnyx.apiKey": {
|
||||
"label": "Telnyx API Key",
|
||||
"sensitive": true
|
||||
},
|
||||
"telnyx.connectionId": {
|
||||
"label": "Telnyx Connection ID"
|
||||
},
|
||||
"telnyx.publicKey": {
|
||||
"label": "Telnyx Public Key",
|
||||
"sensitive": true
|
||||
},
|
||||
"twilio.accountSid": {
|
||||
"label": "Twilio Account SID"
|
||||
},
|
||||
"twilio.authToken": {
|
||||
"label": "Twilio Auth Token",
|
||||
"sensitive": true
|
||||
},
|
||||
"outbound.defaultMode": {
|
||||
"label": "Default Call Mode"
|
||||
},
|
||||
"outbound.notifyHangupDelaySec": {
|
||||
"label": "Notify Hangup Delay (sec)",
|
||||
"advanced": true
|
||||
},
|
||||
"serve.port": {
|
||||
"label": "Webhook Port"
|
||||
},
|
||||
"serve.bind": {
|
||||
"label": "Webhook Bind"
|
||||
},
|
||||
"serve.path": {
|
||||
"label": "Webhook Path"
|
||||
},
|
||||
"tailscale.mode": {
|
||||
"label": "Tailscale Mode",
|
||||
"advanced": true
|
||||
},
|
||||
"tailscale.path": {
|
||||
"label": "Tailscale Path",
|
||||
"advanced": true
|
||||
},
|
||||
"tunnel.provider": {
|
||||
"label": "Tunnel Provider",
|
||||
"advanced": true
|
||||
},
|
||||
"tunnel.ngrokAuthToken": {
|
||||
"label": "ngrok Auth Token",
|
||||
"sensitive": true,
|
||||
"advanced": true
|
||||
},
|
||||
"tunnel.ngrokDomain": {
|
||||
"label": "ngrok Domain",
|
||||
"advanced": true
|
||||
},
|
||||
"tunnel.allowNgrokFreeTierLoopbackBypass": {
|
||||
"label": "Allow ngrok Free Tier (Loopback Bypass)",
|
||||
"advanced": true
|
||||
},
|
||||
"streaming.enabled": {
|
||||
"label": "Enable Streaming",
|
||||
"advanced": true
|
||||
},
|
||||
"streaming.openaiApiKey": {
|
||||
"label": "OpenAI Realtime API Key",
|
||||
"sensitive": true,
|
||||
"advanced": true
|
||||
},
|
||||
"streaming.sttModel": {
|
||||
"label": "Realtime STT Model",
|
||||
"advanced": true
|
||||
},
|
||||
"streaming.streamPath": {
|
||||
"label": "Media Stream Path",
|
||||
"advanced": true
|
||||
},
|
||||
"tts.provider": {
|
||||
"label": "TTS Provider Override",
|
||||
"help": "Deep-merges with messages.tts (Edge is ignored for calls).",
|
||||
"advanced": true
|
||||
},
|
||||
"tts.openai.model": {
|
||||
"label": "OpenAI TTS Model",
|
||||
"advanced": true
|
||||
},
|
||||
"tts.openai.voice": {
|
||||
"label": "OpenAI TTS Voice",
|
||||
"advanced": true
|
||||
},
|
||||
"tts.openai.apiKey": {
|
||||
"label": "OpenAI API Key",
|
||||
"sensitive": true,
|
||||
"advanced": true
|
||||
},
|
||||
"tts.elevenlabs.modelId": {
|
||||
"label": "ElevenLabs Model ID",
|
||||
"advanced": true
|
||||
},
|
||||
"tts.elevenlabs.voiceId": {
|
||||
"label": "ElevenLabs Voice ID",
|
||||
"advanced": true
|
||||
},
|
||||
"tts.elevenlabs.apiKey": {
|
||||
"label": "ElevenLabs API Key",
|
||||
"sensitive": true,
|
||||
"advanced": true
|
||||
},
|
||||
"tts.elevenlabs.baseUrl": {
|
||||
"label": "ElevenLabs Base URL",
|
||||
"advanced": true
|
||||
},
|
||||
"publicUrl": {
|
||||
"label": "Public Webhook URL",
|
||||
"advanced": true
|
||||
},
|
||||
"skipSignatureVerification": {
|
||||
"label": "Skip Signature Verification",
|
||||
"advanced": true
|
||||
},
|
||||
"store": {
|
||||
"label": "Call Log Store Path",
|
||||
"advanced": true
|
||||
},
|
||||
"responseModel": {
|
||||
"label": "Response Model",
|
||||
"advanced": true
|
||||
},
|
||||
"responseSystemPrompt": {
|
||||
"label": "Response System Prompt",
|
||||
"advanced": true
|
||||
},
|
||||
"responseTimeoutMs": {
|
||||
"label": "Response Timeout (ms)",
|
||||
"advanced": true
|
||||
}
|
||||
},
|
||||
"configSchema": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"enabled": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"provider": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"telnyx",
|
||||
"twilio",
|
||||
"plivo",
|
||||
"mock"
|
||||
]
|
||||
},
|
||||
"telnyx": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"apiKey": {
|
||||
"type": "string"
|
||||
},
|
||||
"connectionId": {
|
||||
"type": "string"
|
||||
},
|
||||
"publicKey": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"twilio": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"accountSid": {
|
||||
"type": "string"
|
||||
},
|
||||
"authToken": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"plivo": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"authId": {
|
||||
"type": "string"
|
||||
},
|
||||
"authToken": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"fromNumber": {
|
||||
"type": "string",
|
||||
"pattern": "^\\+[1-9]\\d{1,14}$"
|
||||
},
|
||||
"toNumber": {
|
||||
"type": "string",
|
||||
"pattern": "^\\+[1-9]\\d{1,14}$"
|
||||
},
|
||||
"inboundPolicy": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"disabled",
|
||||
"allowlist",
|
||||
"pairing",
|
||||
"open"
|
||||
]
|
||||
},
|
||||
"allowFrom": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"pattern": "^\\+[1-9]\\d{1,14}$"
|
||||
}
|
||||
},
|
||||
"inboundGreeting": {
|
||||
"type": "string"
|
||||
},
|
||||
"outbound": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"defaultMode": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"notify",
|
||||
"conversation"
|
||||
]
|
||||
},
|
||||
"notifyHangupDelaySec": {
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
}
|
||||
}
|
||||
},
|
||||
"maxDurationSeconds": {
|
||||
"type": "integer",
|
||||
"minimum": 1
|
||||
},
|
||||
"silenceTimeoutMs": {
|
||||
"type": "integer",
|
||||
"minimum": 1
|
||||
},
|
||||
"transcriptTimeoutMs": {
|
||||
"type": "integer",
|
||||
"minimum": 1
|
||||
},
|
||||
"ringTimeoutMs": {
|
||||
"type": "integer",
|
||||
"minimum": 1
|
||||
},
|
||||
"maxConcurrentCalls": {
|
||||
"type": "integer",
|
||||
"minimum": 1
|
||||
},
|
||||
"serve": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"port": {
|
||||
"type": "integer",
|
||||
"minimum": 1
|
||||
},
|
||||
"bind": {
|
||||
"type": "string"
|
||||
},
|
||||
"path": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"tailscale": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"mode": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"off",
|
||||
"serve",
|
||||
"funnel"
|
||||
]
|
||||
},
|
||||
"path": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"tunnel": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"provider": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"none",
|
||||
"ngrok",
|
||||
"tailscale-serve",
|
||||
"tailscale-funnel"
|
||||
]
|
||||
},
|
||||
"ngrokAuthToken": {
|
||||
"type": "string"
|
||||
},
|
||||
"ngrokDomain": {
|
||||
"type": "string"
|
||||
},
|
||||
"allowNgrokFreeTierLoopbackBypass": {
|
||||
"type": "boolean"
|
||||
}
|
||||
}
|
||||
},
|
||||
"streaming": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"enabled": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"sttProvider": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"openai-realtime"
|
||||
]
|
||||
},
|
||||
"openaiApiKey": {
|
||||
"type": "string"
|
||||
},
|
||||
"sttModel": {
|
||||
"type": "string"
|
||||
},
|
||||
"silenceDurationMs": {
|
||||
"type": "integer",
|
||||
"minimum": 1
|
||||
},
|
||||
"vadThreshold": {
|
||||
"type": "number",
|
||||
"minimum": 0,
|
||||
"maximum": 1
|
||||
},
|
||||
"streamPath": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"publicUrl": {
|
||||
"type": "string"
|
||||
},
|
||||
"skipSignatureVerification": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"stt": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"provider": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"openai"
|
||||
]
|
||||
},
|
||||
"model": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"tts": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"auto": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"off",
|
||||
"always",
|
||||
"inbound",
|
||||
"tagged"
|
||||
]
|
||||
},
|
||||
"enabled": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"mode": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"final",
|
||||
"all"
|
||||
]
|
||||
},
|
||||
"provider": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"openai",
|
||||
"elevenlabs",
|
||||
"edge"
|
||||
]
|
||||
},
|
||||
"summaryModel": {
|
||||
"type": "string"
|
||||
},
|
||||
"modelOverrides": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"enabled": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"allowText": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"allowProvider": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"allowVoice": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"allowModelId": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"allowVoiceSettings": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"allowNormalization": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"allowSeed": {
|
||||
"type": "boolean"
|
||||
}
|
||||
}
|
||||
},
|
||||
"elevenlabs": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"apiKey": {
|
||||
"type": "string"
|
||||
},
|
||||
"baseUrl": {
|
||||
"type": "string"
|
||||
},
|
||||
"voiceId": {
|
||||
"type": "string"
|
||||
},
|
||||
"modelId": {
|
||||
"type": "string"
|
||||
},
|
||||
"seed": {
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
"maximum": 4294967295
|
||||
},
|
||||
"applyTextNormalization": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"auto",
|
||||
"on",
|
||||
"off"
|
||||
]
|
||||
},
|
||||
"languageCode": {
|
||||
"type": "string"
|
||||
},
|
||||
"voiceSettings": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"stability": {
|
||||
"type": "number",
|
||||
"minimum": 0,
|
||||
"maximum": 1
|
||||
},
|
||||
"similarityBoost": {
|
||||
"type": "number",
|
||||
"minimum": 0,
|
||||
"maximum": 1
|
||||
},
|
||||
"style": {
|
||||
"type": "number",
|
||||
"minimum": 0,
|
||||
"maximum": 1
|
||||
},
|
||||
"useSpeakerBoost": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"speed": {
|
||||
"type": "number",
|
||||
"minimum": 0.5,
|
||||
"maximum": 2
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"openai": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"apiKey": {
|
||||
"type": "string"
|
||||
},
|
||||
"model": {
|
||||
"type": "string"
|
||||
},
|
||||
"voice": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"edge": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"enabled": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"voice": {
|
||||
"type": "string"
|
||||
},
|
||||
"lang": {
|
||||
"type": "string"
|
||||
},
|
||||
"outputFormat": {
|
||||
"type": "string"
|
||||
},
|
||||
"pitch": {
|
||||
"type": "string"
|
||||
},
|
||||
"rate": {
|
||||
"type": "string"
|
||||
},
|
||||
"volume": {
|
||||
"type": "string"
|
||||
},
|
||||
"saveSubtitles": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"proxy": {
|
||||
"type": "string"
|
||||
},
|
||||
"timeoutMs": {
|
||||
"type": "integer",
|
||||
"minimum": 1000,
|
||||
"maximum": 120000
|
||||
}
|
||||
}
|
||||
},
|
||||
"prefsPath": {
|
||||
"type": "string"
|
||||
},
|
||||
"maxTextLength": {
|
||||
"type": "integer",
|
||||
"minimum": 1
|
||||
},
|
||||
"timeoutMs": {
|
||||
"type": "integer",
|
||||
"minimum": 1000,
|
||||
"maximum": 120000
|
||||
}
|
||||
}
|
||||
},
|
||||
"store": {
|
||||
"type": "string"
|
||||
},
|
||||
"responseModel": {
|
||||
"type": "string"
|
||||
},
|
||||
"responseSystemPrompt": {
|
||||
"type": "string"
|
||||
},
|
||||
"responseTimeoutMs": {
|
||||
"type": "integer",
|
||||
"minimum": 1
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
497
docker-compose/ez-assistant/extensions/voice-call/index.ts
Normal file
497
docker-compose/ez-assistant/extensions/voice-call/index.ts
Normal file
@@ -0,0 +1,497 @@
|
||||
import { Type } from "@sinclair/typebox";
|
||||
import type { CoreConfig } from "./src/core-bridge.js";
|
||||
import {
|
||||
VoiceCallConfigSchema,
|
||||
resolveVoiceCallConfig,
|
||||
validateProviderConfig,
|
||||
type VoiceCallConfig,
|
||||
} from "./src/config.js";
|
||||
import { registerVoiceCallCli } from "./src/cli.js";
|
||||
import { createVoiceCallRuntime, type VoiceCallRuntime } from "./src/runtime.js";
|
||||
|
||||
const voiceCallConfigSchema = {
|
||||
parse(value: unknown): VoiceCallConfig {
|
||||
const raw =
|
||||
value && typeof value === "object" && !Array.isArray(value)
|
||||
? (value as Record<string, unknown>)
|
||||
: {};
|
||||
|
||||
const twilio = raw.twilio as Record<string, unknown> | undefined;
|
||||
const legacyFrom = typeof twilio?.from === "string" ? twilio.from : undefined;
|
||||
|
||||
const enabled = typeof raw.enabled === "boolean" ? raw.enabled : true;
|
||||
const providerRaw = raw.provider === "log" ? "mock" : raw.provider;
|
||||
const provider = providerRaw ?? (enabled ? "mock" : undefined);
|
||||
|
||||
return VoiceCallConfigSchema.parse({
|
||||
...raw,
|
||||
enabled,
|
||||
provider,
|
||||
fromNumber: raw.fromNumber ?? legacyFrom,
|
||||
});
|
||||
},
|
||||
uiHints: {
|
||||
provider: {
|
||||
label: "Provider",
|
||||
help: "Use twilio, telnyx, or mock for dev/no-network.",
|
||||
},
|
||||
fromNumber: { label: "From Number", placeholder: "+15550001234" },
|
||||
toNumber: { label: "Default To Number", placeholder: "+15550001234" },
|
||||
inboundPolicy: { label: "Inbound Policy" },
|
||||
allowFrom: { label: "Inbound Allowlist" },
|
||||
inboundGreeting: { label: "Inbound Greeting", advanced: true },
|
||||
"telnyx.apiKey": { label: "Telnyx API Key", sensitive: true },
|
||||
"telnyx.connectionId": { label: "Telnyx Connection ID" },
|
||||
"telnyx.publicKey": { label: "Telnyx Public Key", sensitive: true },
|
||||
"twilio.accountSid": { label: "Twilio Account SID" },
|
||||
"twilio.authToken": { label: "Twilio Auth Token", sensitive: true },
|
||||
"outbound.defaultMode": { label: "Default Call Mode" },
|
||||
"outbound.notifyHangupDelaySec": {
|
||||
label: "Notify Hangup Delay (sec)",
|
||||
advanced: true,
|
||||
},
|
||||
"serve.port": { label: "Webhook Port" },
|
||||
"serve.bind": { label: "Webhook Bind" },
|
||||
"serve.path": { label: "Webhook Path" },
|
||||
"tailscale.mode": { label: "Tailscale Mode", advanced: true },
|
||||
"tailscale.path": { label: "Tailscale Path", advanced: true },
|
||||
"tunnel.provider": { label: "Tunnel Provider", advanced: true },
|
||||
"tunnel.ngrokAuthToken": {
|
||||
label: "ngrok Auth Token",
|
||||
sensitive: true,
|
||||
advanced: true,
|
||||
},
|
||||
"tunnel.ngrokDomain": { label: "ngrok Domain", advanced: true },
|
||||
"tunnel.allowNgrokFreeTierLoopbackBypass": {
|
||||
label: "Allow ngrok Free Tier (Loopback Bypass)",
|
||||
advanced: true,
|
||||
},
|
||||
"streaming.enabled": { label: "Enable Streaming", advanced: true },
|
||||
"streaming.openaiApiKey": {
|
||||
label: "OpenAI Realtime API Key",
|
||||
sensitive: true,
|
||||
advanced: true,
|
||||
},
|
||||
"streaming.sttModel": { label: "Realtime STT Model", advanced: true },
|
||||
"streaming.streamPath": { label: "Media Stream Path", advanced: true },
|
||||
"tts.provider": {
|
||||
label: "TTS Provider Override",
|
||||
help: "Deep-merges with messages.tts (Edge is ignored for calls).",
|
||||
advanced: true,
|
||||
},
|
||||
"tts.openai.model": { label: "OpenAI TTS Model", advanced: true },
|
||||
"tts.openai.voice": { label: "OpenAI TTS Voice", advanced: true },
|
||||
"tts.openai.apiKey": {
|
||||
label: "OpenAI API Key",
|
||||
sensitive: true,
|
||||
advanced: true,
|
||||
},
|
||||
"tts.elevenlabs.modelId": { label: "ElevenLabs Model ID", advanced: true },
|
||||
"tts.elevenlabs.voiceId": { label: "ElevenLabs Voice ID", advanced: true },
|
||||
"tts.elevenlabs.apiKey": {
|
||||
label: "ElevenLabs API Key",
|
||||
sensitive: true,
|
||||
advanced: true,
|
||||
},
|
||||
"tts.elevenlabs.baseUrl": { label: "ElevenLabs Base URL", advanced: true },
|
||||
publicUrl: { label: "Public Webhook URL", advanced: true },
|
||||
skipSignatureVerification: {
|
||||
label: "Skip Signature Verification",
|
||||
advanced: true,
|
||||
},
|
||||
store: { label: "Call Log Store Path", advanced: true },
|
||||
responseModel: { label: "Response Model", advanced: true },
|
||||
responseSystemPrompt: { label: "Response System Prompt", advanced: true },
|
||||
responseTimeoutMs: { label: "Response Timeout (ms)", advanced: true },
|
||||
},
|
||||
};
|
||||
|
||||
const VoiceCallToolSchema = Type.Union([
|
||||
Type.Object({
|
||||
action: Type.Literal("initiate_call"),
|
||||
to: Type.Optional(Type.String({ description: "Call target" })),
|
||||
message: Type.String({ description: "Intro message" }),
|
||||
mode: Type.Optional(Type.Union([Type.Literal("notify"), Type.Literal("conversation")])),
|
||||
}),
|
||||
Type.Object({
|
||||
action: Type.Literal("continue_call"),
|
||||
callId: Type.String({ description: "Call ID" }),
|
||||
message: Type.String({ description: "Follow-up message" }),
|
||||
}),
|
||||
Type.Object({
|
||||
action: Type.Literal("speak_to_user"),
|
||||
callId: Type.String({ description: "Call ID" }),
|
||||
message: Type.String({ description: "Message to speak" }),
|
||||
}),
|
||||
Type.Object({
|
||||
action: Type.Literal("end_call"),
|
||||
callId: Type.String({ description: "Call ID" }),
|
||||
}),
|
||||
Type.Object({
|
||||
action: Type.Literal("get_status"),
|
||||
callId: Type.String({ description: "Call ID" }),
|
||||
}),
|
||||
Type.Object({
|
||||
mode: Type.Optional(Type.Union([Type.Literal("call"), Type.Literal("status")])),
|
||||
to: Type.Optional(Type.String({ description: "Call target" })),
|
||||
sid: Type.Optional(Type.String({ description: "Call SID" })),
|
||||
message: Type.Optional(Type.String({ description: "Optional intro message" })),
|
||||
}),
|
||||
]);
|
||||
|
||||
const voiceCallPlugin = {
|
||||
id: "voice-call",
|
||||
name: "Voice Call",
|
||||
description: "Voice-call plugin with Telnyx/Twilio/Plivo providers",
|
||||
configSchema: voiceCallConfigSchema,
|
||||
register(api) {
|
||||
const config = resolveVoiceCallConfig(
|
||||
voiceCallConfigSchema.parse(api.pluginConfig),
|
||||
);
|
||||
const validation = validateProviderConfig(config);
|
||||
|
||||
if (api.pluginConfig && typeof api.pluginConfig === "object") {
|
||||
const raw = api.pluginConfig as Record<string, unknown>;
|
||||
const twilio = raw.twilio as Record<string, unknown> | undefined;
|
||||
if (raw.provider === "log") {
|
||||
api.logger.warn(
|
||||
"[voice-call] provider \"log\" is deprecated; use \"mock\" instead",
|
||||
);
|
||||
}
|
||||
if (typeof twilio?.from === "string") {
|
||||
api.logger.warn(
|
||||
"[voice-call] twilio.from is deprecated; use fromNumber instead",
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let runtimePromise: Promise<VoiceCallRuntime> | null = null;
|
||||
let runtime: VoiceCallRuntime | null = null;
|
||||
|
||||
const ensureRuntime = async () => {
|
||||
if (!config.enabled) {
|
||||
throw new Error("Voice call disabled in plugin config");
|
||||
}
|
||||
if (!validation.valid) {
|
||||
throw new Error(validation.errors.join("; "));
|
||||
}
|
||||
if (runtime) return runtime;
|
||||
if (!runtimePromise) {
|
||||
runtimePromise = createVoiceCallRuntime({
|
||||
config,
|
||||
coreConfig: api.config as CoreConfig,
|
||||
ttsRuntime: api.runtime.tts,
|
||||
logger: api.logger,
|
||||
});
|
||||
}
|
||||
runtime = await runtimePromise;
|
||||
return runtime;
|
||||
};
|
||||
|
||||
const sendError = (respond: (ok: boolean, payload?: unknown) => void, err: unknown) => {
|
||||
respond(false, { error: err instanceof Error ? err.message : String(err) });
|
||||
};
|
||||
|
||||
api.registerGatewayMethod("voicecall.initiate", async ({ params, respond }) => {
|
||||
try {
|
||||
const message =
|
||||
typeof params?.message === "string" ? params.message.trim() : "";
|
||||
if (!message) {
|
||||
respond(false, { error: "message required" });
|
||||
return;
|
||||
}
|
||||
const rt = await ensureRuntime();
|
||||
const to =
|
||||
typeof params?.to === "string" && params.to.trim()
|
||||
? params.to.trim()
|
||||
: rt.config.toNumber;
|
||||
if (!to) {
|
||||
respond(false, { error: "to required" });
|
||||
return;
|
||||
}
|
||||
const mode =
|
||||
params?.mode === "notify" || params?.mode === "conversation"
|
||||
? params.mode
|
||||
: undefined;
|
||||
const result = await rt.manager.initiateCall(to, undefined, {
|
||||
message,
|
||||
mode,
|
||||
});
|
||||
if (!result.success) {
|
||||
respond(false, { error: result.error || "initiate failed" });
|
||||
return;
|
||||
}
|
||||
respond(true, { callId: result.callId, initiated: true });
|
||||
} catch (err) {
|
||||
sendError(respond, err);
|
||||
}
|
||||
});
|
||||
|
||||
api.registerGatewayMethod("voicecall.continue", async ({ params, respond }) => {
|
||||
try {
|
||||
const callId =
|
||||
typeof params?.callId === "string" ? params.callId.trim() : "";
|
||||
const message =
|
||||
typeof params?.message === "string" ? params.message.trim() : "";
|
||||
if (!callId || !message) {
|
||||
respond(false, { error: "callId and message required" });
|
||||
return;
|
||||
}
|
||||
const rt = await ensureRuntime();
|
||||
const result = await rt.manager.continueCall(callId, message);
|
||||
if (!result.success) {
|
||||
respond(false, { error: result.error || "continue failed" });
|
||||
return;
|
||||
}
|
||||
respond(true, { success: true, transcript: result.transcript });
|
||||
} catch (err) {
|
||||
sendError(respond, err);
|
||||
}
|
||||
});
|
||||
|
||||
api.registerGatewayMethod("voicecall.speak", async ({ params, respond }) => {
|
||||
try {
|
||||
const callId =
|
||||
typeof params?.callId === "string" ? params.callId.trim() : "";
|
||||
const message =
|
||||
typeof params?.message === "string" ? params.message.trim() : "";
|
||||
if (!callId || !message) {
|
||||
respond(false, { error: "callId and message required" });
|
||||
return;
|
||||
}
|
||||
const rt = await ensureRuntime();
|
||||
const result = await rt.manager.speak(callId, message);
|
||||
if (!result.success) {
|
||||
respond(false, { error: result.error || "speak failed" });
|
||||
return;
|
||||
}
|
||||
respond(true, { success: true });
|
||||
} catch (err) {
|
||||
sendError(respond, err);
|
||||
}
|
||||
});
|
||||
|
||||
api.registerGatewayMethod("voicecall.end", async ({ params, respond }) => {
|
||||
try {
|
||||
const callId =
|
||||
typeof params?.callId === "string" ? params.callId.trim() : "";
|
||||
if (!callId) {
|
||||
respond(false, { error: "callId required" });
|
||||
return;
|
||||
}
|
||||
const rt = await ensureRuntime();
|
||||
const result = await rt.manager.endCall(callId);
|
||||
if (!result.success) {
|
||||
respond(false, { error: result.error || "end failed" });
|
||||
return;
|
||||
}
|
||||
respond(true, { success: true });
|
||||
} catch (err) {
|
||||
sendError(respond, err);
|
||||
}
|
||||
});
|
||||
|
||||
api.registerGatewayMethod("voicecall.status", async ({ params, respond }) => {
|
||||
try {
|
||||
const raw =
|
||||
typeof params?.callId === "string"
|
||||
? params.callId.trim()
|
||||
: typeof params?.sid === "string"
|
||||
? params.sid.trim()
|
||||
: "";
|
||||
if (!raw) {
|
||||
respond(false, { error: "callId required" });
|
||||
return;
|
||||
}
|
||||
const rt = await ensureRuntime();
|
||||
const call =
|
||||
rt.manager.getCall(raw) || rt.manager.getCallByProviderCallId(raw);
|
||||
if (!call) {
|
||||
respond(true, { found: false });
|
||||
return;
|
||||
}
|
||||
respond(true, { found: true, call });
|
||||
} catch (err) {
|
||||
sendError(respond, err);
|
||||
}
|
||||
});
|
||||
|
||||
api.registerGatewayMethod("voicecall.start", async ({ params, respond }) => {
|
||||
try {
|
||||
const to = typeof params?.to === "string" ? params.to.trim() : "";
|
||||
const message =
|
||||
typeof params?.message === "string" ? params.message.trim() : "";
|
||||
if (!to) {
|
||||
respond(false, { error: "to required" });
|
||||
return;
|
||||
}
|
||||
const rt = await ensureRuntime();
|
||||
const result = await rt.manager.initiateCall(to, undefined, {
|
||||
message: message || undefined,
|
||||
});
|
||||
if (!result.success) {
|
||||
respond(false, { error: result.error || "initiate failed" });
|
||||
return;
|
||||
}
|
||||
respond(true, { callId: result.callId, initiated: true });
|
||||
} catch (err) {
|
||||
sendError(respond, err);
|
||||
}
|
||||
});
|
||||
|
||||
api.registerTool({
|
||||
name: "voice_call",
|
||||
label: "Voice Call",
|
||||
description:
|
||||
"Make phone calls and have voice conversations via the voice-call plugin.",
|
||||
parameters: VoiceCallToolSchema,
|
||||
async execute(_toolCallId, params) {
|
||||
const json = (payload: unknown) => ({
|
||||
content: [
|
||||
{ type: "text", text: JSON.stringify(payload, null, 2) },
|
||||
],
|
||||
details: payload,
|
||||
});
|
||||
|
||||
try {
|
||||
const rt = await ensureRuntime();
|
||||
|
||||
if (typeof params?.action === "string") {
|
||||
switch (params.action) {
|
||||
case "initiate_call": {
|
||||
const message = String(params.message || "").trim();
|
||||
if (!message) throw new Error("message required");
|
||||
const to =
|
||||
typeof params.to === "string" && params.to.trim()
|
||||
? params.to.trim()
|
||||
: rt.config.toNumber;
|
||||
if (!to) throw new Error("to required");
|
||||
const result = await rt.manager.initiateCall(to, undefined, {
|
||||
message,
|
||||
mode:
|
||||
params.mode === "notify" || params.mode === "conversation"
|
||||
? params.mode
|
||||
: undefined,
|
||||
});
|
||||
if (!result.success) {
|
||||
throw new Error(result.error || "initiate failed");
|
||||
}
|
||||
return json({ callId: result.callId, initiated: true });
|
||||
}
|
||||
case "continue_call": {
|
||||
const callId = String(params.callId || "").trim();
|
||||
const message = String(params.message || "").trim();
|
||||
if (!callId || !message) {
|
||||
throw new Error("callId and message required");
|
||||
}
|
||||
const result = await rt.manager.continueCall(callId, message);
|
||||
if (!result.success) {
|
||||
throw new Error(result.error || "continue failed");
|
||||
}
|
||||
return json({ success: true, transcript: result.transcript });
|
||||
}
|
||||
case "speak_to_user": {
|
||||
const callId = String(params.callId || "").trim();
|
||||
const message = String(params.message || "").trim();
|
||||
if (!callId || !message) {
|
||||
throw new Error("callId and message required");
|
||||
}
|
||||
const result = await rt.manager.speak(callId, message);
|
||||
if (!result.success) {
|
||||
throw new Error(result.error || "speak failed");
|
||||
}
|
||||
return json({ success: true });
|
||||
}
|
||||
case "end_call": {
|
||||
const callId = String(params.callId || "").trim();
|
||||
if (!callId) throw new Error("callId required");
|
||||
const result = await rt.manager.endCall(callId);
|
||||
if (!result.success) {
|
||||
throw new Error(result.error || "end failed");
|
||||
}
|
||||
return json({ success: true });
|
||||
}
|
||||
case "get_status": {
|
||||
const callId = String(params.callId || "").trim();
|
||||
if (!callId) throw new Error("callId required");
|
||||
const call =
|
||||
rt.manager.getCall(callId) ||
|
||||
rt.manager.getCallByProviderCallId(callId);
|
||||
return json(call ? { found: true, call } : { found: false });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const mode = params?.mode ?? "call";
|
||||
if (mode === "status") {
|
||||
const sid =
|
||||
typeof params.sid === "string" ? params.sid.trim() : "";
|
||||
if (!sid) throw new Error("sid required for status");
|
||||
const call =
|
||||
rt.manager.getCall(sid) || rt.manager.getCallByProviderCallId(sid);
|
||||
return json(call ? { found: true, call } : { found: false });
|
||||
}
|
||||
|
||||
const to =
|
||||
typeof params.to === "string" && params.to.trim()
|
||||
? params.to.trim()
|
||||
: rt.config.toNumber;
|
||||
if (!to) throw new Error("to required for call");
|
||||
const result = await rt.manager.initiateCall(to, undefined, {
|
||||
message:
|
||||
typeof params.message === "string" && params.message.trim()
|
||||
? params.message.trim()
|
||||
: undefined,
|
||||
});
|
||||
if (!result.success) {
|
||||
throw new Error(result.error || "initiate failed");
|
||||
}
|
||||
return json({ callId: result.callId, initiated: true });
|
||||
} catch (err) {
|
||||
return json({
|
||||
error: err instanceof Error ? err.message : String(err),
|
||||
});
|
||||
}
|
||||
},
|
||||
});
|
||||
|
||||
api.registerCli(
|
||||
({ program }) =>
|
||||
registerVoiceCallCli({
|
||||
program,
|
||||
config,
|
||||
ensureRuntime,
|
||||
logger: api.logger,
|
||||
}),
|
||||
{ commands: ["voicecall"] },
|
||||
);
|
||||
|
||||
api.registerService({
|
||||
id: "voicecall",
|
||||
start: async () => {
|
||||
if (!config.enabled) return;
|
||||
try {
|
||||
await ensureRuntime();
|
||||
} catch (err) {
|
||||
api.logger.error(
|
||||
`[voice-call] Failed to start runtime: ${
|
||||
err instanceof Error ? err.message : String(err)
|
||||
}`,
|
||||
);
|
||||
}
|
||||
},
|
||||
stop: async () => {
|
||||
if (!runtimePromise) return;
|
||||
try {
|
||||
const rt = await runtimePromise;
|
||||
await rt.stop();
|
||||
} finally {
|
||||
runtimePromise = null;
|
||||
runtime = null;
|
||||
}
|
||||
},
|
||||
});
|
||||
},
|
||||
};
|
||||
|
||||
export default voiceCallPlugin;
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"name": "@moltbot/voice-call",
|
||||
"version": "2026.1.26",
|
||||
"type": "module",
|
||||
"description": "Moltbot voice-call plugin",
|
||||
"dependencies": {
|
||||
"@sinclair/typebox": "0.34.47",
|
||||
"ws": "^8.19.0",
|
||||
"zod": "^4.3.6"
|
||||
},
|
||||
"moltbot": {
|
||||
"extensions": [
|
||||
"./index.ts"
|
||||
]
|
||||
}
|
||||
}
|
||||
300
docker-compose/ez-assistant/extensions/voice-call/src/cli.ts
Normal file
300
docker-compose/ez-assistant/extensions/voice-call/src/cli.ts
Normal file
@@ -0,0 +1,300 @@
|
||||
import fs from "node:fs";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
|
||||
import type { Command } from "commander";
|
||||
|
||||
import type { VoiceCallConfig } from "./config.js";
|
||||
import type { VoiceCallRuntime } from "./runtime.js";
|
||||
import { resolveUserPath } from "./utils.js";
|
||||
import {
|
||||
cleanupTailscaleExposureRoute,
|
||||
getTailscaleSelfInfo,
|
||||
setupTailscaleExposureRoute,
|
||||
} from "./webhook.js";
|
||||
|
||||
type Logger = {
|
||||
info: (message: string) => void;
|
||||
warn: (message: string) => void;
|
||||
error: (message: string) => void;
|
||||
};
|
||||
|
||||
function resolveMode(input: string): "off" | "serve" | "funnel" {
|
||||
const raw = input.trim().toLowerCase();
|
||||
if (raw === "serve" || raw === "off") return raw;
|
||||
return "funnel";
|
||||
}
|
||||
|
||||
function resolveDefaultStorePath(config: VoiceCallConfig): string {
|
||||
const base =
|
||||
config.store?.trim() || path.join(os.homedir(), "clawd", "voice-calls");
|
||||
return path.join(resolveUserPath(base), "calls.jsonl");
|
||||
}
|
||||
|
||||
function sleep(ms: number): Promise<void> {
|
||||
return new Promise((resolve) => setTimeout(resolve, ms));
|
||||
}
|
||||
|
||||
/**
 * Register the `voicecall` command group on the host CLI program.
 *
 * Subcommands cover the call lifecycle (`call`/`start`, `continue`, `speak`,
 * `end`, `status`), plus `tail` for following the JSONL call log and
 * `expose` for managing Tailscale serve/funnel exposure of the webhook.
 *
 * @param params.program       commander root program to attach commands to
 * @param params.config        resolved voice-call plugin config
 * @param params.ensureRuntime lazily starts and returns the plugin runtime
 * @param params.logger        plugin logger (used by `tail` for errors)
 */
export function registerVoiceCallCli(params: {
  program: Command;
  config: VoiceCallConfig;
  ensureRuntime: () => Promise<VoiceCallRuntime>;
  logger: Logger;
}) {
  const { program, config, ensureRuntime, logger } = params;
  const root = program
    .command("voicecall")
    .description("Voice call utilities")
    .addHelpText("after", () => `\nDocs: https://docs.molt.bot/cli/voicecall\n`);

  // voicecall call: initiate an outbound call; destination falls back to the
  // runtime's configured toNumber when --to is omitted.
  root
    .command("call")
    .description("Initiate an outbound voice call")
    .requiredOption(
      "-m, --message <text>",
      "Message to speak when call connects",
    )
    .option(
      "-t, --to <phone>",
      "Phone number to call (E.164 format, uses config toNumber if not set)",
    )
    .option(
      "--mode <mode>",
      "Call mode: notify (hangup after message) or conversation (stay open)",
      "conversation",
    )
    .action(
      async (options: { message: string; to?: string; mode?: string }) => {
        const rt = await ensureRuntime();
        const to = options.to ?? rt.config.toNumber;
        if (!to) {
          throw new Error("Missing --to and no toNumber configured");
        }
        const result = await rt.manager.initiateCall(to, undefined, {
          message: options.message,
          // Unknown --mode values are passed as undefined (runtime default).
          mode:
            options.mode === "notify" || options.mode === "conversation"
              ? options.mode
              : undefined,
        });
        if (!result.success) {
          throw new Error(result.error || "initiate failed");
        }
        // eslint-disable-next-line no-console
        console.log(JSON.stringify({ callId: result.callId }, null, 2));
      },
    );

  // voicecall start: alias of `call`, but --to is required and --message is
  // optional (mirrors the gateway voicecall.start shape).
  root
    .command("start")
    .description("Alias for voicecall call")
    .requiredOption("--to <phone>", "Phone number to call")
    .option("--message <text>", "Message to speak when call connects")
    .option(
      "--mode <mode>",
      "Call mode: notify (hangup after message) or conversation (stay open)",
      "conversation",
    )
    .action(
      async (options: { to: string; message?: string; mode?: string }) => {
        const rt = await ensureRuntime();
        const result = await rt.manager.initiateCall(options.to, undefined, {
          message: options.message,
          mode:
            options.mode === "notify" || options.mode === "conversation"
              ? options.mode
              : undefined,
        });
        if (!result.success) {
          throw new Error(result.error || "initiate failed");
        }
        // eslint-disable-next-line no-console
        console.log(JSON.stringify({ callId: result.callId }, null, 2));
      },
    );

  // voicecall continue: speak and wait for the callee's reply (transcript).
  root
    .command("continue")
    .description("Speak a message and wait for a response")
    .requiredOption("--call-id <id>", "Call ID")
    .requiredOption("--message <text>", "Message to speak")
    .action(async (options: { callId: string; message: string }) => {
      const rt = await ensureRuntime();
      const result = await rt.manager.continueCall(
        options.callId,
        options.message,
      );
      if (!result.success) {
        throw new Error(result.error || "continue failed");
      }
      // eslint-disable-next-line no-console
      console.log(JSON.stringify(result, null, 2));
    });

  // voicecall speak: one-way speech, no wait for a response.
  root
    .command("speak")
    .description("Speak a message without waiting for response")
    .requiredOption("--call-id <id>", "Call ID")
    .requiredOption("--message <text>", "Message to speak")
    .action(async (options: { callId: string; message: string }) => {
      const rt = await ensureRuntime();
      const result = await rt.manager.speak(options.callId, options.message);
      if (!result.success) {
        throw new Error(result.error || "speak failed");
      }
      // eslint-disable-next-line no-console
      console.log(JSON.stringify(result, null, 2));
    });

  // voicecall end: hang up an active call.
  root
    .command("end")
    .description("Hang up an active call")
    .requiredOption("--call-id <id>", "Call ID")
    .action(async (options: { callId: string }) => {
      const rt = await ensureRuntime();
      const result = await rt.manager.endCall(options.callId);
      if (!result.success) {
        throw new Error(result.error || "end failed");
      }
      // eslint-disable-next-line no-console
      console.log(JSON.stringify(result, null, 2));
    });

  // voicecall status: print a call record, or { found: false }.
  // NOTE(review): unlike the gateway voicecall.status method, this does not
  // fall back to getCallByProviderCallId — confirm whether provider call ids
  // should also be accepted here.
  root
    .command("status")
    .description("Show call status")
    .requiredOption("--call-id <id>", "Call ID")
    .action(async (options: { callId: string }) => {
      const rt = await ensureRuntime();
      const call = rt.manager.getCall(options.callId);
      // eslint-disable-next-line no-console
      console.log(JSON.stringify(call ?? { found: false }, null, 2));
    });

  // voicecall tail: print the last N lines of the JSONL log, then poll the
  // file forever, emitting newly appended lines (never returns).
  root
    .command("tail")
    .description(
      "Tail voice-call JSONL logs (prints new lines; useful during provider tests)",
    )
    .option("--file <path>", "Path to calls.jsonl", resolveDefaultStorePath(config))
    .option("--since <n>", "Print last N lines first", "25")
    .option("--poll <ms>", "Poll interval in ms", "250")
    .action(
      async (options: { file: string; since?: string; poll?: string }) => {
        const file = options.file;
        // Clamp inputs: since >= 0, poll interval >= 50ms.
        const since = Math.max(0, Number(options.since ?? 0));
        const pollMs = Math.max(50, Number(options.poll ?? 250));

        if (!fs.existsSync(file)) {
          logger.error(`No log file at ${file}`);
          process.exit(1);
        }

        // Backfill: print the trailing `since` lines of the existing file.
        const initial = fs.readFileSync(file, "utf8");
        const lines = initial.split("\n").filter(Boolean);
        for (const line of lines.slice(Math.max(0, lines.length - since))) {
          // eslint-disable-next-line no-console
          console.log(line);
        }

        // Track the byte offset we've printed up to (not line count).
        let offset = Buffer.byteLength(initial, "utf8");

        for (;;) {
          try {
            const stat = fs.statSync(file);
            // File shrank (rotated/truncated): restart from the beginning.
            if (stat.size < offset) {
              offset = 0;
            }
            if (stat.size > offset) {
              // Read only the newly appended bytes.
              const fd = fs.openSync(file, "r");
              try {
                const buf = Buffer.alloc(stat.size - offset);
                fs.readSync(fd, buf, 0, buf.length, offset);
                offset = stat.size;
                const text = buf.toString("utf8");
                for (const line of text.split("\n").filter(Boolean)) {
                  // eslint-disable-next-line no-console
                  console.log(line);
                }
              } finally {
                fs.closeSync(fd);
              }
            }
          } catch {
            // ignore and retry
          }
          await sleep(pollMs);
        }
      },
    );

  // voicecall expose: manage Tailscale serve/funnel exposure of the local
  // webhook. `--mode off` tears down both route kinds; otherwise sets up the
  // requested route and prints the public URL (with a hint if setup failed).
  root
    .command("expose")
    .description("Enable/disable Tailscale serve/funnel for the webhook")
    .option("--mode <mode>", "off | serve (tailnet) | funnel (public)", "funnel")
    .option(
      "--path <path>",
      "Tailscale path to expose (recommend matching serve.path)",
    )
    .option("--port <port>", "Local webhook port")
    .option("--serve-path <path>", "Local webhook path")
    .action(
      async (options: {
        mode?: string;
        port?: string;
        path?: string;
        servePath?: string;
      }) => {
        const mode = resolveMode(options.mode ?? "funnel");
        // CLI flags win over config; config carries schema defaults.
        const servePort = Number(options.port ?? config.serve.port ?? 3334);
        const servePath = String(
          options.servePath ?? config.serve.path ?? "/voice/webhook",
        );
        const tsPath = String(
          options.path ?? config.tailscale?.path ?? servePath,
        );

        const localUrl = `http://127.0.0.1:${servePort}`;

        if (mode === "off") {
          // Clean up both kinds since we don't know which was active.
          await cleanupTailscaleExposureRoute({ mode: "serve", path: tsPath });
          await cleanupTailscaleExposureRoute({ mode: "funnel", path: tsPath });
          // eslint-disable-next-line no-console
          console.log(JSON.stringify({ ok: true, mode: "off", path: tsPath }, null, 2));
          return;
        }

        const publicUrl = await setupTailscaleExposureRoute({
          mode,
          path: tsPath,
          localUrl,
        });

        // On failure, query node info to build an admin enable link.
        const tsInfo = publicUrl ? null : await getTailscaleSelfInfo();
        const enableUrl = tsInfo?.nodeId
          ? `https://login.tailscale.com/f/${mode}?node=${tsInfo.nodeId}`
          : null;

        // eslint-disable-next-line no-console
        console.log(
          JSON.stringify(
            {
              ok: Boolean(publicUrl),
              mode,
              path: tsPath,
              localUrl,
              publicUrl,
              hint: publicUrl
                ? undefined
                : {
                    note: "Tailscale serve/funnel may be disabled on this tailnet (or require admin enable).",
                    enableUrl,
                  },
            },
            null,
            2,
          ),
        );
      },
    );
}
|
||||
@@ -0,0 +1,204 @@
|
||||
import { afterEach, beforeEach, describe, expect, it } from "vitest";
|
||||
|
||||
import { validateProviderConfig, resolveVoiceCallConfig, type VoiceCallConfig } from "./config.js";
|
||||
|
||||
/**
 * Build a minimal, fully-populated VoiceCallConfig fixture for the given
 * provider. Provider credential sections (twilio/telnyx/plivo) are left
 * unset so individual tests can exercise config-based vs env-based
 * credentials.
 *
 * NOTE(review): the changelog says legacy top-level `tts.model`/`tts.voice`
 * plugin fields were removed, but this fixture still sets them — confirm
 * against the current VoiceCallConfig/TtsConfigSchema shape.
 */
function createBaseConfig(
  provider: "telnyx" | "twilio" | "plivo" | "mock",
): VoiceCallConfig {
  return {
    enabled: true,
    provider,
    // 555 prefix: reserved for fictional numbers.
    fromNumber: "+15550001234",
    inboundPolicy: "disabled",
    allowFrom: [],
    outbound: { defaultMode: "notify", notifyHangupDelaySec: 3 },
    maxDurationSeconds: 300,
    silenceTimeoutMs: 800,
    transcriptTimeoutMs: 180000,
    ringTimeoutMs: 30000,
    maxConcurrentCalls: 1,
    serve: { port: 3334, bind: "127.0.0.1", path: "/voice/webhook" },
    tailscale: { mode: "off", path: "/voice/webhook" },
    tunnel: { provider: "none", allowNgrokFreeTierLoopbackBypass: false },
    streaming: {
      enabled: false,
      sttProvider: "openai-realtime",
      sttModel: "gpt-4o-transcribe",
      silenceDurationMs: 800,
      vadThreshold: 0.5,
      streamPath: "/voice/stream",
    },
    skipSignatureVerification: false,
    stt: { provider: "openai", model: "whisper-1" },
    tts: { provider: "openai", model: "gpt-4o-mini-tts", voice: "coral" },
    responseModel: "openai/gpt-4o-mini",
    responseTimeoutMs: 30000,
  };
}
|
||||
|
||||
describe("validateProviderConfig", () => {
|
||||
const originalEnv = { ...process.env };
|
||||
|
||||
beforeEach(() => {
|
||||
// Clear all relevant env vars before each test
|
||||
delete process.env.TWILIO_ACCOUNT_SID;
|
||||
delete process.env.TWILIO_AUTH_TOKEN;
|
||||
delete process.env.TELNYX_API_KEY;
|
||||
delete process.env.TELNYX_CONNECTION_ID;
|
||||
delete process.env.PLIVO_AUTH_ID;
|
||||
delete process.env.PLIVO_AUTH_TOKEN;
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
// Restore original env
|
||||
process.env = { ...originalEnv };
|
||||
});
|
||||
|
||||
describe("twilio provider", () => {
|
||||
it("passes validation when credentials are in config", () => {
|
||||
const config = createBaseConfig("twilio");
|
||||
config.twilio = { accountSid: "AC123", authToken: "secret" };
|
||||
|
||||
const result = validateProviderConfig(config);
|
||||
|
||||
expect(result.valid).toBe(true);
|
||||
expect(result.errors).toEqual([]);
|
||||
});
|
||||
|
||||
it("passes validation when credentials are in environment variables", () => {
|
||||
process.env.TWILIO_ACCOUNT_SID = "AC123";
|
||||
process.env.TWILIO_AUTH_TOKEN = "secret";
|
||||
let config = createBaseConfig("twilio");
|
||||
config = resolveVoiceCallConfig(config);
|
||||
|
||||
const result = validateProviderConfig(config);
|
||||
|
||||
expect(result.valid).toBe(true);
|
||||
expect(result.errors).toEqual([]);
|
||||
});
|
||||
|
||||
it("passes validation with mixed config and env vars", () => {
|
||||
process.env.TWILIO_AUTH_TOKEN = "secret";
|
||||
let config = createBaseConfig("twilio");
|
||||
config.twilio = { accountSid: "AC123" };
|
||||
config = resolveVoiceCallConfig(config);
|
||||
|
||||
const result = validateProviderConfig(config);
|
||||
|
||||
expect(result.valid).toBe(true);
|
||||
expect(result.errors).toEqual([]);
|
||||
});
|
||||
|
||||
it("fails validation when accountSid is missing everywhere", () => {
|
||||
process.env.TWILIO_AUTH_TOKEN = "secret";
|
||||
let config = createBaseConfig("twilio");
|
||||
config = resolveVoiceCallConfig(config);
|
||||
|
||||
const result = validateProviderConfig(config);
|
||||
|
||||
expect(result.valid).toBe(false);
|
||||
expect(result.errors).toContain(
|
||||
"plugins.entries.voice-call.config.twilio.accountSid is required (or set TWILIO_ACCOUNT_SID env)",
|
||||
);
|
||||
});
|
||||
|
||||
it("fails validation when authToken is missing everywhere", () => {
|
||||
process.env.TWILIO_ACCOUNT_SID = "AC123";
|
||||
let config = createBaseConfig("twilio");
|
||||
config = resolveVoiceCallConfig(config);
|
||||
|
||||
const result = validateProviderConfig(config);
|
||||
|
||||
expect(result.valid).toBe(false);
|
||||
expect(result.errors).toContain(
|
||||
"plugins.entries.voice-call.config.twilio.authToken is required (or set TWILIO_AUTH_TOKEN env)",
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
describe("telnyx provider", () => {
|
||||
it("passes validation when credentials are in config", () => {
|
||||
const config = createBaseConfig("telnyx");
|
||||
config.telnyx = { apiKey: "KEY123", connectionId: "CONN456" };
|
||||
|
||||
const result = validateProviderConfig(config);
|
||||
|
||||
expect(result.valid).toBe(true);
|
||||
expect(result.errors).toEqual([]);
|
||||
});
|
||||
|
||||
it("passes validation when credentials are in environment variables", () => {
|
||||
process.env.TELNYX_API_KEY = "KEY123";
|
||||
process.env.TELNYX_CONNECTION_ID = "CONN456";
|
||||
let config = createBaseConfig("telnyx");
|
||||
config = resolveVoiceCallConfig(config);
|
||||
|
||||
const result = validateProviderConfig(config);
|
||||
|
||||
expect(result.valid).toBe(true);
|
||||
expect(result.errors).toEqual([]);
|
||||
});
|
||||
|
||||
it("fails validation when apiKey is missing everywhere", () => {
|
||||
process.env.TELNYX_CONNECTION_ID = "CONN456";
|
||||
let config = createBaseConfig("telnyx");
|
||||
config = resolveVoiceCallConfig(config);
|
||||
|
||||
const result = validateProviderConfig(config);
|
||||
|
||||
expect(result.valid).toBe(false);
|
||||
expect(result.errors).toContain(
|
||||
"plugins.entries.voice-call.config.telnyx.apiKey is required (or set TELNYX_API_KEY env)",
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
describe("plivo provider", () => {
|
||||
it("passes validation when credentials are in config", () => {
|
||||
const config = createBaseConfig("plivo");
|
||||
config.plivo = { authId: "MA123", authToken: "secret" };
|
||||
|
||||
const result = validateProviderConfig(config);
|
||||
|
||||
expect(result.valid).toBe(true);
|
||||
expect(result.errors).toEqual([]);
|
||||
});
|
||||
|
||||
it("passes validation when credentials are in environment variables", () => {
|
||||
process.env.PLIVO_AUTH_ID = "MA123";
|
||||
process.env.PLIVO_AUTH_TOKEN = "secret";
|
||||
let config = createBaseConfig("plivo");
|
||||
config = resolveVoiceCallConfig(config);
|
||||
|
||||
const result = validateProviderConfig(config);
|
||||
|
||||
expect(result.valid).toBe(true);
|
||||
expect(result.errors).toEqual([]);
|
||||
});
|
||||
|
||||
it("fails validation when authId is missing everywhere", () => {
|
||||
process.env.PLIVO_AUTH_TOKEN = "secret";
|
||||
let config = createBaseConfig("plivo");
|
||||
config = resolveVoiceCallConfig(config);
|
||||
|
||||
const result = validateProviderConfig(config);
|
||||
|
||||
expect(result.valid).toBe(false);
|
||||
expect(result.errors).toContain(
|
||||
"plugins.entries.voice-call.config.plivo.authId is required (or set PLIVO_AUTH_ID env)",
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
describe("disabled config", () => {
|
||||
it("skips validation when enabled is false", () => {
|
||||
const config = createBaseConfig("twilio");
|
||||
config.enabled = false;
|
||||
|
||||
const result = validateProviderConfig(config);
|
||||
|
||||
expect(result.valid).toBe(true);
|
||||
expect(result.errors).toEqual([]);
|
||||
});
|
||||
});
|
||||
});
|
||||
502
docker-compose/ez-assistant/extensions/voice-call/src/config.ts
Normal file
502
docker-compose/ez-assistant/extensions/voice-call/src/config.ts
Normal file
@@ -0,0 +1,502 @@
|
||||
import { z } from "zod";
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// Phone Number Validation
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
/**
 * E.164 phone number format: +[country code][number]
 * Leading '+' then 2–15 digits total, first digit non-zero.
 * Examples use 555 prefix (reserved for fictional numbers)
 */
export const E164Schema = z
  .string()
  .regex(/^\+[1-9]\d{1,14}$/, "Expected E.164 format, e.g. +15550001234");

// -----------------------------------------------------------------------------
// Inbound Policy
// -----------------------------------------------------------------------------

/**
 * Controls how inbound calls are handled:
 * - "disabled": Block all inbound calls (outbound only)
 * - "allowlist": Only accept calls from numbers in allowFrom
 * - "pairing": Unknown callers can request pairing (future)
 * - "open": Accept all inbound calls (dangerous!)
 */
export const InboundPolicySchema = z.enum([
  "disabled",
  "allowlist",
  "pairing",
  "open",
]);
export type InboundPolicy = z.infer<typeof InboundPolicySchema>;
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// Provider-Specific Configuration
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
// Per-provider credential sections. All fields are schema-optional: presence
// is enforced later by validateProviderConfig, which also accepts env-var
// fallbacks (e.g. TWILIO_ACCOUNT_SID) — see config.test.ts. `.strict()`
// rejects unknown keys so credential typos fail loudly.
export const TelnyxConfigSchema = z
  .object({
    /** Telnyx API v2 key */
    apiKey: z.string().min(1).optional(),
    /** Telnyx connection ID (from Call Control app) */
    connectionId: z.string().min(1).optional(),
    /** Public key for webhook signature verification */
    publicKey: z.string().min(1).optional(),
  })
  .strict();
export type TelnyxConfig = z.infer<typeof TelnyxConfigSchema>;

export const TwilioConfigSchema = z
  .object({
    /** Twilio Account SID */
    accountSid: z.string().min(1).optional(),
    /** Twilio Auth Token */
    authToken: z.string().min(1).optional(),
  })
  .strict();
export type TwilioConfig = z.infer<typeof TwilioConfigSchema>;

export const PlivoConfigSchema = z
  .object({
    /** Plivo Auth ID (starts with MA/SA) */
    authId: z.string().min(1).optional(),
    /** Plivo Auth Token */
    authToken: z.string().min(1).optional(),
  })
  .strict();
export type PlivoConfig = z.infer<typeof PlivoConfigSchema>;
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// STT/TTS Configuration
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
// Speech-to-text settings. The section-level `.default(...)` means an absent
// `stt` key resolves to the OpenAI/whisper-1 defaults rather than undefined.
export const SttConfigSchema = z
  .object({
    /** STT provider (currently only OpenAI supported) */
    provider: z.literal("openai").default("openai"),
    /** Whisper model to use */
    model: z.string().min(1).default("whisper-1"),
  })
  .strict()
  .default({ provider: "openai", model: "whisper-1" });
export type SttConfig = z.infer<typeof SttConfigSchema>;

/** TTS engines; per the changelog, "edge" is ignored for telephony calls. */
export const TtsProviderSchema = z.enum(["openai", "elevenlabs", "edge"]);
/** Which responses get spoken: only the final one, or all of them. */
export const TtsModeSchema = z.enum(["final", "all"]);
/** When TTS triggers automatically — TODO confirm exact trigger semantics. */
export const TtsAutoSchema = z.enum(["off", "always", "inbound", "tagged"]);
|
||||
|
||||
// Text-to-speech settings. Per the changelog, this plugin section is
// deep-merged with the core `messages.tts` config, so every field here is
// optional and the whole section may be omitted.
export const TtsConfigSchema = z
  .object({
    auto: TtsAutoSchema.optional(),
    enabled: z.boolean().optional(),
    mode: TtsModeSchema.optional(),
    provider: TtsProviderSchema.optional(),
    // Model used to summarize text before speaking — TODO confirm.
    summaryModel: z.string().optional(),
    // Fine-grained switches for which TTS settings callers may override.
    modelOverrides: z
      .object({
        enabled: z.boolean().optional(),
        allowText: z.boolean().optional(),
        allowProvider: z.boolean().optional(),
        allowVoice: z.boolean().optional(),
        allowModelId: z.boolean().optional(),
        allowVoiceSettings: z.boolean().optional(),
        allowNormalization: z.boolean().optional(),
        allowSeed: z.boolean().optional(),
      })
      .strict()
      .optional(),
    // ElevenLabs provider settings.
    elevenlabs: z
      .object({
        apiKey: z.string().optional(),
        baseUrl: z.string().optional(),
        voiceId: z.string().optional(),
        modelId: z.string().optional(),
        // Seed range matches unsigned 32-bit integers.
        seed: z.number().int().min(0).max(4294967295).optional(),
        applyTextNormalization: z.enum(["auto", "on", "off"]).optional(),
        languageCode: z.string().optional(),
        voiceSettings: z
          .object({
            stability: z.number().min(0).max(1).optional(),
            similarityBoost: z.number().min(0).max(1).optional(),
            style: z.number().min(0).max(1).optional(),
            useSpeakerBoost: z.boolean().optional(),
            speed: z.number().min(0.5).max(2).optional(),
          })
          .strict()
          .optional(),
      })
      .strict()
      .optional(),
    // OpenAI provider settings.
    openai: z
      .object({
        apiKey: z.string().optional(),
        model: z.string().optional(),
        voice: z.string().optional(),
      })
      .strict()
      .optional(),
    // Edge TTS settings (ignored for telephony calls per the changelog).
    edge: z
      .object({
        enabled: z.boolean().optional(),
        voice: z.string().optional(),
        lang: z.string().optional(),
        outputFormat: z.string().optional(),
        pitch: z.string().optional(),
        rate: z.string().optional(),
        volume: z.string().optional(),
        saveSubtitles: z.boolean().optional(),
        proxy: z.string().optional(),
        timeoutMs: z.number().int().min(1000).max(120000).optional(),
      })
      .strict()
      .optional(),
    prefsPath: z.string().optional(),
    maxTextLength: z.number().int().min(1).optional(),
    // Overall TTS request timeout: 1s–120s.
    timeoutMs: z.number().int().min(1000).max(120000).optional(),
  })
  .strict()
  .optional();
export type VoiceCallTtsConfig = z.infer<typeof TtsConfigSchema>;
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// Webhook Server Configuration
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
// Local webhook HTTP server settings. The section-level `.default(...)`
// makes the whole block optional, resolving to loopback:3334.
export const VoiceCallServeConfigSchema = z
  .object({
    /** Port to listen on */
    port: z.number().int().positive().default(3334),
    /** Bind address */
    bind: z.string().default("127.0.0.1"),
    /** Webhook path */
    path: z.string().min(1).default("/voice/webhook"),
  })
  .strict()
  .default({ port: 3334, bind: "127.0.0.1", path: "/voice/webhook" });
export type VoiceCallServeConfig = z.infer<typeof VoiceCallServeConfigSchema>;

// Legacy Tailscale exposure settings (the newer `tunnel` section below also
// covers tailscale-serve/tailscale-funnel). Defaults to no exposure.
export const VoiceCallTailscaleConfigSchema = z
  .object({
    /**
     * Tailscale exposure mode:
     * - "off": No Tailscale exposure
     * - "serve": Tailscale serve (private to tailnet)
     * - "funnel": Tailscale funnel (public HTTPS)
     */
    mode: z.enum(["off", "serve", "funnel"]).default("off"),
    /** Path for Tailscale serve/funnel (should usually match serve.path) */
    path: z.string().min(1).default("/voice/webhook"),
  })
  .strict()
  .default({ mode: "off", path: "/voice/webhook" });
export type VoiceCallTailscaleConfig = z.infer<
  typeof VoiceCallTailscaleConfigSchema
>;
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// Tunnel Configuration (unified ngrok/tailscale)
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
/** Unified tunnel exposure settings covering ngrok and Tailscale. */
export const VoiceCallTunnelConfigSchema = z
  .object({
    /**
     * Tunnel provider:
     * - "none": No tunnel (use publicUrl if set, or manual setup)
     * - "ngrok": Use ngrok for public HTTPS tunnel
     * - "tailscale-serve": Tailscale serve (private to tailnet)
     * - "tailscale-funnel": Tailscale funnel (public HTTPS)
     */
    provider: z
      .enum(["none", "ngrok", "tailscale-serve", "tailscale-funnel"])
      .default("none"),
    /** ngrok auth token (optional, enables longer sessions and more features) */
    ngrokAuthToken: z.string().min(1).optional(),
    /** ngrok custom domain (paid feature, e.g., "myapp.ngrok.io") */
    ngrokDomain: z.string().min(1).optional(),
    /**
     * Allow ngrok free tier compatibility mode.
     * When true, signature verification failures on ngrok-free.app URLs
     * will be allowed only for loopback requests (ngrok local agent).
     */
    allowNgrokFreeTierLoopbackBypass: z.boolean().default(false),
    /**
     * Legacy ngrok free tier compatibility mode (deprecated).
     * Use allowNgrokFreeTierLoopbackBypass instead.
     * resolveVoiceCallConfig folds this into the new flag at load time.
     */
    allowNgrokFreeTier: z.boolean().optional(),
  })
  .strict()
  .default({ provider: "none", allowNgrokFreeTierLoopbackBypass: false });
export type VoiceCallTunnelConfig = z.infer<typeof VoiceCallTunnelConfigSchema>;
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// Outbound Call Configuration
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
/**
 * Call mode determines how outbound calls behave:
 * - "notify": Deliver message and auto-hangup after delay (one-way notification)
 * - "conversation": Stay open for back-and-forth until explicit end or timeout
 */
export const CallModeSchema = z.enum(["notify", "conversation"]);
export type CallMode = z.infer<typeof CallModeSchema>;

/** Defaults for outbound calls (mode and notify-mode hangup delay). */
export const OutboundConfigSchema = z
  .object({
    /** Default call mode for outbound calls */
    defaultMode: CallModeSchema.default("notify"),
    /** Seconds to wait after TTS before auto-hangup in notify mode */
    notifyHangupDelaySec: z.number().int().nonnegative().default(3),
  })
  .strict()
  .default({ defaultMode: "notify", notifyHangupDelaySec: 3 });
export type OutboundConfig = z.infer<typeof OutboundConfigSchema>;
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// Streaming Configuration (OpenAI Realtime STT)
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
/** Real-time audio streaming settings (OpenAI Realtime STT over WebSocket). */
export const VoiceCallStreamingConfigSchema = z
  .object({
    /** Enable real-time audio streaming (requires WebSocket support) */
    enabled: z.boolean().default(false),
    /** STT provider for real-time transcription */
    sttProvider: z.enum(["openai-realtime"]).default("openai-realtime"),
    /** OpenAI API key for Realtime API (uses OPENAI_API_KEY env if not set) */
    openaiApiKey: z.string().min(1).optional(),
    /** OpenAI transcription model (default: gpt-4o-transcribe) */
    sttModel: z.string().min(1).default("gpt-4o-transcribe"),
    /** VAD silence duration in ms before considering speech ended */
    silenceDurationMs: z.number().int().positive().default(800),
    /** VAD threshold 0-1 (higher = less sensitive) */
    vadThreshold: z.number().min(0).max(1).default(0.5),
    /** WebSocket path for media stream connections */
    streamPath: z.string().min(1).default("/voice/stream"),
  })
  .strict()
  // Keep this default object in sync with the field defaults above.
  .default({
    enabled: false,
    sttProvider: "openai-realtime",
    sttModel: "gpt-4o-transcribe",
    silenceDurationMs: 800,
    vadThreshold: 0.5,
    streamPath: "/voice/stream",
  });
export type VoiceCallStreamingConfig = z.infer<
  typeof VoiceCallStreamingConfigSchema
>;
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// Main Voice Call Configuration
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
/**
 * Root configuration for the voice-call plugin. Strict: unknown keys are
 * rejected at parse time. Provider credentials may be left unset here and
 * filled from environment variables by resolveVoiceCallConfig.
 */
export const VoiceCallConfigSchema = z
  .object({
    /** Enable voice call functionality */
    enabled: z.boolean().default(false),

    /** Active provider (telnyx, twilio, plivo, or mock) */
    provider: z.enum(["telnyx", "twilio", "plivo", "mock"]).optional(),

    /** Telnyx-specific configuration */
    telnyx: TelnyxConfigSchema.optional(),

    /** Twilio-specific configuration */
    twilio: TwilioConfigSchema.optional(),

    /** Plivo-specific configuration */
    plivo: PlivoConfigSchema.optional(),

    /** Phone number to call from (E.164) */
    fromNumber: E164Schema.optional(),

    /** Default phone number to call (E.164) */
    toNumber: E164Schema.optional(),

    /** Inbound call policy */
    inboundPolicy: InboundPolicySchema.default("disabled"),

    /** Allowlist of phone numbers for inbound calls (E.164) */
    allowFrom: z.array(E164Schema).default([]),

    /** Greeting message for inbound calls */
    inboundGreeting: z.string().optional(),

    /** Outbound call configuration */
    outbound: OutboundConfigSchema,

    /** Maximum call duration in seconds */
    maxDurationSeconds: z.number().int().positive().default(300),

    /** Silence timeout for end-of-speech detection (ms) */
    silenceTimeoutMs: z.number().int().positive().default(800),

    /** Timeout for user transcript (ms) */
    transcriptTimeoutMs: z.number().int().positive().default(180000),

    /** Ring timeout for outbound calls (ms) */
    ringTimeoutMs: z.number().int().positive().default(30000),

    /** Maximum concurrent calls */
    maxConcurrentCalls: z.number().int().positive().default(1),

    /** Webhook server configuration */
    serve: VoiceCallServeConfigSchema,

    /** Tailscale exposure configuration (legacy, prefer tunnel config) */
    tailscale: VoiceCallTailscaleConfigSchema,

    /** Tunnel configuration (unified ngrok/tailscale) */
    tunnel: VoiceCallTunnelConfigSchema,

    /** Real-time audio streaming configuration */
    streaming: VoiceCallStreamingConfigSchema,

    /** Public webhook URL override (if set, bypasses tunnel auto-detection) */
    publicUrl: z.string().url().optional(),

    /** Skip webhook signature verification (development only, NOT for production) */
    skipSignatureVerification: z.boolean().default(false),

    /** STT configuration */
    stt: SttConfigSchema,

    /** TTS override (deep-merges with core messages.tts) */
    tts: TtsConfigSchema,

    /** Store path for call logs */
    store: z.string().optional(),

    /** Model for generating voice responses (e.g., "anthropic/claude-sonnet-4", "openai/gpt-4o") */
    responseModel: z.string().default("openai/gpt-4o-mini"),

    /** System prompt for voice responses */
    responseSystemPrompt: z.string().optional(),

    /** Timeout for response generation in ms (default 30s) */
    responseTimeoutMs: z.number().int().positive().default(30000),
  })
  .strict();

export type VoiceCallConfig = z.infer<typeof VoiceCallConfigSchema>;
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// Configuration Helpers
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Resolves the configuration by merging environment variables into missing fields.
|
||||
* Returns a new configuration object with environment variables applied.
|
||||
*/
|
||||
export function resolveVoiceCallConfig(config: VoiceCallConfig): VoiceCallConfig {
|
||||
const resolved = JSON.parse(JSON.stringify(config)) as VoiceCallConfig;
|
||||
|
||||
// Telnyx
|
||||
if (resolved.provider === "telnyx") {
|
||||
resolved.telnyx = resolved.telnyx ?? {};
|
||||
resolved.telnyx.apiKey =
|
||||
resolved.telnyx.apiKey ?? process.env.TELNYX_API_KEY;
|
||||
resolved.telnyx.connectionId =
|
||||
resolved.telnyx.connectionId ?? process.env.TELNYX_CONNECTION_ID;
|
||||
resolved.telnyx.publicKey =
|
||||
resolved.telnyx.publicKey ?? process.env.TELNYX_PUBLIC_KEY;
|
||||
}
|
||||
|
||||
// Twilio
|
||||
if (resolved.provider === "twilio") {
|
||||
resolved.twilio = resolved.twilio ?? {};
|
||||
resolved.twilio.accountSid =
|
||||
resolved.twilio.accountSid ?? process.env.TWILIO_ACCOUNT_SID;
|
||||
resolved.twilio.authToken =
|
||||
resolved.twilio.authToken ?? process.env.TWILIO_AUTH_TOKEN;
|
||||
}
|
||||
|
||||
// Plivo
|
||||
if (resolved.provider === "plivo") {
|
||||
resolved.plivo = resolved.plivo ?? {};
|
||||
resolved.plivo.authId =
|
||||
resolved.plivo.authId ?? process.env.PLIVO_AUTH_ID;
|
||||
resolved.plivo.authToken =
|
||||
resolved.plivo.authToken ?? process.env.PLIVO_AUTH_TOKEN;
|
||||
}
|
||||
|
||||
// Tunnel Config
|
||||
resolved.tunnel = resolved.tunnel ?? {
|
||||
provider: "none",
|
||||
allowNgrokFreeTierLoopbackBypass: false,
|
||||
};
|
||||
resolved.tunnel.allowNgrokFreeTierLoopbackBypass =
|
||||
resolved.tunnel.allowNgrokFreeTierLoopbackBypass ||
|
||||
resolved.tunnel.allowNgrokFreeTier ||
|
||||
false;
|
||||
resolved.tunnel.ngrokAuthToken =
|
||||
resolved.tunnel.ngrokAuthToken ?? process.env.NGROK_AUTHTOKEN;
|
||||
resolved.tunnel.ngrokDomain =
|
||||
resolved.tunnel.ngrokDomain ?? process.env.NGROK_DOMAIN;
|
||||
|
||||
return resolved;
|
||||
}
|
||||
|
||||
/**
|
||||
* Validate that the configuration has all required fields for the selected provider.
|
||||
*/
|
||||
export function validateProviderConfig(config: VoiceCallConfig): {
|
||||
valid: boolean;
|
||||
errors: string[];
|
||||
} {
|
||||
const errors: string[] = [];
|
||||
|
||||
if (!config.enabled) {
|
||||
return { valid: true, errors: [] };
|
||||
}
|
||||
|
||||
if (!config.provider) {
|
||||
errors.push("plugins.entries.voice-call.config.provider is required");
|
||||
}
|
||||
|
||||
if (!config.fromNumber && config.provider !== "mock") {
|
||||
errors.push("plugins.entries.voice-call.config.fromNumber is required");
|
||||
}
|
||||
|
||||
if (config.provider === "telnyx") {
|
||||
if (!config.telnyx?.apiKey) {
|
||||
errors.push(
|
||||
"plugins.entries.voice-call.config.telnyx.apiKey is required (or set TELNYX_API_KEY env)",
|
||||
);
|
||||
}
|
||||
if (!config.telnyx?.connectionId) {
|
||||
errors.push(
|
||||
"plugins.entries.voice-call.config.telnyx.connectionId is required (or set TELNYX_CONNECTION_ID env)",
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if (config.provider === "twilio") {
|
||||
if (!config.twilio?.accountSid) {
|
||||
errors.push(
|
||||
"plugins.entries.voice-call.config.twilio.accountSid is required (or set TWILIO_ACCOUNT_SID env)",
|
||||
);
|
||||
}
|
||||
if (!config.twilio?.authToken) {
|
||||
errors.push(
|
||||
"plugins.entries.voice-call.config.twilio.authToken is required (or set TWILIO_AUTH_TOKEN env)",
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if (config.provider === "plivo") {
|
||||
if (!config.plivo?.authId) {
|
||||
errors.push(
|
||||
"plugins.entries.voice-call.config.plivo.authId is required (or set PLIVO_AUTH_ID env)",
|
||||
);
|
||||
}
|
||||
if (!config.plivo?.authToken) {
|
||||
errors.push(
|
||||
"plugins.entries.voice-call.config.plivo.authToken is required (or set PLIVO_AUTH_TOKEN env)",
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
return { valid: errors.length === 0, errors };
|
||||
}
|
||||
@@ -0,0 +1,198 @@
|
||||
import fs from "node:fs";
|
||||
import path from "node:path";
|
||||
import { fileURLToPath, pathToFileURL } from "node:url";
|
||||
|
||||
import type { VoiceCallTtsConfig } from "./config.js";
|
||||
|
||||
/**
 * Loosely-typed view of the core Moltbot configuration. Only the fields this
 * plugin actually reads are modeled; everything else passes through via the
 * index signature.
 */
export type CoreConfig = {
  session?: {
    /** Path of the core session store (consumed by resolveStorePath). */
    store?: string;
  };
  messages?: {
    /** Core TTS settings that the plugin's tts override deep-merges with. */
    tts?: VoiceCallTtsConfig;
  };
  [key: string]: unknown;
};
|
||||
|
||||
/**
 * Shapes of the core helpers this plugin consumes via dynamic import
 * (see loadCoreAgentDeps). Declared locally because the modules are loaded
 * at runtime from the core dist bundle rather than imported statically.
 */
type CoreAgentDeps = {
  resolveAgentDir: (cfg: CoreConfig, agentId: string) => string;
  resolveAgentWorkspaceDir: (cfg: CoreConfig, agentId: string) => string;
  resolveAgentIdentity: (
    cfg: CoreConfig,
    agentId: string,
  ) => { name?: string | null } | null | undefined;
  resolveThinkingDefault: (params: {
    cfg: CoreConfig;
    provider?: string;
    model?: string;
  }) => string;
  /** Runs one embedded agent turn; payloads carry the generated text. */
  runEmbeddedPiAgent: (params: {
    sessionId: string;
    sessionKey?: string;
    messageProvider?: string;
    sessionFile: string;
    workspaceDir: string;
    config?: CoreConfig;
    prompt: string;
    provider?: string;
    model?: string;
    thinkLevel?: string;
    verboseLevel?: string;
    timeoutMs: number;
    runId: string;
    lane?: string;
    extraSystemPrompt?: string;
    agentDir?: string;
  }) => Promise<{
    payloads?: Array<{ text?: string; isError?: boolean }>;
    meta?: { aborted?: boolean };
  }>;
  resolveAgentTimeoutMs: (opts: { cfg: CoreConfig }) => number;
  ensureAgentWorkspace: (params?: { dir: string }) => Promise<void>;
  resolveStorePath: (store?: string, opts?: { agentId?: string }) => string;
  loadSessionStore: (storePath: string) => Record<string, unknown>;
  saveSessionStore: (
    storePath: string,
    store: Record<string, unknown>,
  ) => Promise<void>;
  resolveSessionFilePath: (
    sessionId: string,
    entry: unknown,
    opts?: { agentId?: string },
  ) => string;
  DEFAULT_MODEL: string;
  DEFAULT_PROVIDER: string;
};
|
||||
|
||||
// Cached core package root, set on first successful resolveMoltbotRoot().
let coreRootCache: string | null = null;
// Memoized loader promise so core modules are imported at most once.
let coreDepsPromise: Promise<CoreAgentDeps> | null = null;
|
||||
|
||||
function findPackageRoot(startDir: string, name: string): string | null {
|
||||
let dir = startDir;
|
||||
for (;;) {
|
||||
const pkgPath = path.join(dir, "package.json");
|
||||
try {
|
||||
if (fs.existsSync(pkgPath)) {
|
||||
const raw = fs.readFileSync(pkgPath, "utf8");
|
||||
const pkg = JSON.parse(raw) as { name?: string };
|
||||
if (pkg.name === name) return dir;
|
||||
}
|
||||
} catch {
|
||||
// ignore parse errors and keep walking
|
||||
}
|
||||
const parent = path.dirname(dir);
|
||||
if (parent === dir) return null;
|
||||
dir = parent;
|
||||
}
|
||||
}
|
||||
|
||||
function resolveMoltbotRoot(): string {
|
||||
if (coreRootCache) return coreRootCache;
|
||||
const override = process.env.MOLTBOT_ROOT?.trim() || process.env.CLAWDBOT_ROOT?.trim();
|
||||
if (override) {
|
||||
coreRootCache = override;
|
||||
return override;
|
||||
}
|
||||
|
||||
const candidates = new Set<string>();
|
||||
if (process.argv[1]) {
|
||||
candidates.add(path.dirname(process.argv[1]));
|
||||
}
|
||||
candidates.add(process.cwd());
|
||||
try {
|
||||
const urlPath = fileURLToPath(import.meta.url);
|
||||
candidates.add(path.dirname(urlPath));
|
||||
} catch {
|
||||
// ignore
|
||||
}
|
||||
|
||||
for (const start of candidates) {
|
||||
for (const name of ["moltbot", "moltbot"]) {
|
||||
const found = findPackageRoot(start, name);
|
||||
if (found) {
|
||||
coreRootCache = found;
|
||||
return found;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
throw new Error(
|
||||
"Unable to resolve core root. Set MOLTBOT_ROOT (or legacy CLAWDBOT_ROOT) to the package root.",
|
||||
);
|
||||
}
|
||||
|
||||
async function importCoreModule<T>(relativePath: string): Promise<T> {
|
||||
const root = resolveMoltbotRoot();
|
||||
const distPath = path.join(root, "dist", relativePath);
|
||||
if (!fs.existsSync(distPath)) {
|
||||
throw new Error(
|
||||
`Missing core module at ${distPath}. Run \`pnpm build\` or install the official package.`,
|
||||
);
|
||||
}
|
||||
return (await import(pathToFileURL(distPath).href)) as T;
|
||||
}
|
||||
|
||||
/**
 * Load every core helper this plugin needs, importing the core dist modules
 * in parallel. The promise is memoized in coreDepsPromise so all callers
 * share a single import pass.
 *
 * NOTE(review): a failed import leaves the rejected promise cached, so later
 * callers re-reject without retrying — confirm that is intended.
 */
export async function loadCoreAgentDeps(): Promise<CoreAgentDeps> {
  if (coreDepsPromise) return coreDepsPromise;

  coreDepsPromise = (async () => {
    // Destructuring order must match the Promise.all array order below.
    const [
      agentScope,
      defaults,
      identity,
      modelSelection,
      piEmbedded,
      timeout,
      workspace,
      sessions,
    ] = await Promise.all([
      importCoreModule<{
        resolveAgentDir: CoreAgentDeps["resolveAgentDir"];
        resolveAgentWorkspaceDir: CoreAgentDeps["resolveAgentWorkspaceDir"];
      }>("agents/agent-scope.js"),
      importCoreModule<{
        DEFAULT_MODEL: string;
        DEFAULT_PROVIDER: string;
      }>("agents/defaults.js"),
      importCoreModule<{
        resolveAgentIdentity: CoreAgentDeps["resolveAgentIdentity"];
      }>("agents/identity.js"),
      importCoreModule<{
        resolveThinkingDefault: CoreAgentDeps["resolveThinkingDefault"];
      }>("agents/model-selection.js"),
      importCoreModule<{
        runEmbeddedPiAgent: CoreAgentDeps["runEmbeddedPiAgent"];
      }>("agents/pi-embedded.js"),
      importCoreModule<{
        resolveAgentTimeoutMs: CoreAgentDeps["resolveAgentTimeoutMs"];
      }>("agents/timeout.js"),
      importCoreModule<{
        ensureAgentWorkspace: CoreAgentDeps["ensureAgentWorkspace"];
      }>("agents/workspace.js"),
      importCoreModule<{
        resolveStorePath: CoreAgentDeps["resolveStorePath"];
        loadSessionStore: CoreAgentDeps["loadSessionStore"];
        saveSessionStore: CoreAgentDeps["saveSessionStore"];
        resolveSessionFilePath: CoreAgentDeps["resolveSessionFilePath"];
      }>("config/sessions.js"),
    ]);

    // Flatten the per-module namespaces into one dependency bag.
    return {
      resolveAgentDir: agentScope.resolveAgentDir,
      resolveAgentWorkspaceDir: agentScope.resolveAgentWorkspaceDir,
      resolveAgentIdentity: identity.resolveAgentIdentity,
      resolveThinkingDefault: modelSelection.resolveThinkingDefault,
      runEmbeddedPiAgent: piEmbedded.runEmbeddedPiAgent,
      resolveAgentTimeoutMs: timeout.resolveAgentTimeoutMs,
      ensureAgentWorkspace: workspace.ensureAgentWorkspace,
      resolveStorePath: sessions.resolveStorePath,
      loadSessionStore: sessions.loadSessionStore,
      saveSessionStore: sessions.saveSessionStore,
      resolveSessionFilePath: sessions.resolveSessionFilePath,
      DEFAULT_MODEL: defaults.DEFAULT_MODEL,
      DEFAULT_PROVIDER: defaults.DEFAULT_PROVIDER,
    };
  })();

  return coreDepsPromise;
}
|
||||
@@ -0,0 +1,108 @@
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
|
||||
import { describe, expect, it } from "vitest";
|
||||
|
||||
import { VoiceCallConfigSchema } from "./config.js";
|
||||
import { CallManager } from "./manager.js";
|
||||
import type {
|
||||
HangupCallInput,
|
||||
InitiateCallInput,
|
||||
InitiateCallResult,
|
||||
PlayTtsInput,
|
||||
ProviderWebhookParseResult,
|
||||
StartListeningInput,
|
||||
StopListeningInput,
|
||||
WebhookContext,
|
||||
WebhookVerificationResult,
|
||||
} from "./types.js";
|
||||
import type { VoiceCallProvider } from "./providers/base.js";
|
||||
|
||||
class FakeProvider implements VoiceCallProvider {
|
||||
readonly name = "plivo" as const;
|
||||
readonly playTtsCalls: PlayTtsInput[] = [];
|
||||
|
||||
verifyWebhook(_ctx: WebhookContext): WebhookVerificationResult {
|
||||
return { ok: true };
|
||||
}
|
||||
parseWebhookEvent(_ctx: WebhookContext): ProviderWebhookParseResult {
|
||||
return { events: [], statusCode: 200 };
|
||||
}
|
||||
async initiateCall(_input: InitiateCallInput): Promise<InitiateCallResult> {
|
||||
return { providerCallId: "request-uuid", status: "initiated" };
|
||||
}
|
||||
async hangupCall(_input: HangupCallInput): Promise<void> {}
|
||||
async playTts(input: PlayTtsInput): Promise<void> {
|
||||
this.playTtsCalls.push(input);
|
||||
}
|
||||
async startListening(_input: StartListeningInput): Promise<void> {}
|
||||
async stopListening(_input: StopListeningInput): Promise<void> {}
|
||||
}
|
||||
|
||||
describe("CallManager", () => {
  it("upgrades providerCallId mapping when provider ID changes", async () => {
    const config = VoiceCallConfigSchema.parse({
      enabled: true,
      provider: "plivo",
      fromNumber: "+15550000000",
    });

    // Per-run store dir so persisted call records don't leak between runs.
    const storePath = path.join(os.tmpdir(), `moltbot-voice-call-test-${Date.now()}`);
    const manager = new CallManager(config, storePath);
    manager.initialize(new FakeProvider(), "https://example.com/voice/webhook");

    const { callId, success, error } = await manager.initiateCall("+15550000001");
    expect(success).toBe(true);
    expect(error).toBeUndefined();

    // The provider returned a request UUID as the initial providerCallId.
    expect(manager.getCall(callId)?.providerCallId).toBe("request-uuid");
    expect(manager.getCallByProviderCallId("request-uuid")?.callId).toBe(callId);

    // Provider later reports the actual call UUID.
    manager.processEvent({
      id: "evt-1",
      type: "call.answered",
      callId,
      providerCallId: "call-uuid",
      timestamp: Date.now(),
    });

    // Mapping must follow the new UUID, and the stale one must be dropped.
    expect(manager.getCall(callId)?.providerCallId).toBe("call-uuid");
    expect(manager.getCallByProviderCallId("call-uuid")?.callId).toBe(callId);
    expect(manager.getCallByProviderCallId("request-uuid")).toBeUndefined();
  });

  it("speaks initial message on answered for notify mode (non-Twilio)", async () => {
    const config = VoiceCallConfigSchema.parse({
      enabled: true,
      provider: "plivo",
      fromNumber: "+15550000000",
    });

    const storePath = path.join(os.tmpdir(), `moltbot-voice-call-test-${Date.now()}`);
    const provider = new FakeProvider();
    const manager = new CallManager(config, storePath);
    manager.initialize(provider, "https://example.com/voice/webhook");

    const { callId, success } = await manager.initiateCall(
      "+15550000002",
      undefined,
      { message: "Hello there", mode: "notify" },
    );
    expect(success).toBe(true);

    manager.processEvent({
      id: "evt-2",
      type: "call.answered",
      callId,
      providerCallId: "call-uuid",
      timestamp: Date.now(),
    });

    // Yield to the event loop so the async TTS kick-off can run.
    await new Promise((resolve) => setTimeout(resolve, 0));

    expect(provider.playTtsCalls).toHaveLength(1);
    expect(provider.playTtsCalls[0]?.text).toBe("Hello there");
  });
});
|
||||
876
docker-compose/ez-assistant/extensions/voice-call/src/manager.ts
Normal file
876
docker-compose/ez-assistant/extensions/voice-call/src/manager.ts
Normal file
@@ -0,0 +1,876 @@
|
||||
import crypto from "node:crypto";
|
||||
import fs from "node:fs";
|
||||
import fsp from "node:fs/promises";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
|
||||
import { resolveUserPath } from "./utils.js";
|
||||
import type { CallMode, VoiceCallConfig } from "./config.js";
|
||||
import type { VoiceCallProvider } from "./providers/base.js";
|
||||
import {
|
||||
type CallId,
|
||||
type CallRecord,
|
||||
CallRecordSchema,
|
||||
type CallState,
|
||||
type NormalizedEvent,
|
||||
type OutboundCallOptions,
|
||||
TerminalStates,
|
||||
type TranscriptEntry,
|
||||
} from "./types.js";
|
||||
import { escapeXml, mapVoiceToPolly } from "./voice-mapping.js";
|
||||
|
||||
/**
|
||||
* Manages voice calls: state machine, persistence, and provider coordination.
|
||||
*/
|
||||
export class CallManager {
|
||||
  /** All non-terminal calls, keyed by our internal call ID. */
  private activeCalls = new Map<CallId, CallRecord>();
  private providerCallIdMap = new Map<string, CallId>(); // providerCallId -> internal callId
  /** Event IDs already handled, for idempotent webhook processing. */
  private processedEventIds = new Set<string>();
  private provider: VoiceCallProvider | null = null;
  private config: VoiceCallConfig;
  /** Directory where call records are persisted (tilde-expanded). */
  private storePath: string;
  private webhookUrl: string | null = null;
  /** Pending promises resolved when a user transcript arrives (or rejected on timeout). */
  private transcriptWaiters = new Map<
    CallId,
    {
      resolve: (text: string) => void;
      reject: (err: Error) => void;
      timeout: NodeJS.Timeout;
    }
  >();
  /** Max duration timers to auto-hangup calls after configured timeout */
  private maxDurationTimers = new Map<CallId, NodeJS.Timeout>();
|
||||
|
||||
  /**
   * @param config - Parsed voice-call plugin configuration.
   * @param storePath - Optional override for the call-log directory; falls
   *   back to config.store, then ~/clawd/voice-calls.
   *   NOTE(review): default dir uses the legacy "clawd" name — confirm intended.
   */
  constructor(config: VoiceCallConfig, storePath?: string) {
    this.config = config;
    // Resolve store path with tilde expansion (like other config values)
    const rawPath =
      storePath ||
      config.store ||
      path.join(os.homedir(), "clawd", "voice-calls");
    this.storePath = resolveUserPath(rawPath);
  }
|
||||
|
||||
/**
|
||||
* Initialize the call manager with a provider.
|
||||
*/
|
||||
initialize(provider: VoiceCallProvider, webhookUrl: string): void {
|
||||
this.provider = provider;
|
||||
this.webhookUrl = webhookUrl;
|
||||
|
||||
// Ensure store directory exists
|
||||
fs.mkdirSync(this.storePath, { recursive: true });
|
||||
|
||||
// Load any persisted active calls
|
||||
this.loadActiveCalls();
|
||||
}
|
||||
|
||||
  /**
   * Get the current provider.
   * @returns The provider set via initialize(), or null before initialization.
   */
  getProvider(): VoiceCallProvider | null {
    return this.provider;
  }
|
||||
|
||||
  /**
   * Initiate an outbound call.
   *
   * Early failures return an empty callId; once a call record has been
   * created, the generated callId is returned even on failure so callers can
   * look up the persisted "failed" record.
   *
   * @param to - The phone number to call
   * @param sessionKey - Optional session key for context
   * @param options - Optional call options (message, mode), or a bare string
   *   treated as the initial message (legacy form)
   */
  async initiateCall(
    to: string,
    sessionKey?: string,
    options?: OutboundCallOptions | string,
  ): Promise<{ callId: CallId; success: boolean; error?: string }> {
    // Support legacy string argument for initialMessage
    const opts: OutboundCallOptions =
      typeof options === "string" ? { message: options } : (options ?? {});
    const initialMessage = opts.message;
    const mode = opts.mode ?? this.config.outbound.defaultMode;
    if (!this.provider) {
      return { callId: "", success: false, error: "Provider not initialized" };
    }

    if (!this.webhookUrl) {
      return {
        callId: "",
        success: false,
        error: "Webhook URL not configured",
      };
    }

    // Check concurrent call limit
    const activeCalls = this.getActiveCalls();
    if (activeCalls.length >= this.config.maxConcurrentCalls) {
      return {
        callId: "",
        success: false,
        error: `Maximum concurrent calls (${this.config.maxConcurrentCalls}) reached`,
      };
    }

    const callId = crypto.randomUUID();
    // The mock provider gets a synthetic caller number so tests don't need
    // fromNumber configured.
    const from =
      this.config.fromNumber ||
      (this.provider?.name === "mock" ? "+15550000000" : undefined);
    if (!from) {
      return { callId: "", success: false, error: "fromNumber not configured" };
    }

    // Create call record with mode in metadata
    const callRecord: CallRecord = {
      callId,
      provider: this.provider.name,
      direction: "outbound",
      state: "initiated",
      from,
      to,
      sessionKey,
      startedAt: Date.now(),
      transcript: [],
      processedEventIds: [],
      metadata: {
        // initialMessage is only stored when present; it is consumed (and
        // deleted) by speakInitialMessage.
        ...(initialMessage && { initialMessage }),
        mode,
      },
    };

    // Persist before contacting the provider so a crash still leaves a record.
    this.activeCalls.set(callId, callRecord);
    this.persistCallRecord(callRecord);

    try {
      // For notify mode with a message, use inline TwiML with <Say>
      let inlineTwiml: string | undefined;
      if (mode === "notify" && initialMessage) {
        const pollyVoice = mapVoiceToPolly(this.config.tts?.openai?.voice);
        inlineTwiml = this.generateNotifyTwiml(initialMessage, pollyVoice);
        console.log(
          `[voice-call] Using inline TwiML for notify mode (voice: ${pollyVoice})`,
        );
      }

      const result = await this.provider.initiateCall({
        callId,
        from,
        to,
        webhookUrl: this.webhookUrl,
        inlineTwiml,
      });

      callRecord.providerCallId = result.providerCallId;
      this.providerCallIdMap.set(result.providerCallId, callId); // Map providerCallId to internal callId
      this.persistCallRecord(callRecord);

      return { callId, success: true };
    } catch (err) {
      // Mark the record failed on disk, then drop it from the in-memory maps.
      callRecord.state = "failed";
      callRecord.endedAt = Date.now();
      callRecord.endReason = "failed";
      this.persistCallRecord(callRecord);
      this.activeCalls.delete(callId);
      if (callRecord.providerCallId) {
        this.providerCallIdMap.delete(callRecord.providerCallId);
      }

      return {
        callId,
        success: false,
        error: err instanceof Error ? err.message : String(err),
      };
    }
  }
|
||||
|
||||
  /**
   * Speak to user in an active call: transition the call to "speaking",
   * record the text in the transcript, and delegate playback to the
   * provider's TTS.
   */
  async speak(
    callId: CallId,
    text: string,
  ): Promise<{ success: boolean; error?: string }> {
    const call = this.activeCalls.get(callId);
    if (!call) {
      return { success: false, error: "Call not found" };
    }

    if (!this.provider || !call.providerCallId) {
      return { success: false, error: "Call not connected" };
    }

    if (TerminalStates.has(call.state)) {
      return { success: false, error: "Call has ended" };
    }

    try {
      // Update state
      call.state = "speaking";
      this.persistCallRecord(call);

      // Add to transcript
      this.addTranscriptEntry(call, "bot", text);

      // Play TTS — only Twilio receives an explicit voice here; other
      // providers get undefined.
      const voice =
        this.provider?.name === "twilio" ? this.config.tts?.openai?.voice : undefined;
      await this.provider.playTts({
        callId,
        providerCallId: call.providerCallId,
        text,
        voice,
      });

      return { success: true };
    } catch (err) {
      return {
        success: false,
        error: err instanceof Error ? err.message : String(err),
      };
    }
  }
|
||||
|
||||
/**
|
||||
* Speak the initial message for a call (called when media stream connects).
|
||||
* This is used to auto-play the message passed to initiateCall.
|
||||
* In notify mode, auto-hangup after the message is delivered.
|
||||
*/
|
||||
async speakInitialMessage(providerCallId: string): Promise<void> {
|
||||
const call = this.getCallByProviderCallId(providerCallId);
|
||||
if (!call) {
|
||||
console.warn(
|
||||
`[voice-call] speakInitialMessage: no call found for ${providerCallId}`,
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
const initialMessage = call.metadata?.initialMessage as string | undefined;
|
||||
const mode = (call.metadata?.mode as CallMode) ?? "conversation";
|
||||
|
||||
if (!initialMessage) {
|
||||
console.log(
|
||||
`[voice-call] speakInitialMessage: no initial message for ${call.callId}`,
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
// Clear the initial message so we don't speak it again
|
||||
if (call.metadata) {
|
||||
delete call.metadata.initialMessage;
|
||||
this.persistCallRecord(call);
|
||||
}
|
||||
|
||||
console.log(
|
||||
`[voice-call] Speaking initial message for call ${call.callId} (mode: ${mode})`,
|
||||
);
|
||||
const result = await this.speak(call.callId, initialMessage);
|
||||
if (!result.success) {
|
||||
console.warn(
|
||||
`[voice-call] Failed to speak initial message: ${result.error}`,
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
// In notify mode, auto-hangup after delay
|
||||
if (mode === "notify") {
|
||||
const delaySec = this.config.outbound.notifyHangupDelaySec;
|
||||
console.log(
|
||||
`[voice-call] Notify mode: auto-hangup in ${delaySec}s for call ${call.callId}`,
|
||||
);
|
||||
setTimeout(async () => {
|
||||
const currentCall = this.getCall(call.callId);
|
||||
if (currentCall && !TerminalStates.has(currentCall.state)) {
|
||||
console.log(
|
||||
`[voice-call] Notify mode: hanging up call ${call.callId}`,
|
||||
);
|
||||
await this.endCall(call.callId);
|
||||
}
|
||||
}, delaySec * 1000);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Start max duration timer for a call.
|
||||
* Auto-hangup when maxDurationSeconds is reached.
|
||||
*/
|
||||
private startMaxDurationTimer(callId: CallId): void {
|
||||
// Clear any existing timer
|
||||
this.clearMaxDurationTimer(callId);
|
||||
|
||||
const maxDurationMs = this.config.maxDurationSeconds * 1000;
|
||||
console.log(
|
||||
`[voice-call] Starting max duration timer (${this.config.maxDurationSeconds}s) for call ${callId}`,
|
||||
);
|
||||
|
||||
const timer = setTimeout(async () => {
|
||||
this.maxDurationTimers.delete(callId);
|
||||
const call = this.getCall(callId);
|
||||
if (call && !TerminalStates.has(call.state)) {
|
||||
console.log(
|
||||
`[voice-call] Max duration reached (${this.config.maxDurationSeconds}s), ending call ${callId}`,
|
||||
);
|
||||
call.endReason = "timeout";
|
||||
this.persistCallRecord(call);
|
||||
await this.endCall(callId);
|
||||
}
|
||||
}, maxDurationMs);
|
||||
|
||||
this.maxDurationTimers.set(callId, timer);
|
||||
}
|
||||
|
||||
/**
|
||||
* Clear max duration timer for a call.
|
||||
*/
|
||||
private clearMaxDurationTimer(callId: CallId): void {
|
||||
const timer = this.maxDurationTimers.get(callId);
|
||||
if (timer) {
|
||||
clearTimeout(timer);
|
||||
this.maxDurationTimers.delete(callId);
|
||||
}
|
||||
}
|
||||
|
||||
private clearTranscriptWaiter(callId: CallId): void {
|
||||
const waiter = this.transcriptWaiters.get(callId);
|
||||
if (!waiter) return;
|
||||
clearTimeout(waiter.timeout);
|
||||
this.transcriptWaiters.delete(callId);
|
||||
}
|
||||
|
||||
private rejectTranscriptWaiter(callId: CallId, reason: string): void {
|
||||
const waiter = this.transcriptWaiters.get(callId);
|
||||
if (!waiter) return;
|
||||
this.clearTranscriptWaiter(callId);
|
||||
waiter.reject(new Error(reason));
|
||||
}
|
||||
|
||||
private resolveTranscriptWaiter(callId: CallId, transcript: string): void {
|
||||
const waiter = this.transcriptWaiters.get(callId);
|
||||
if (!waiter) return;
|
||||
this.clearTranscriptWaiter(callId);
|
||||
waiter.resolve(transcript);
|
||||
}
|
||||
|
||||
private waitForFinalTranscript(callId: CallId): Promise<string> {
|
||||
// Only allow one in-flight waiter per call.
|
||||
this.rejectTranscriptWaiter(callId, "Transcript waiter replaced");
|
||||
|
||||
const timeoutMs = this.config.transcriptTimeoutMs;
|
||||
return new Promise((resolve, reject) => {
|
||||
const timeout = setTimeout(() => {
|
||||
this.transcriptWaiters.delete(callId);
|
||||
reject(
|
||||
new Error(`Timed out waiting for transcript after ${timeoutMs}ms`),
|
||||
);
|
||||
}, timeoutMs);
|
||||
|
||||
this.transcriptWaiters.set(callId, { resolve, reject, timeout });
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Continue call: speak prompt, then wait for user's final transcript.
|
||||
*/
|
||||
async continueCall(
|
||||
callId: CallId,
|
||||
prompt: string,
|
||||
): Promise<{ success: boolean; transcript?: string; error?: string }> {
|
||||
const call = this.activeCalls.get(callId);
|
||||
if (!call) {
|
||||
return { success: false, error: "Call not found" };
|
||||
}
|
||||
|
||||
if (!this.provider || !call.providerCallId) {
|
||||
return { success: false, error: "Call not connected" };
|
||||
}
|
||||
|
||||
if (TerminalStates.has(call.state)) {
|
||||
return { success: false, error: "Call has ended" };
|
||||
}
|
||||
|
||||
try {
|
||||
await this.speak(callId, prompt);
|
||||
|
||||
call.state = "listening";
|
||||
this.persistCallRecord(call);
|
||||
|
||||
await this.provider.startListening({
|
||||
callId,
|
||||
providerCallId: call.providerCallId,
|
||||
});
|
||||
|
||||
const transcript = await this.waitForFinalTranscript(callId);
|
||||
|
||||
// Best-effort: stop listening after final transcript.
|
||||
await this.provider.stopListening({
|
||||
callId,
|
||||
providerCallId: call.providerCallId,
|
||||
});
|
||||
|
||||
return { success: true, transcript };
|
||||
} catch (err) {
|
||||
return {
|
||||
success: false,
|
||||
error: err instanceof Error ? err.message : String(err),
|
||||
};
|
||||
} finally {
|
||||
this.clearTranscriptWaiter(callId);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* End an active call.
|
||||
*/
|
||||
async endCall(callId: CallId): Promise<{ success: boolean; error?: string }> {
|
||||
const call = this.activeCalls.get(callId);
|
||||
if (!call) {
|
||||
return { success: false, error: "Call not found" };
|
||||
}
|
||||
|
||||
if (!this.provider || !call.providerCallId) {
|
||||
return { success: false, error: "Call not connected" };
|
||||
}
|
||||
|
||||
if (TerminalStates.has(call.state)) {
|
||||
return { success: true }; // Already ended
|
||||
}
|
||||
|
||||
try {
|
||||
await this.provider.hangupCall({
|
||||
callId,
|
||||
providerCallId: call.providerCallId,
|
||||
reason: "hangup-bot",
|
||||
});
|
||||
|
||||
call.state = "hangup-bot";
|
||||
call.endedAt = Date.now();
|
||||
call.endReason = "hangup-bot";
|
||||
this.persistCallRecord(call);
|
||||
this.clearMaxDurationTimer(callId);
|
||||
this.rejectTranscriptWaiter(callId, "Call ended: hangup-bot");
|
||||
this.activeCalls.delete(callId);
|
||||
if (call.providerCallId) {
|
||||
this.providerCallIdMap.delete(call.providerCallId);
|
||||
}
|
||||
|
||||
return { success: true };
|
||||
} catch (err) {
|
||||
return {
|
||||
success: false,
|
||||
error: err instanceof Error ? err.message : String(err),
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Check if an inbound call should be accepted based on policy
 * ("disabled" | "open" | "allowlist" | "pairing"). Allowlist matching
 * compares digits-only suffixes in both directions.
 */
private shouldAcceptInbound(from: string | undefined): boolean {
  const { inboundPolicy: policy, allowFrom } = this.config;

  switch (policy) {
    case "disabled":
      console.log("[voice-call] Inbound call rejected: policy is disabled");
      return false;

    case "open":
      console.log("[voice-call] Inbound call accepted: policy is open");
      return true;

    case "allowlist":
    case "pairing": {
      const normalized = from?.replace(/\D/g, "") || "";
      // Bug fix: `x.endsWith("")` is always true, so a missing/non-numeric
      // caller number — or an allow entry with no digits — used to match
      // every entry and accept ANY caller. Require both sides non-empty.
      const allowed =
        normalized.length > 0 &&
        (allowFrom || []).some((num) => {
          const normalizedAllow = num.replace(/\D/g, "");
          if (!normalizedAllow) return false;
          return (
            normalized.endsWith(normalizedAllow) ||
            normalizedAllow.endsWith(normalized)
          );
        });
      const status = allowed ? "accepted" : "rejected";
      console.log(
        `[voice-call] Inbound call ${status}: ${from} ${allowed ? "is in" : "not in"} allowlist`,
      );
      return allowed;
    }

    default:
      return false;
  }
}
|
||||
|
||||
/**
 * Register a CallRecord for a newly-seen inbound call, index it by both
 * internal callId and provider call ID, and persist it. The configured
 * inbound greeting (or a default) is queued as the initial message.
 */
private createInboundCall(
  providerCallId: string,
  from: string,
  to: string,
): CallRecord {
  const callId = crypto.randomUUID();
  const greeting =
    this.config.inboundGreeting || "Hello! How can I help you today?";

  const record: CallRecord = {
    callId,
    providerCallId,
    provider: this.provider?.name || "twilio",
    direction: "inbound",
    state: "ringing",
    from,
    to,
    startedAt: Date.now(),
    transcript: [],
    processedEventIds: [],
    metadata: { initialMessage: greeting },
  };

  this.activeCalls.set(callId, record);
  // Index by provider ID so later webhook events route to this record.
  this.providerCallIdMap.set(providerCallId, callId);
  this.persistCallRecord(record);

  console.log(
    `[voice-call] Created inbound call record: ${callId} from ${from}`,
  );
  return record;
}
|
||||
|
||||
/**
|
||||
* Look up a call by either internal callId or providerCallId.
|
||||
*/
|
||||
private findCall(callIdOrProviderCallId: string): CallRecord | undefined {
|
||||
// Try direct lookup by internal callId
|
||||
const directCall = this.activeCalls.get(callIdOrProviderCallId);
|
||||
if (directCall) return directCall;
|
||||
|
||||
// Try lookup by providerCallId
|
||||
return this.getCallByProviderCallId(callIdOrProviderCallId);
|
||||
}
|
||||
|
||||
/**
 * Process a webhook event: dedupe by event id, resolve (or create, for
 * accepted inbound calls) the call record, keep the providerCallId index in
 * sync, advance the state machine, and persist the updated record.
 */
processEvent(event: NormalizedEvent): void {
  // Idempotency check: each event id is handled at most once, process-wide.
  if (this.processedEventIds.has(event.id)) {
    return;
  }
  this.processedEventIds.add(event.id);

  let call = this.findCall(event.callId);

  // Handle inbound calls - create record if it doesn't exist
  if (!call && event.direction === "inbound" && event.providerCallId) {
    // Check if we should accept this inbound call
    if (!this.shouldAcceptInbound(event.from)) {
      // TODO: Could hang up the call here
      return;
    }

    // Create a new call record for this inbound call
    call = this.createInboundCall(
      event.providerCallId,
      event.from || "unknown",
      event.to || this.config.fromNumber || "unknown",
    );

    // Update the event's callId to use our internal ID
    event.callId = call.callId;
  }

  if (!call) {
    // Still no call record - ignore event
    return;
  }

  // Update provider call ID if we got it; re-point the index and drop the
  // stale mapping only if it still points at this call.
  if (event.providerCallId && event.providerCallId !== call.providerCallId) {
    const previousProviderCallId = call.providerCallId;
    call.providerCallId = event.providerCallId;
    this.providerCallIdMap.set(event.providerCallId, call.callId);
    if (previousProviderCallId) {
      const mapped = this.providerCallIdMap.get(previousProviderCallId);
      if (mapped === call.callId) {
        this.providerCallIdMap.delete(previousProviderCallId);
      }
    }
  }

  // Track processed event on the record itself (restored on crash recovery).
  call.processedEventIds.push(event.id);

  // Process event based on type
  switch (event.type) {
    case "call.initiated":
      this.transitionState(call, "initiated");
      break;

    case "call.ringing":
      this.transitionState(call, "ringing");
      break;

    case "call.answered":
      call.answeredAt = event.timestamp;
      this.transitionState(call, "answered");
      // Start max duration timer when call is answered
      this.startMaxDurationTimer(call.callId);
      // Best-effort: speak initial message (for inbound greetings and outbound
      // conversation mode) once the call is answered.
      this.maybeSpeakInitialMessageOnAnswered(call);
      break;

    case "call.active":
      this.transitionState(call, "active");
      break;

    case "call.speaking":
      this.transitionState(call, "speaking");
      break;

    case "call.speech":
      // Only final transcripts are recorded and delivered to continueCall().
      if (event.isFinal) {
        this.addTranscriptEntry(call, "user", event.transcript);
        this.resolveTranscriptWaiter(call.callId, event.transcript);
      }
      this.transitionState(call, "listening");
      break;

    case "call.ended":
      call.endedAt = event.timestamp;
      call.endReason = event.reason;
      this.transitionState(call, event.reason as CallState);
      this.clearMaxDurationTimer(call.callId);
      this.rejectTranscriptWaiter(call.callId, `Call ended: ${event.reason}`);
      this.activeCalls.delete(call.callId);
      if (call.providerCallId) {
        this.providerCallIdMap.delete(call.providerCallId);
      }
      break;

    case "call.error":
      // Retryable errors leave the call live; non-retryable ones tear it down.
      if (!event.retryable) {
        call.endedAt = event.timestamp;
        call.endReason = "error";
        this.transitionState(call, "error");
        this.clearMaxDurationTimer(call.callId);
        this.rejectTranscriptWaiter(
          call.callId,
          `Call error: ${event.error}`,
        );
        this.activeCalls.delete(call.callId);
        if (call.providerCallId) {
          this.providerCallIdMap.delete(call.providerCallId);
        }
      }
      break;
  }

  // Persist after every event, including the final record of an ended call.
  this.persistCallRecord(call);
}
|
||||
|
||||
private maybeSpeakInitialMessageOnAnswered(call: CallRecord): void {
|
||||
const initialMessage =
|
||||
typeof call.metadata?.initialMessage === "string"
|
||||
? call.metadata.initialMessage.trim()
|
||||
: "";
|
||||
|
||||
if (!initialMessage) return;
|
||||
|
||||
if (!this.provider || !call.providerCallId) return;
|
||||
|
||||
// Twilio has provider-specific state for speaking (<Say> fallback) and can
|
||||
// fail for inbound calls; keep existing Twilio behavior unchanged.
|
||||
if (this.provider.name === "twilio") return;
|
||||
|
||||
void this.speakInitialMessage(call.providerCallId);
|
||||
}
|
||||
|
||||
/**
 * Get an active call by ID.
 * Returns undefined for unknown IDs and for ended calls (records are removed
 * from activeCalls on call.ended / non-retryable call.error / endCall()).
 */
getCall(callId: CallId): CallRecord | undefined {
  return this.activeCalls.get(callId);
}
|
||||
|
||||
/**
 * Get an active call by provider call ID (e.g., Twilio CallSid).
 * O(1) via providerCallIdMap when indexed; otherwise falls back to a linear
 * scan for records whose providerCallId was set without a map entry.
 */
getCallByProviderCallId(providerCallId: string): CallRecord | undefined {
  const mappedId = this.providerCallIdMap.get(providerCallId);
  if (mappedId) {
    return this.activeCalls.get(mappedId);
  }

  for (const candidate of this.activeCalls.values()) {
    if (candidate.providerCallId === providerCallId) {
      return candidate;
    }
  }
  return undefined;
}
|
||||
|
||||
/** Snapshot of all currently-tracked (non-terminal) calls. */
getActiveCalls(): CallRecord[] {
  return [...this.activeCalls.values()];
}
|
||||
|
||||
/**
|
||||
* Get call history (from persisted logs).
|
||||
*/
|
||||
async getCallHistory(limit = 50): Promise<CallRecord[]> {
|
||||
const logPath = path.join(this.storePath, "calls.jsonl");
|
||||
|
||||
try {
|
||||
await fsp.access(logPath);
|
||||
} catch {
|
||||
return [];
|
||||
}
|
||||
|
||||
const content = await fsp.readFile(logPath, "utf-8");
|
||||
const lines = content.trim().split("\n").filter(Boolean);
|
||||
const calls: CallRecord[] = [];
|
||||
|
||||
// Parse last N lines
|
||||
for (const line of lines.slice(-limit)) {
|
||||
try {
|
||||
const parsed = CallRecordSchema.parse(JSON.parse(line));
|
||||
calls.push(parsed);
|
||||
} catch {
|
||||
// Skip invalid lines
|
||||
}
|
||||
}
|
||||
|
||||
return calls;
|
||||
}
|
||||
|
||||
// States that can cycle during multi-turn conversations:
// transitionState() lets a call alternate freely between these two.
private static readonly ConversationStates = new Set<CallState>([
  "speaking",
  "listening",
]);

// Non-terminal state order for monotonic transitions:
// transitionState() only moves a call forward through this list
// (except for the speaking<->listening cycle above and terminal states).
private static readonly StateOrder: readonly CallState[] = [
  "initiated",
  "ringing",
  "answered",
  "active",
  "speaking",
  "listening",
];
|
||||
|
||||
/**
|
||||
* Transition call state with monotonic enforcement.
|
||||
*/
|
||||
private transitionState(call: CallRecord, newState: CallState): void {
|
||||
// No-op for same state or already terminal
|
||||
if (call.state === newState || TerminalStates.has(call.state)) return;
|
||||
|
||||
// Terminal states can always be reached from non-terminal
|
||||
if (TerminalStates.has(newState)) {
|
||||
call.state = newState;
|
||||
return;
|
||||
}
|
||||
|
||||
// Allow cycling between speaking and listening (multi-turn conversations)
|
||||
if (
|
||||
CallManager.ConversationStates.has(call.state) &&
|
||||
CallManager.ConversationStates.has(newState)
|
||||
) {
|
||||
call.state = newState;
|
||||
return;
|
||||
}
|
||||
|
||||
// Only allow forward transitions in state order
|
||||
const currentIndex = CallManager.StateOrder.indexOf(call.state);
|
||||
const newIndex = CallManager.StateOrder.indexOf(newState);
|
||||
|
||||
if (newIndex > currentIndex) {
|
||||
call.state = newState;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add an entry to the call transcript.
|
||||
*/
|
||||
private addTranscriptEntry(
|
||||
call: CallRecord,
|
||||
speaker: "bot" | "user",
|
||||
text: string,
|
||||
): void {
|
||||
const entry: TranscriptEntry = {
|
||||
timestamp: Date.now(),
|
||||
speaker,
|
||||
text,
|
||||
isFinal: true,
|
||||
};
|
||||
call.transcript.push(entry);
|
||||
}
|
||||
|
||||
/**
|
||||
* Persist a call record to disk (fire-and-forget async).
|
||||
*/
|
||||
private persistCallRecord(call: CallRecord): void {
|
||||
const logPath = path.join(this.storePath, "calls.jsonl");
|
||||
const line = `${JSON.stringify(call)}\n`;
|
||||
// Fire-and-forget async write to avoid blocking event loop
|
||||
fsp.appendFile(logPath, line).catch((err) => {
|
||||
console.error("[voice-call] Failed to persist call record:", err);
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Load active calls from persistence (for crash recovery).
 * Reads the whole calls.jsonl log synchronously at startup (the previous
 * "uses streaming" claim was inaccurate); the last record seen per callId
 * wins, and only calls whose latest state is non-terminal are restored,
 * along with their providerCallId mappings and processed-event IDs.
 */
private loadActiveCalls(): void {
  const logPath = path.join(this.storePath, "calls.jsonl");
  if (!fs.existsSync(logPath)) return;

  // Read file synchronously and parse lines
  const content = fs.readFileSync(logPath, "utf-8");
  const lines = content.split("\n");

  // Build map of latest state per call (later lines overwrite earlier ones)
  const callMap = new Map<CallId, CallRecord>();

  for (const line of lines) {
    if (!line.trim()) continue;
    try {
      const call = CallRecordSchema.parse(JSON.parse(line));
      callMap.set(call.callId, call);
    } catch {
      // Skip invalid lines
    }
  }

  // Only keep non-terminal calls
  for (const [callId, call] of callMap) {
    if (!TerminalStates.has(call.state)) {
      this.activeCalls.set(callId, call);
      // Populate providerCallId mapping for lookups
      if (call.providerCallId) {
        this.providerCallIdMap.set(call.providerCallId, callId);
      }
      // Populate processed event IDs (restores webhook idempotency)
      for (const eventId of call.processedEventIds) {
        this.processedEventIds.add(eventId);
      }
    }
  }
}
|
||||
|
||||
/**
|
||||
* Generate TwiML for notify mode (speak message and hang up).
|
||||
*/
|
||||
private generateNotifyTwiml(message: string, voice: string): string {
|
||||
return `<?xml version="1.0" encoding="UTF-8"?>
|
||||
<Response>
|
||||
<Say voice="${voice}">${escapeXml(message)}</Say>
|
||||
<Hangup/>
|
||||
</Response>`;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,21 @@
|
||||
import type { CallId, CallRecord } from "../types.js";
|
||||
import type { VoiceCallConfig } from "../config.js";
|
||||
import type { VoiceCallProvider } from "../providers/base.js";
|
||||
|
||||
/**
 * Pending promise handles for one transcript wait: resolved with the user's
 * final transcript, rejected on call end/error/replacement. `timeout` is the
 * armed rejection timer and must be cleared when the waiter settles.
 */
export type TranscriptWaiter = {
  resolve: (text: string) => void;
  reject: (err: Error) => void;
  timeout: NodeJS.Timeout;
};
|
||||
|
||||
/**
 * Shared mutable state threaded through the call-manager modules
 * (events/lookup/outbound/timers) in place of a class instance.
 */
export type CallManagerContext = {
  activeCalls: Map<CallId, CallRecord>; // keyed by internal callId
  providerCallIdMap: Map<string, CallId>; // provider call ID -> internal callId index
  processedEventIds: Set<string>; // webhook idempotency guard
  provider: VoiceCallProvider | null; // null until a provider is initialized
  config: VoiceCallConfig;
  storePath: string; // directory holding calls.jsonl
  webhookUrl: string | null; // null until the webhook endpoint is configured
  transcriptWaiters: Map<CallId, TranscriptWaiter>; // at most one waiter per call
  maxDurationTimers: Map<CallId, NodeJS.Timeout>; // per-call auto-hangup watchdogs
};
|
||||
@@ -0,0 +1,177 @@
|
||||
import crypto from "node:crypto";
|
||||
|
||||
import type { CallId, CallRecord, CallState, NormalizedEvent } from "../types.js";
|
||||
import { TerminalStates } from "../types.js";
|
||||
import type { CallManagerContext } from "./context.js";
|
||||
import { findCall } from "./lookup.js";
|
||||
import { addTranscriptEntry, transitionState } from "./state.js";
|
||||
import { persistCallRecord } from "./store.js";
|
||||
import {
|
||||
clearMaxDurationTimer,
|
||||
rejectTranscriptWaiter,
|
||||
resolveTranscriptWaiter,
|
||||
startMaxDurationTimer,
|
||||
} from "./timers.js";
|
||||
import { endCall } from "./outbound.js";
|
||||
|
||||
function shouldAcceptInbound(config: CallManagerContext["config"], from: string | undefined): boolean {
|
||||
const { inboundPolicy: policy, allowFrom } = config;
|
||||
|
||||
switch (policy) {
|
||||
case "disabled":
|
||||
console.log("[voice-call] Inbound call rejected: policy is disabled");
|
||||
return false;
|
||||
|
||||
case "open":
|
||||
console.log("[voice-call] Inbound call accepted: policy is open");
|
||||
return true;
|
||||
|
||||
case "allowlist":
|
||||
case "pairing": {
|
||||
const normalized = from?.replace(/\D/g, "") || "";
|
||||
const allowed = (allowFrom || []).some((num) => {
|
||||
const normalizedAllow = num.replace(/\D/g, "");
|
||||
return normalized.endsWith(normalizedAllow) || normalizedAllow.endsWith(normalized);
|
||||
});
|
||||
const status = allowed ? "accepted" : "rejected";
|
||||
console.log(
|
||||
`[voice-call] Inbound call ${status}: ${from} ${allowed ? "is in" : "not in"} allowlist`,
|
||||
);
|
||||
return allowed;
|
||||
}
|
||||
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
function createInboundCall(params: {
|
||||
ctx: CallManagerContext;
|
||||
providerCallId: string;
|
||||
from: string;
|
||||
to: string;
|
||||
}): CallRecord {
|
||||
const callId = crypto.randomUUID();
|
||||
|
||||
const callRecord: CallRecord = {
|
||||
callId,
|
||||
providerCallId: params.providerCallId,
|
||||
provider: params.ctx.provider?.name || "twilio",
|
||||
direction: "inbound",
|
||||
state: "ringing",
|
||||
from: params.from,
|
||||
to: params.to,
|
||||
startedAt: Date.now(),
|
||||
transcript: [],
|
||||
processedEventIds: [],
|
||||
metadata: {
|
||||
initialMessage: params.ctx.config.inboundGreeting || "Hello! How can I help you today?",
|
||||
},
|
||||
};
|
||||
|
||||
params.ctx.activeCalls.set(callId, callRecord);
|
||||
params.ctx.providerCallIdMap.set(params.providerCallId, callId);
|
||||
persistCallRecord(params.ctx.storePath, callRecord);
|
||||
|
||||
console.log(`[voice-call] Created inbound call record: ${callId} from ${params.from}`);
|
||||
return callRecord;
|
||||
}
|
||||
|
||||
/**
 * Route one normalized provider webhook event into call state: dedupe by
 * event id, resolve (or create, for accepted inbound calls) the call record,
 * advance the state machine, and persist the updated record.
 */
export function processEvent(ctx: CallManagerContext, event: NormalizedEvent): void {
  // Idempotency: each event id is handled at most once, process-wide.
  if (ctx.processedEventIds.has(event.id)) return;
  ctx.processedEventIds.add(event.id);

  let call = findCall({
    activeCalls: ctx.activeCalls,
    providerCallIdMap: ctx.providerCallIdMap,
    callIdOrProviderCallId: event.callId,
  });

  // First event of an inbound call: apply policy, then create the record.
  if (!call && event.direction === "inbound" && event.providerCallId) {
    if (!shouldAcceptInbound(ctx.config, event.from)) {
      // TODO: Could hang up the call here.
      return;
    }

    call = createInboundCall({
      ctx,
      providerCallId: event.providerCallId,
      from: event.from || "unknown",
      to: event.to || ctx.config.fromNumber || "unknown",
    });

    // Normalize event to internal ID for downstream consumers.
    event.callId = call.callId;
  }

  if (!call) return;

  // NOTE(review): this only back-fills a missing providerCallId; unlike the
  // class-based manager it never re-maps a changed one — confirm intended.
  if (event.providerCallId && !call.providerCallId) {
    call.providerCallId = event.providerCallId;
    ctx.providerCallIdMap.set(event.providerCallId, call.callId);
  }

  // Track the event on the record itself (restored on crash recovery).
  call.processedEventIds.push(event.id);

  switch (event.type) {
    case "call.initiated":
      transitionState(call, "initiated");
      break;

    case "call.ringing":
      transitionState(call, "ringing");
      break;

    case "call.answered":
      call.answeredAt = event.timestamp;
      transitionState(call, "answered");
      // Arm the max-duration watchdog; on expiry the call is force-ended.
      startMaxDurationTimer({
        ctx,
        callId: call.callId,
        onTimeout: async (callId) => {
          await endCall(ctx, callId);
        },
      });
      break;

    case "call.active":
      transitionState(call, "active");
      break;

    case "call.speaking":
      transitionState(call, "speaking");
      break;

    case "call.speech":
      // Only final transcripts are recorded and delivered to waiters.
      if (event.isFinal) {
        addTranscriptEntry(call, "user", event.transcript);
        resolveTranscriptWaiter(ctx, call.callId, event.transcript);
      }
      transitionState(call, "listening");
      break;

    case "call.ended":
      call.endedAt = event.timestamp;
      call.endReason = event.reason;
      transitionState(call, event.reason as CallState);
      clearMaxDurationTimer(ctx, call.callId);
      rejectTranscriptWaiter(ctx, call.callId, `Call ended: ${event.reason}`);
      ctx.activeCalls.delete(call.callId);
      if (call.providerCallId) ctx.providerCallIdMap.delete(call.providerCallId);
      break;

    case "call.error":
      // Retryable errors leave the call live; non-retryable ones tear it down.
      if (!event.retryable) {
        call.endedAt = event.timestamp;
        call.endReason = "error";
        transitionState(call, "error");
        clearMaxDurationTimer(ctx, call.callId);
        rejectTranscriptWaiter(ctx, call.callId, `Call error: ${event.error}`);
        ctx.activeCalls.delete(call.callId);
        if (call.providerCallId) ctx.providerCallIdMap.delete(call.providerCallId);
      }
      break;
  }

  // Persist after every event, including the final record of an ended call.
  persistCallRecord(ctx.storePath, call);
}
|
||||
@@ -0,0 +1,33 @@
|
||||
import type { CallId, CallRecord } from "../types.js";
|
||||
|
||||
export function getCallByProviderCallId(params: {
|
||||
activeCalls: Map<CallId, CallRecord>;
|
||||
providerCallIdMap: Map<string, CallId>;
|
||||
providerCallId: string;
|
||||
}): CallRecord | undefined {
|
||||
const callId = params.providerCallIdMap.get(params.providerCallId);
|
||||
if (callId) {
|
||||
return params.activeCalls.get(callId);
|
||||
}
|
||||
|
||||
for (const call of params.activeCalls.values()) {
|
||||
if (call.providerCallId === params.providerCallId) {
|
||||
return call;
|
||||
}
|
||||
}
|
||||
return undefined;
|
||||
}
|
||||
|
||||
export function findCall(params: {
|
||||
activeCalls: Map<CallId, CallRecord>;
|
||||
providerCallIdMap: Map<string, CallId>;
|
||||
callIdOrProviderCallId: string;
|
||||
}): CallRecord | undefined {
|
||||
const directCall = params.activeCalls.get(params.callIdOrProviderCallId);
|
||||
if (directCall) return directCall;
|
||||
return getCallByProviderCallId({
|
||||
activeCalls: params.activeCalls,
|
||||
providerCallIdMap: params.providerCallIdMap,
|
||||
providerCallId: params.callIdOrProviderCallId,
|
||||
});
|
||||
}
|
||||
@@ -0,0 +1,248 @@
|
||||
import crypto from "node:crypto";
|
||||
|
||||
import { TerminalStates, type CallId, type CallRecord, type OutboundCallOptions } from "../types.js";
|
||||
import type { CallMode } from "../config.js";
|
||||
import { mapVoiceToPolly } from "../voice-mapping.js";
|
||||
import type { CallManagerContext } from "./context.js";
|
||||
import { getCallByProviderCallId } from "./lookup.js";
|
||||
import { generateNotifyTwiml } from "./twiml.js";
|
||||
import { addTranscriptEntry, transitionState } from "./state.js";
|
||||
import { persistCallRecord } from "./store.js";
|
||||
import { clearMaxDurationTimer, clearTranscriptWaiter, rejectTranscriptWaiter, waitForFinalTranscript } from "./timers.js";
|
||||
|
||||
/**
 * Start an outbound call: validate provider/webhook/capacity/from-number,
 * create and persist the record, then ask the provider to dial. On provider
 * failure the record is marked "failed" and removed from the active maps.
 *
 * `options` may be a plain message string (legacy form) or an options
 * object; mode defaults to config.outbound.defaultMode.
 */
export async function initiateCall(
  ctx: CallManagerContext,
  to: string,
  sessionKey?: string,
  options?: OutboundCallOptions | string,
): Promise<{ callId: CallId; success: boolean; error?: string }> {
  // Legacy callers pass the message directly as a string.
  const opts: OutboundCallOptions =
    typeof options === "string" ? { message: options } : (options ?? {});
  const initialMessage = opts.message;
  const mode = opts.mode ?? ctx.config.outbound.defaultMode;

  if (!ctx.provider) {
    return { callId: "", success: false, error: "Provider not initialized" };
  }
  if (!ctx.webhookUrl) {
    return { callId: "", success: false, error: "Webhook URL not configured" };
  }

  // Capacity guard: refuse before touching the provider.
  if (ctx.activeCalls.size >= ctx.config.maxConcurrentCalls) {
    return {
      callId: "",
      success: false,
      error: `Maximum concurrent calls (${ctx.config.maxConcurrentCalls}) reached`,
    };
  }

  const callId = crypto.randomUUID();
  // The mock provider gets a synthetic from-number for local development.
  const from =
    ctx.config.fromNumber ||
    (ctx.provider?.name === "mock" ? "+15550000000" : undefined);
  if (!from) {
    return { callId: "", success: false, error: "fromNumber not configured" };
  }

  const callRecord: CallRecord = {
    callId,
    provider: ctx.provider.name,
    direction: "outbound",
    state: "initiated",
    from,
    to,
    sessionKey,
    startedAt: Date.now(),
    transcript: [],
    processedEventIds: [],
    metadata: {
      // Only store the message when one was provided.
      ...(initialMessage && { initialMessage }),
      mode,
    },
  };

  // Persist before dialing so a crash mid-dial still leaves a record.
  ctx.activeCalls.set(callId, callRecord);
  persistCallRecord(ctx.storePath, callRecord);

  try {
    // For notify mode with a message, use inline TwiML with <Say>.
    let inlineTwiml: string | undefined;
    if (mode === "notify" && initialMessage) {
      const pollyVoice = mapVoiceToPolly(ctx.config.tts?.openai?.voice);
      inlineTwiml = generateNotifyTwiml(initialMessage, pollyVoice);
      console.log(`[voice-call] Using inline TwiML for notify mode (voice: ${pollyVoice})`);
    }

    const result = await ctx.provider.initiateCall({
      callId,
      from,
      to,
      webhookUrl: ctx.webhookUrl,
      inlineTwiml,
    });

    // Record and index the provider's ID so webhook events can be routed.
    callRecord.providerCallId = result.providerCallId;
    ctx.providerCallIdMap.set(result.providerCallId, callId);
    persistCallRecord(ctx.storePath, callRecord);

    return { callId, success: true };
  } catch (err) {
    // Dial failed: persist the terminal record, then drop it from the maps.
    callRecord.state = "failed";
    callRecord.endedAt = Date.now();
    callRecord.endReason = "failed";
    persistCallRecord(ctx.storePath, callRecord);
    ctx.activeCalls.delete(callId);
    if (callRecord.providerCallId) {
      ctx.providerCallIdMap.delete(callRecord.providerCallId);
    }

    return {
      callId,
      success: false,
      error: err instanceof Error ? err.message : String(err),
    };
  }
}
|
||||
|
||||
export async function speak(
|
||||
ctx: CallManagerContext,
|
||||
callId: CallId,
|
||||
text: string,
|
||||
): Promise<{ success: boolean; error?: string }> {
|
||||
const call = ctx.activeCalls.get(callId);
|
||||
if (!call) return { success: false, error: "Call not found" };
|
||||
if (!ctx.provider || !call.providerCallId) return { success: false, error: "Call not connected" };
|
||||
if (TerminalStates.has(call.state)) return { success: false, error: "Call has ended" };
|
||||
|
||||
try {
|
||||
transitionState(call, "speaking");
|
||||
persistCallRecord(ctx.storePath, call);
|
||||
|
||||
addTranscriptEntry(call, "bot", text);
|
||||
|
||||
const voice =
|
||||
ctx.provider?.name === "twilio" ? ctx.config.tts?.openai?.voice : undefined;
|
||||
await ctx.provider.playTts({
|
||||
callId,
|
||||
providerCallId: call.providerCallId,
|
||||
text,
|
||||
voice,
|
||||
});
|
||||
|
||||
return { success: true };
|
||||
} catch (err) {
|
||||
return { success: false, error: err instanceof Error ? err.message : String(err) };
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Speak the queued initial message for a call (invoked when the provider's
 * media stream connects). The message is removed from metadata before it is
 * spoken so it plays at most once; in notify mode, a hangup is scheduled
 * config.outbound.notifyHangupDelaySec after the message is delivered.
 */
export async function speakInitialMessage(
  ctx: CallManagerContext,
  providerCallId: string,
): Promise<void> {
  const call = getCallByProviderCallId({
    activeCalls: ctx.activeCalls,
    providerCallIdMap: ctx.providerCallIdMap,
    providerCallId,
  });
  if (!call) {
    console.warn(`[voice-call] speakInitialMessage: no call found for ${providerCallId}`);
    return;
  }

  // Mode defaults to "conversation" when the record predates the mode field.
  const initialMessage = call.metadata?.initialMessage as string | undefined;
  const mode = (call.metadata?.mode as CallMode) ?? "conversation";

  if (!initialMessage) {
    console.log(`[voice-call] speakInitialMessage: no initial message for ${call.callId}`);
    return;
  }

  // Clear so we don't speak it again if the provider reconnects.
  if (call.metadata) {
    delete call.metadata.initialMessage;
    persistCallRecord(ctx.storePath, call);
  }

  console.log(`[voice-call] Speaking initial message for call ${call.callId} (mode: ${mode})`);
  const result = await speak(ctx, call.callId, initialMessage);
  if (!result.success) {
    console.warn(`[voice-call] Failed to speak initial message: ${result.error}`);
    return;
  }

  // Notify mode: schedule an automatic hangup once the message has played.
  if (mode === "notify") {
    const delaySec = ctx.config.outbound.notifyHangupDelaySec;
    console.log(`[voice-call] Notify mode: auto-hangup in ${delaySec}s for call ${call.callId}`);
    setTimeout(async () => {
      // Re-check liveness at fire time; the caller may have hung up already.
      const currentCall = ctx.activeCalls.get(call.callId);
      if (currentCall && !TerminalStates.has(currentCall.state)) {
        console.log(`[voice-call] Notify mode: hanging up call ${call.callId}`);
        await endCall(ctx, call.callId);
      }
    }, delaySec * 1000);
  }
}
|
||||
|
||||
export async function continueCall(
|
||||
ctx: CallManagerContext,
|
||||
callId: CallId,
|
||||
prompt: string,
|
||||
): Promise<{ success: boolean; transcript?: string; error?: string }> {
|
||||
const call = ctx.activeCalls.get(callId);
|
||||
if (!call) return { success: false, error: "Call not found" };
|
||||
if (!ctx.provider || !call.providerCallId) return { success: false, error: "Call not connected" };
|
||||
if (TerminalStates.has(call.state)) return { success: false, error: "Call has ended" };
|
||||
|
||||
try {
|
||||
await speak(ctx, callId, prompt);
|
||||
|
||||
transitionState(call, "listening");
|
||||
persistCallRecord(ctx.storePath, call);
|
||||
|
||||
await ctx.provider.startListening({ callId, providerCallId: call.providerCallId });
|
||||
|
||||
const transcript = await waitForFinalTranscript(ctx, callId);
|
||||
|
||||
// Best-effort: stop listening after final transcript.
|
||||
await ctx.provider.stopListening({ callId, providerCallId: call.providerCallId });
|
||||
|
||||
return { success: true, transcript };
|
||||
} catch (err) {
|
||||
return { success: false, error: err instanceof Error ? err.message : String(err) };
|
||||
} finally {
|
||||
clearTranscriptWaiter(ctx, callId);
|
||||
}
|
||||
}
|
||||
|
||||
export async function endCall(
|
||||
ctx: CallManagerContext,
|
||||
callId: CallId,
|
||||
): Promise<{ success: boolean; error?: string }> {
|
||||
const call = ctx.activeCalls.get(callId);
|
||||
if (!call) return { success: false, error: "Call not found" };
|
||||
if (!ctx.provider || !call.providerCallId) return { success: false, error: "Call not connected" };
|
||||
if (TerminalStates.has(call.state)) return { success: true };
|
||||
|
||||
try {
|
||||
await ctx.provider.hangupCall({
|
||||
callId,
|
||||
providerCallId: call.providerCallId,
|
||||
reason: "hangup-bot",
|
||||
});
|
||||
|
||||
call.state = "hangup-bot";
|
||||
call.endedAt = Date.now();
|
||||
call.endReason = "hangup-bot";
|
||||
persistCallRecord(ctx.storePath, call);
|
||||
|
||||
clearMaxDurationTimer(ctx, callId);
|
||||
rejectTranscriptWaiter(ctx, callId, "Call ended: hangup-bot");
|
||||
|
||||
ctx.activeCalls.delete(callId);
|
||||
if (call.providerCallId) ctx.providerCallIdMap.delete(call.providerCallId);
|
||||
|
||||
return { success: true };
|
||||
} catch (err) {
|
||||
return { success: false, error: err instanceof Error ? err.message : String(err) };
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,50 @@
|
||||
import { TerminalStates, type CallRecord, type CallState, type TranscriptEntry } from "../types.js";
|
||||
|
||||
const ConversationStates = new Set<CallState>(["speaking", "listening"]);
|
||||
|
||||
const StateOrder: readonly CallState[] = [
|
||||
"initiated",
|
||||
"ringing",
|
||||
"answered",
|
||||
"active",
|
||||
"speaking",
|
||||
"listening",
|
||||
];
|
||||
|
||||
export function transitionState(call: CallRecord, newState: CallState): void {
|
||||
// No-op for same state or already terminal.
|
||||
if (call.state === newState || TerminalStates.has(call.state)) return;
|
||||
|
||||
// Terminal states can always be reached from non-terminal.
|
||||
if (TerminalStates.has(newState)) {
|
||||
call.state = newState;
|
||||
return;
|
||||
}
|
||||
|
||||
// Allow cycling between speaking and listening (multi-turn conversations).
|
||||
if (ConversationStates.has(call.state) && ConversationStates.has(newState)) {
|
||||
call.state = newState;
|
||||
return;
|
||||
}
|
||||
|
||||
// Only allow forward transitions in state order.
|
||||
const currentIndex = StateOrder.indexOf(call.state);
|
||||
const newIndex = StateOrder.indexOf(newState);
|
||||
if (newIndex > currentIndex) {
|
||||
call.state = newState;
|
||||
}
|
||||
}
|
||||
|
||||
export function addTranscriptEntry(
|
||||
call: CallRecord,
|
||||
speaker: "bot" | "user",
|
||||
text: string,
|
||||
): void {
|
||||
const entry: TranscriptEntry = {
|
||||
timestamp: Date.now(),
|
||||
speaker,
|
||||
text,
|
||||
isFinal: true,
|
||||
};
|
||||
call.transcript.push(entry);
|
||||
}
|
||||
@@ -0,0 +1,88 @@
|
||||
import fs from "node:fs";
|
||||
import fsp from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
|
||||
import { CallRecordSchema, TerminalStates, type CallId, type CallRecord } from "../types.js";
|
||||
|
||||
export function persistCallRecord(storePath: string, call: CallRecord): void {
|
||||
const logPath = path.join(storePath, "calls.jsonl");
|
||||
const line = `${JSON.stringify(call)}\n`;
|
||||
// Fire-and-forget async write to avoid blocking event loop.
|
||||
fsp.appendFile(logPath, line).catch((err) => {
|
||||
console.error("[voice-call] Failed to persist call record:", err);
|
||||
});
|
||||
}
|
||||
|
||||
export function loadActiveCallsFromStore(storePath: string): {
|
||||
activeCalls: Map<CallId, CallRecord>;
|
||||
providerCallIdMap: Map<string, CallId>;
|
||||
processedEventIds: Set<string>;
|
||||
} {
|
||||
const logPath = path.join(storePath, "calls.jsonl");
|
||||
if (!fs.existsSync(logPath)) {
|
||||
return {
|
||||
activeCalls: new Map(),
|
||||
providerCallIdMap: new Map(),
|
||||
processedEventIds: new Set(),
|
||||
};
|
||||
}
|
||||
|
||||
const content = fs.readFileSync(logPath, "utf-8");
|
||||
const lines = content.split("\n");
|
||||
|
||||
const callMap = new Map<CallId, CallRecord>();
|
||||
for (const line of lines) {
|
||||
if (!line.trim()) continue;
|
||||
try {
|
||||
const call = CallRecordSchema.parse(JSON.parse(line));
|
||||
callMap.set(call.callId, call);
|
||||
} catch {
|
||||
// Skip invalid lines.
|
||||
}
|
||||
}
|
||||
|
||||
const activeCalls = new Map<CallId, CallRecord>();
|
||||
const providerCallIdMap = new Map<string, CallId>();
|
||||
const processedEventIds = new Set<string>();
|
||||
|
||||
for (const [callId, call] of callMap) {
|
||||
if (TerminalStates.has(call.state)) continue;
|
||||
activeCalls.set(callId, call);
|
||||
if (call.providerCallId) {
|
||||
providerCallIdMap.set(call.providerCallId, callId);
|
||||
}
|
||||
for (const eventId of call.processedEventIds) {
|
||||
processedEventIds.add(eventId);
|
||||
}
|
||||
}
|
||||
|
||||
return { activeCalls, providerCallIdMap, processedEventIds };
|
||||
}
|
||||
|
||||
export async function getCallHistoryFromStore(
|
||||
storePath: string,
|
||||
limit = 50,
|
||||
): Promise<CallRecord[]> {
|
||||
const logPath = path.join(storePath, "calls.jsonl");
|
||||
|
||||
try {
|
||||
await fsp.access(logPath);
|
||||
} catch {
|
||||
return [];
|
||||
}
|
||||
|
||||
const content = await fsp.readFile(logPath, "utf-8");
|
||||
const lines = content.trim().split("\n").filter(Boolean);
|
||||
const calls: CallRecord[] = [];
|
||||
|
||||
for (const line of lines.slice(-limit)) {
|
||||
try {
|
||||
const parsed = CallRecordSchema.parse(JSON.parse(line));
|
||||
calls.push(parsed);
|
||||
} catch {
|
||||
// Skip invalid lines.
|
||||
}
|
||||
}
|
||||
|
||||
return calls;
|
||||
}
|
||||
@@ -0,0 +1,86 @@
|
||||
import { TerminalStates, type CallId } from "../types.js";
|
||||
import type { CallManagerContext } from "./context.js";
|
||||
import { persistCallRecord } from "./store.js";
|
||||
|
||||
export function clearMaxDurationTimer(ctx: CallManagerContext, callId: CallId): void {
|
||||
const timer = ctx.maxDurationTimers.get(callId);
|
||||
if (timer) {
|
||||
clearTimeout(timer);
|
||||
ctx.maxDurationTimers.delete(callId);
|
||||
}
|
||||
}
|
||||
|
||||
export function startMaxDurationTimer(params: {
|
||||
ctx: CallManagerContext;
|
||||
callId: CallId;
|
||||
onTimeout: (callId: CallId) => Promise<void>;
|
||||
}): void {
|
||||
clearMaxDurationTimer(params.ctx, params.callId);
|
||||
|
||||
const maxDurationMs = params.ctx.config.maxDurationSeconds * 1000;
|
||||
console.log(
|
||||
`[voice-call] Starting max duration timer (${params.ctx.config.maxDurationSeconds}s) for call ${params.callId}`,
|
||||
);
|
||||
|
||||
const timer = setTimeout(async () => {
|
||||
params.ctx.maxDurationTimers.delete(params.callId);
|
||||
const call = params.ctx.activeCalls.get(params.callId);
|
||||
if (call && !TerminalStates.has(call.state)) {
|
||||
console.log(
|
||||
`[voice-call] Max duration reached (${params.ctx.config.maxDurationSeconds}s), ending call ${params.callId}`,
|
||||
);
|
||||
call.endReason = "timeout";
|
||||
persistCallRecord(params.ctx.storePath, call);
|
||||
await params.onTimeout(params.callId);
|
||||
}
|
||||
}, maxDurationMs);
|
||||
|
||||
params.ctx.maxDurationTimers.set(params.callId, timer);
|
||||
}
|
||||
|
||||
export function clearTranscriptWaiter(ctx: CallManagerContext, callId: CallId): void {
|
||||
const waiter = ctx.transcriptWaiters.get(callId);
|
||||
if (!waiter) return;
|
||||
clearTimeout(waiter.timeout);
|
||||
ctx.transcriptWaiters.delete(callId);
|
||||
}
|
||||
|
||||
export function rejectTranscriptWaiter(
|
||||
ctx: CallManagerContext,
|
||||
callId: CallId,
|
||||
reason: string,
|
||||
): void {
|
||||
const waiter = ctx.transcriptWaiters.get(callId);
|
||||
if (!waiter) return;
|
||||
clearTranscriptWaiter(ctx, callId);
|
||||
waiter.reject(new Error(reason));
|
||||
}
|
||||
|
||||
export function resolveTranscriptWaiter(
|
||||
ctx: CallManagerContext,
|
||||
callId: CallId,
|
||||
transcript: string,
|
||||
): void {
|
||||
const waiter = ctx.transcriptWaiters.get(callId);
|
||||
if (!waiter) return;
|
||||
clearTranscriptWaiter(ctx, callId);
|
||||
waiter.resolve(transcript);
|
||||
}
|
||||
|
||||
export function waitForFinalTranscript(
|
||||
ctx: CallManagerContext,
|
||||
callId: CallId,
|
||||
): Promise<string> {
|
||||
// Only allow one in-flight waiter per call.
|
||||
rejectTranscriptWaiter(ctx, callId, "Transcript waiter replaced");
|
||||
|
||||
const timeoutMs = ctx.config.transcriptTimeoutMs;
|
||||
return new Promise((resolve, reject) => {
|
||||
const timeout = setTimeout(() => {
|
||||
ctx.transcriptWaiters.delete(callId);
|
||||
reject(new Error(`Timed out waiting for transcript after ${timeoutMs}ms`));
|
||||
}, timeoutMs);
|
||||
|
||||
ctx.transcriptWaiters.set(callId, { resolve, reject, timeout });
|
||||
});
|
||||
}
|
||||
@@ -0,0 +1,9 @@
|
||||
import { escapeXml } from "../voice-mapping.js";
|
||||
|
||||
export function generateNotifyTwiml(message: string, voice: string): string {
|
||||
return `<?xml version="1.0" encoding="UTF-8"?>
|
||||
<Response>
|
||||
<Say voice="${voice}">${escapeXml(message)}</Say>
|
||||
<Hangup/>
|
||||
</Response>`;
|
||||
}
|
||||
@@ -0,0 +1,97 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
|
||||
import type {
|
||||
OpenAIRealtimeSTTProvider,
|
||||
RealtimeSTTSession,
|
||||
} from "./providers/stt-openai-realtime.js";
|
||||
import { MediaStreamHandler } from "./media-stream.js";
|
||||
|
||||
const createStubSession = (): RealtimeSTTSession => ({
|
||||
connect: async () => {},
|
||||
sendAudio: () => {},
|
||||
waitForTranscript: async () => "",
|
||||
onPartial: () => {},
|
||||
onTranscript: () => {},
|
||||
onSpeechStart: () => {},
|
||||
close: () => {},
|
||||
isConnected: () => true,
|
||||
});
|
||||
|
||||
const createStubSttProvider = (): OpenAIRealtimeSTTProvider =>
|
||||
({
|
||||
createSession: () => createStubSession(),
|
||||
}) as unknown as OpenAIRealtimeSTTProvider;
|
||||
|
||||
const flush = async (): Promise<void> => {
|
||||
await new Promise((resolve) => setTimeout(resolve, 0));
|
||||
};
|
||||
|
||||
const waitForAbort = (signal: AbortSignal): Promise<void> =>
|
||||
new Promise((resolve) => {
|
||||
if (signal.aborted) {
|
||||
resolve();
|
||||
return;
|
||||
}
|
||||
signal.addEventListener("abort", () => resolve(), { once: true });
|
||||
});
|
||||
|
||||
// Unit tests for MediaStreamHandler's per-stream TTS queue: playback must be
// strictly serialized per stream, and barge-in (clearTtsQueue) must abort the
// active item and drop anything still queued.
describe("MediaStreamHandler TTS queue", () => {
  it("serializes TTS playback and resolves in order", async () => {
    const handler = new MediaStreamHandler({
      sttProvider: createStubSttProvider(),
    });
    // Track which queued jobs have started/finished to verify ordering.
    const started: number[] = [];
    const finished: number[] = [];

    // Gate keeps job 1 "playing" until we explicitly release it.
    let resolveFirst!: () => void;
    const firstGate = new Promise<void>((resolve) => {
      resolveFirst = resolve;
    });

    const first = handler.queueTts("stream-1", async () => {
      started.push(1);
      await firstGate;
      finished.push(1);
    });
    const second = handler.queueTts("stream-1", async () => {
      started.push(2);
      finished.push(2);
    });

    // While job 1 is blocked on the gate, job 2 must not have started.
    await flush();
    expect(started).toEqual([1]);

    resolveFirst();
    await first;
    await second;

    // Both jobs ran to completion, strictly in submission order.
    expect(started).toEqual([1, 2]);
    expect(finished).toEqual([1, 2]);
  });

  it("cancels active playback and clears queued items", async () => {
    const handler = new MediaStreamHandler({
      sttProvider: createStubSttProvider(),
    });

    let queuedRan = false;
    const started: string[] = [];

    // The active job blocks until its abort signal fires, simulating a
    // playback implementation that honors cancellation.
    const active = handler.queueTts("stream-1", async (signal) => {
      started.push("active");
      await waitForAbort(signal);
    });
    void handler.queueTts("stream-1", async () => {
      queuedRan = true;
    });

    await flush();
    expect(started).toEqual(["active"]);

    // Barge-in: abort the active item and discard the queued one.
    handler.clearTtsQueue("stream-1");
    await active;
    await flush();

    // The queued item must never have executed.
    expect(queuedRan).toBe(false);
  });
});
|
||||
@@ -0,0 +1,393 @@
|
||||
/**
|
||||
* Media Stream Handler
|
||||
*
|
||||
* Handles bidirectional audio streaming between Twilio and the AI services.
|
||||
* - Receives mu-law audio from Twilio via WebSocket
|
||||
* - Forwards to OpenAI Realtime STT for transcription
|
||||
* - Sends TTS audio back to Twilio
|
||||
*/
|
||||
|
||||
import type { IncomingMessage } from "node:http";
|
||||
import type { Duplex } from "node:stream";
|
||||
|
||||
import { WebSocket, WebSocketServer } from "ws";
|
||||
|
||||
import type {
|
||||
OpenAIRealtimeSTTProvider,
|
||||
RealtimeSTTSession,
|
||||
} from "./providers/stt-openai-realtime.js";
|
||||
|
||||
/**
 * Configuration for the media stream handler.
 *
 * All callbacks are optional. The `callId` passed to each callback is the
 * Twilio CallSid taken from the stream's "start" frame.
 */
export interface MediaStreamConfig {
  /** STT provider used to create one transcription session per stream */
  sttProvider: OpenAIRealtimeSTTProvider;
  /** Callback when a final transcript is received */
  onTranscript?: (callId: string, transcript: string) => void;
  /** Callback for partial transcripts (streaming UI) */
  onPartialTranscript?: (callId: string, partial: string) => void;
  /** Callback when a stream connects (fired before the STT session connects) */
  onConnect?: (callId: string, streamSid: string) => void;
  /** Callback when speech starts (used for barge-in) */
  onSpeechStart?: (callId: string) => void;
  /** Callback when stream disconnects (stop frame or socket close) */
  onDisconnect?: (callId: string) => void;
}
|
||||
|
||||
/**
 * Active media stream session.
 *
 * Pairs the Twilio WebSocket carrying the call's audio with the realtime STT
 * session transcribing it.
 */
interface StreamSession {
  /** Twilio CallSid of the call this stream belongs to */
  callId: string;
  /** Twilio StreamSid uniquely identifying this media stream */
  streamSid: string;
  /** WebSocket connection to Twilio for this stream */
  ws: WebSocket;
  /** Realtime STT session receiving this stream's audio */
  sttSession: RealtimeSTTSession;
}
|
||||
|
||||
/** One queued TTS playback operation awaiting its turn on a stream. */
type TtsQueueEntry = {
  /** Performs the playback; must observe `signal` to support barge-in */
  playFn: (signal: AbortSignal) => Promise<void>;
  /** Aborted to cancel this entry's playback */
  controller: AbortController;
  /** Settles the promise returned by queueTts on success/cancellation */
  resolve: () => void;
  /** Settles the promise returned by queueTts on playback failure */
  reject: (error: unknown) => void;
};
|
||||
|
||||
/**
|
||||
* Manages WebSocket connections for Twilio media streams.
|
||||
*/
|
||||
export class MediaStreamHandler {
|
||||
private wss: WebSocketServer | null = null;
|
||||
private sessions = new Map<string, StreamSession>();
|
||||
private config: MediaStreamConfig;
|
||||
/** TTS playback queues per stream (serialize audio to prevent overlap) */
|
||||
private ttsQueues = new Map<string, TtsQueueEntry[]>();
|
||||
/** Whether TTS is currently playing per stream */
|
||||
private ttsPlaying = new Map<string, boolean>();
|
||||
/** Active TTS playback controllers per stream */
|
||||
private ttsActiveControllers = new Map<string, AbortController>();
|
||||
|
||||
constructor(config: MediaStreamConfig) {
|
||||
this.config = config;
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle WebSocket upgrade for media stream connections.
|
||||
*/
|
||||
handleUpgrade(request: IncomingMessage, socket: Duplex, head: Buffer): void {
|
||||
if (!this.wss) {
|
||||
this.wss = new WebSocketServer({ noServer: true });
|
||||
this.wss.on("connection", (ws, req) => this.handleConnection(ws, req));
|
||||
}
|
||||
|
||||
this.wss.handleUpgrade(request, socket, head, (ws) => {
|
||||
this.wss?.emit("connection", ws, request);
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle new WebSocket connection from Twilio.
|
||||
*/
|
||||
private async handleConnection(
|
||||
ws: WebSocket,
|
||||
_request: IncomingMessage,
|
||||
): Promise<void> {
|
||||
let session: StreamSession | null = null;
|
||||
|
||||
ws.on("message", async (data: Buffer) => {
|
||||
try {
|
||||
const message = JSON.parse(data.toString()) as TwilioMediaMessage;
|
||||
|
||||
switch (message.event) {
|
||||
case "connected":
|
||||
console.log("[MediaStream] Twilio connected");
|
||||
break;
|
||||
|
||||
case "start":
|
||||
session = await this.handleStart(ws, message);
|
||||
break;
|
||||
|
||||
case "media":
|
||||
if (session && message.media?.payload) {
|
||||
// Forward audio to STT
|
||||
const audioBuffer = Buffer.from(message.media.payload, "base64");
|
||||
session.sttSession.sendAudio(audioBuffer);
|
||||
}
|
||||
break;
|
||||
|
||||
case "stop":
|
||||
if (session) {
|
||||
this.handleStop(session);
|
||||
session = null;
|
||||
}
|
||||
break;
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("[MediaStream] Error processing message:", error);
|
||||
}
|
||||
});
|
||||
|
||||
ws.on("close", () => {
|
||||
if (session) {
|
||||
this.handleStop(session);
|
||||
}
|
||||
});
|
||||
|
||||
ws.on("error", (error) => {
|
||||
console.error("[MediaStream] WebSocket error:", error);
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle stream start event.
|
||||
*/
|
||||
private async handleStart(
|
||||
ws: WebSocket,
|
||||
message: TwilioMediaMessage,
|
||||
): Promise<StreamSession> {
|
||||
const streamSid = message.streamSid || "";
|
||||
const callSid = message.start?.callSid || "";
|
||||
|
||||
console.log(
|
||||
`[MediaStream] Stream started: ${streamSid} (call: ${callSid})`,
|
||||
);
|
||||
|
||||
// Create STT session
|
||||
const sttSession = this.config.sttProvider.createSession();
|
||||
|
||||
// Set up transcript callbacks
|
||||
sttSession.onPartial((partial) => {
|
||||
this.config.onPartialTranscript?.(callSid, partial);
|
||||
});
|
||||
|
||||
sttSession.onTranscript((transcript) => {
|
||||
this.config.onTranscript?.(callSid, transcript);
|
||||
});
|
||||
|
||||
sttSession.onSpeechStart(() => {
|
||||
this.config.onSpeechStart?.(callSid);
|
||||
});
|
||||
|
||||
const session: StreamSession = {
|
||||
callId: callSid,
|
||||
streamSid,
|
||||
ws,
|
||||
sttSession,
|
||||
};
|
||||
|
||||
this.sessions.set(streamSid, session);
|
||||
|
||||
// Notify connection BEFORE STT connect so TTS can work even if STT fails
|
||||
this.config.onConnect?.(callSid, streamSid);
|
||||
|
||||
// Connect to OpenAI STT (non-blocking, log errors but don't fail the call)
|
||||
sttSession.connect().catch((err) => {
|
||||
console.warn(
|
||||
`[MediaStream] STT connection failed (TTS still works):`,
|
||||
err.message,
|
||||
);
|
||||
});
|
||||
|
||||
return session;
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle stream stop event.
|
||||
*/
|
||||
private handleStop(session: StreamSession): void {
|
||||
console.log(`[MediaStream] Stream stopped: ${session.streamSid}`);
|
||||
|
||||
this.clearTtsState(session.streamSid);
|
||||
session.sttSession.close();
|
||||
this.sessions.delete(session.streamSid);
|
||||
this.config.onDisconnect?.(session.callId);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get an active session with an open WebSocket, or undefined if unavailable.
|
||||
*/
|
||||
private getOpenSession(streamSid: string): StreamSession | undefined {
|
||||
const session = this.sessions.get(streamSid);
|
||||
return session?.ws.readyState === WebSocket.OPEN ? session : undefined;
|
||||
}
|
||||
|
||||
/**
|
||||
* Send a message to a stream's WebSocket if available.
|
||||
*/
|
||||
private sendToStream(streamSid: string, message: unknown): void {
|
||||
const session = this.getOpenSession(streamSid);
|
||||
session?.ws.send(JSON.stringify(message));
|
||||
}
|
||||
|
||||
/**
|
||||
* Send audio to a specific stream (for TTS playback).
|
||||
* Audio should be mu-law encoded at 8kHz mono.
|
||||
*/
|
||||
sendAudio(streamSid: string, muLawAudio: Buffer): void {
|
||||
this.sendToStream(streamSid, {
|
||||
event: "media",
|
||||
streamSid,
|
||||
media: { payload: muLawAudio.toString("base64") },
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Send a mark event to track audio playback position.
|
||||
*/
|
||||
sendMark(streamSid: string, name: string): void {
|
||||
this.sendToStream(streamSid, {
|
||||
event: "mark",
|
||||
streamSid,
|
||||
mark: { name },
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Clear audio buffer (interrupt playback).
|
||||
*/
|
||||
clearAudio(streamSid: string): void {
|
||||
this.sendToStream(streamSid, { event: "clear", streamSid });
|
||||
}
|
||||
|
||||
/**
|
||||
* Queue a TTS operation for sequential playback.
|
||||
* Only one TTS operation plays at a time per stream to prevent overlap.
|
||||
*/
|
||||
async queueTts(
|
||||
streamSid: string,
|
||||
playFn: (signal: AbortSignal) => Promise<void>,
|
||||
): Promise<void> {
|
||||
const queue = this.getTtsQueue(streamSid);
|
||||
let resolveEntry: () => void;
|
||||
let rejectEntry: (error: unknown) => void;
|
||||
const promise = new Promise<void>((resolve, reject) => {
|
||||
resolveEntry = resolve;
|
||||
rejectEntry = reject;
|
||||
});
|
||||
|
||||
queue.push({
|
||||
playFn,
|
||||
controller: new AbortController(),
|
||||
resolve: resolveEntry!,
|
||||
reject: rejectEntry!,
|
||||
});
|
||||
|
||||
if (!this.ttsPlaying.get(streamSid)) {
|
||||
void this.processQueue(streamSid);
|
||||
}
|
||||
|
||||
return promise;
|
||||
}
|
||||
|
||||
/**
|
||||
* Clear TTS queue and interrupt current playback (barge-in).
|
||||
*/
|
||||
clearTtsQueue(streamSid: string): void {
|
||||
const queue = this.getTtsQueue(streamSid);
|
||||
queue.length = 0;
|
||||
this.ttsActiveControllers.get(streamSid)?.abort();
|
||||
this.clearAudio(streamSid);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get active session by call ID.
|
||||
*/
|
||||
getSessionByCallId(callId: string): StreamSession | undefined {
|
||||
return [...this.sessions.values()].find(
|
||||
(session) => session.callId === callId,
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Close all sessions.
|
||||
*/
|
||||
closeAll(): void {
|
||||
for (const session of this.sessions.values()) {
|
||||
this.clearTtsState(session.streamSid);
|
||||
session.sttSession.close();
|
||||
session.ws.close();
|
||||
}
|
||||
this.sessions.clear();
|
||||
}
|
||||
|
||||
private getTtsQueue(streamSid: string): TtsQueueEntry[] {
|
||||
const existing = this.ttsQueues.get(streamSid);
|
||||
if (existing) return existing;
|
||||
const queue: TtsQueueEntry[] = [];
|
||||
this.ttsQueues.set(streamSid, queue);
|
||||
return queue;
|
||||
}
|
||||
|
||||
/**
|
||||
* Process the TTS queue for a stream.
|
||||
* Uses iterative approach to avoid stack accumulation from recursion.
|
||||
*/
|
||||
private async processQueue(streamSid: string): Promise<void> {
|
||||
this.ttsPlaying.set(streamSid, true);
|
||||
|
||||
while (true) {
|
||||
const queue = this.ttsQueues.get(streamSid);
|
||||
if (!queue || queue.length === 0) {
|
||||
this.ttsPlaying.set(streamSid, false);
|
||||
this.ttsActiveControllers.delete(streamSid);
|
||||
return;
|
||||
}
|
||||
|
||||
const entry = queue.shift()!;
|
||||
this.ttsActiveControllers.set(streamSid, entry.controller);
|
||||
|
||||
try {
|
||||
await entry.playFn(entry.controller.signal);
|
||||
entry.resolve();
|
||||
} catch (error) {
|
||||
if (entry.controller.signal.aborted) {
|
||||
entry.resolve();
|
||||
} else {
|
||||
console.error("[MediaStream] TTS playback error:", error);
|
||||
entry.reject(error);
|
||||
}
|
||||
} finally {
|
||||
if (this.ttsActiveControllers.get(streamSid) === entry.controller) {
|
||||
this.ttsActiveControllers.delete(streamSid);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private clearTtsState(streamSid: string): void {
|
||||
const queue = this.ttsQueues.get(streamSid);
|
||||
if (queue) queue.length = 0;
|
||||
this.ttsActiveControllers.get(streamSid)?.abort();
|
||||
this.ttsActiveControllers.delete(streamSid);
|
||||
this.ttsPlaying.delete(streamSid);
|
||||
this.ttsQueues.delete(streamSid);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Twilio Media Stream message format.
 *
 * Shape of the JSON frames exchanged over the media WebSocket; which optional
 * sections are present depends on `event` (see handleConnection's switch).
 */
interface TwilioMediaMessage {
  /** Frame type; only connected/start/media/stop are consumed here */
  event: "connected" | "start" | "media" | "stop" | "mark" | "clear";
  sequenceNumber?: string;
  /** Identifies which media stream this frame belongs to */
  streamSid?: string;
  /** Present on "start" frames: call identity and audio format */
  start?: {
    streamSid: string;
    accountSid: string;
    callSid: string;
    tracks: string[];
    mediaFormat: {
      encoding: string;
      sampleRate: number;
      channels: number;
    };
  };
  /** Present on "media" frames: payload is base64-encoded audio */
  media?: {
    track?: string;
    chunk?: string;
    timestamp?: string;
    payload?: string;
  };
  /** Present on "mark" frames: echo of a previously sent mark name */
  mark?: {
    name: string;
  };
}
|
||||
@@ -0,0 +1,67 @@
|
||||
import type {
|
||||
HangupCallInput,
|
||||
InitiateCallInput,
|
||||
InitiateCallResult,
|
||||
PlayTtsInput,
|
||||
ProviderName,
|
||||
ProviderWebhookParseResult,
|
||||
StartListeningInput,
|
||||
StopListeningInput,
|
||||
WebhookContext,
|
||||
WebhookVerificationResult,
|
||||
} from "../types.js";
|
||||
|
||||
/**
 * Abstract base interface for voice call providers.
 *
 * Each provider (Telnyx, Twilio, etc.) implements this interface to provide
 * a consistent API for the call manager.
 *
 * Responsibilities:
 * - Webhook verification and event parsing
 * - Outbound call initiation and hangup
 * - Media control (TTS playback, STT listening)
 */
export interface VoiceCallProvider {
  /** Provider identifier (e.g. "twilio", "telnyx", "mock") */
  readonly name: ProviderName;

  /**
   * Verify webhook signature/HMAC before processing.
   * Must be called before parseWebhookEvent.
   */
  verifyWebhook(ctx: WebhookContext): WebhookVerificationResult;

  /**
   * Parse provider-specific webhook payload into normalized events.
   * Returns events and optional response to send back to provider.
   * Only call this after verifyWebhook has reported success.
   */
  parseWebhookEvent(ctx: WebhookContext): ProviderWebhookParseResult;

  /**
   * Initiate an outbound call.
   * @returns Provider call ID and status
   */
  initiateCall(input: InitiateCallInput): Promise<InitiateCallResult>;

  /**
   * Hang up an active call.
   */
  hangupCall(input: HangupCallInput): Promise<void>;

  /**
   * Play TTS audio to the caller.
   * The provider should handle streaming if supported.
   */
  playTts(input: PlayTtsInput): Promise<void>;

  /**
   * Start listening for user speech (activate STT).
   */
  startListening(input: StartListeningInput): Promise<void>;

  /**
   * Stop listening for user speech (deactivate STT).
   */
  stopListening(input: StopListeningInput): Promise<void>;
}
|
||||
@@ -0,0 +1,10 @@
|
||||
export type { VoiceCallProvider } from "./base.js";
|
||||
export { MockProvider } from "./mock.js";
|
||||
export {
|
||||
OpenAIRealtimeSTTProvider,
|
||||
type RealtimeSTTConfig,
|
||||
type RealtimeSTTSession,
|
||||
} from "./stt-openai-realtime.js";
|
||||
export { TelnyxProvider } from "./telnyx.js";
|
||||
export { TwilioProvider } from "./twilio.js";
|
||||
export { PlivoProvider } from "./plivo.js";
|
||||
@@ -0,0 +1,168 @@
|
||||
import crypto from "node:crypto";
|
||||
|
||||
import type {
|
||||
EndReason,
|
||||
HangupCallInput,
|
||||
InitiateCallInput,
|
||||
InitiateCallResult,
|
||||
NormalizedEvent,
|
||||
PlayTtsInput,
|
||||
ProviderWebhookParseResult,
|
||||
StartListeningInput,
|
||||
StopListeningInput,
|
||||
WebhookContext,
|
||||
WebhookVerificationResult,
|
||||
} from "../types.js";
|
||||
import type { VoiceCallProvider } from "./base.js";
|
||||
|
||||
/**
|
||||
* Mock voice call provider for local testing.
|
||||
*
|
||||
* Events are driven via webhook POST with JSON body:
|
||||
* - { events: NormalizedEvent[] } for bulk events
|
||||
* - { event: NormalizedEvent } for single event
|
||||
*/
|
||||
export class MockProvider implements VoiceCallProvider {
|
||||
readonly name = "mock" as const;
|
||||
|
||||
verifyWebhook(_ctx: WebhookContext): WebhookVerificationResult {
|
||||
return { ok: true };
|
||||
}
|
||||
|
||||
parseWebhookEvent(ctx: WebhookContext): ProviderWebhookParseResult {
|
||||
try {
|
||||
const payload = JSON.parse(ctx.rawBody);
|
||||
const events: NormalizedEvent[] = [];
|
||||
|
||||
if (Array.isArray(payload.events)) {
|
||||
for (const evt of payload.events) {
|
||||
const normalized = this.normalizeEvent(evt);
|
||||
if (normalized) events.push(normalized);
|
||||
}
|
||||
} else if (payload.event) {
|
||||
const normalized = this.normalizeEvent(payload.event);
|
||||
if (normalized) events.push(normalized);
|
||||
}
|
||||
|
||||
return { events, statusCode: 200 };
|
||||
} catch {
|
||||
return { events: [], statusCode: 400 };
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Convert a loosely-typed incoming event into a fully-typed NormalizedEvent.
 *
 * Returns null when the event is missing its discriminant (`type`) or
 * `callId`, or when the type is not one this mock understands.
 * Missing optional fields are filled with sensible defaults (fresh UUID,
 * current timestamp, empty strings).
 */
private normalizeEvent(
  evt: Partial<NormalizedEvent>,
): NormalizedEvent | null {
  // Both the discriminant and the call correlation id are mandatory.
  if (!evt.type || !evt.callId) return null;

  // Fields shared by every event variant.
  const base = {
    id: evt.id || crypto.randomUUID(),
    callId: evt.callId,
    providerCallId: evt.providerCallId,
    timestamp: evt.timestamp || Date.now(),
  };

  switch (evt.type) {
    // Lifecycle events carry no extra payload.
    case "call.initiated":
    case "call.ringing":
    case "call.answered":
    case "call.active":
      return { ...base, type: evt.type };

    case "call.speaking": {
      // TTS playback notification; defaults to empty text.
      const payload = evt as Partial<NormalizedEvent & { text?: string }>;
      return {
        ...base,
        type: evt.type,
        text: payload.text || "",
      };
    }

    case "call.speech": {
      // Caller speech; a missing isFinal is treated as a final transcript.
      const payload = evt as Partial<
        NormalizedEvent & {
          transcript?: string;
          isFinal?: boolean;
          confidence?: number;
        }
      >;
      return {
        ...base,
        type: evt.type,
        transcript: payload.transcript || "",
        isFinal: payload.isFinal ?? true,
        confidence: payload.confidence,
      };
    }

    case "call.silence": {
      const payload = evt as Partial<
        NormalizedEvent & { durationMs?: number }
      >;
      return {
        ...base,
        type: evt.type,
        durationMs: payload.durationMs || 0,
      };
    }

    case "call.dtmf": {
      const payload = evt as Partial<NormalizedEvent & { digits?: string }>;
      return {
        ...base,
        type: evt.type,
        digits: payload.digits || "",
      };
    }

    case "call.ended": {
      // Unknown end reasons default to a normal completion.
      const payload = evt as Partial<
        NormalizedEvent & { reason?: EndReason }
      >;
      return {
        ...base,
        type: evt.type,
        reason: payload.reason || "completed",
      };
    }

    case "call.error": {
      const payload = evt as Partial<
        NormalizedEvent & { error?: string; retryable?: boolean }
      >;
      return {
        ...base,
        type: evt.type,
        error: payload.error || "unknown error",
        retryable: payload.retryable,
      };
    }

    // Unrecognized event types are dropped.
    default:
      return null;
  }
}
|
||||
|
||||
async initiateCall(input: InitiateCallInput): Promise<InitiateCallResult> {
|
||||
return {
|
||||
providerCallId: `mock-${input.callId}`,
|
||||
status: "initiated",
|
||||
};
|
||||
}
|
||||
|
||||
/** No-op: the mock has no real call leg to tear down. */
async hangupCall(_input: HangupCallInput): Promise<void> {
  // No-op for mock
}
|
||||
|
||||
/** No-op: the mock does not synthesize or play audio. */
async playTts(_input: PlayTtsInput): Promise<void> {
  // No-op for mock
}
|
||||
|
||||
/** No-op: the mock has no audio stream to listen to. */
async startListening(_input: StartListeningInput): Promise<void> {
  // No-op for mock
}
|
||||
|
||||
/** No-op: nothing was started, so there is nothing to stop. */
async stopListening(_input: StopListeningInput): Promise<void> {
  // No-op for mock
}
|
||||
}
|
||||
@@ -0,0 +1,28 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
|
||||
import { PlivoProvider } from "./plivo.js";
|
||||
|
||||
describe("PlivoProvider", () => {
  it("parses answer callback into call.answered and returns keep-alive XML", () => {
    const provider = new PlivoProvider({
      authId: "MA000000000000000000",
      authToken: "test-token",
    });

    // Simulate Plivo's answer_url POST: form-encoded body with
    // CallStatus=in-progress and the callId round-tripped via the query string.
    const result = provider.parseWebhookEvent({
      headers: { host: "example.com" },
      rawBody:
        "CallUUID=call-uuid&CallStatus=in-progress&Direction=outbound&From=%2B15550000000&To=%2B15550000001&Event=StartApp",
      url: "https://example.com/voice/webhook?provider=plivo&flow=answer&callId=internal-call-id",
      method: "POST",
      query: { provider: "plivo", flow: "answer", callId: "internal-call-id" },
    });

    // One normalized event, correlated to the internal callId (from the
    // query) and the Plivo CallUUID (from the body).
    expect(result.events).toHaveLength(1);
    expect(result.events[0]?.type).toBe("call.answered");
    expect(result.events[0]?.callId).toBe("internal-call-id");
    expect(result.events[0]?.providerCallId).toBe("call-uuid");
    // Answer flow must respond with keep-alive XML (<Wait length="300" />)
    // so Plivo holds the call open.
    expect(result.providerResponseBody).toContain("<Wait");
    expect(result.providerResponseBody).toContain('length="300"');
  });
});
|
||||
@@ -0,0 +1,504 @@
|
||||
import crypto from "node:crypto";
|
||||
|
||||
import type { PlivoConfig } from "../config.js";
|
||||
import type {
|
||||
HangupCallInput,
|
||||
InitiateCallInput,
|
||||
InitiateCallResult,
|
||||
NormalizedEvent,
|
||||
PlayTtsInput,
|
||||
ProviderWebhookParseResult,
|
||||
StartListeningInput,
|
||||
StopListeningInput,
|
||||
WebhookContext,
|
||||
WebhookVerificationResult,
|
||||
} from "../types.js";
|
||||
import { escapeXml } from "../voice-mapping.js";
|
||||
import { reconstructWebhookUrl, verifyPlivoWebhook } from "../webhook-security.js";
|
||||
import type { VoiceCallProvider } from "./base.js";
|
||||
|
||||
/** Optional tuning knobs for {@link PlivoProvider}. */
export interface PlivoProviderOptions {
  /** Override public URL origin for signature verification (e.g. behind a proxy or tunnel). */
  publicUrl?: string;
  /** Skip webhook signature verification — development only; never enable in production. */
  skipVerification?: boolean;
  /** Outbound ring timeout in seconds (default 30; sent to Plivo as `hangup_on_ring`). */
  ringTimeoutSec?: number;
}
|
||||
|
||||
// Text queued by playTts, consumed when Plivo fetches the `xml-speak` flow.
type PendingSpeak = { text: string; locale?: string };
// Listening options queued by startListening, consumed by the `xml-listen` flow.
type PendingListen = { language?: string };
|
||||
|
||||
/**
 * Plivo voice-call provider.
 *
 * Places calls via the Plivo REST API and drives in-call behavior (speak,
 * listen) by transferring the call leg to webhook URLs whose `flow` query
 * parameter selects which Plivo XML this class returns. In-memory maps carry
 * state between the REST calls and the webhook callbacks, so provider state
 * does not survive a process restart.
 */
export class PlivoProvider implements VoiceCallProvider {
  readonly name = "plivo" as const;

  private readonly authId: string;
  private readonly authToken: string;
  private readonly baseUrl: string;
  private readonly options: PlivoProviderOptions;

  // Best-effort mapping between create-call request UUID and call UUID.
  // Populated when webhook events arrive carrying both identifiers.
  private requestUuidToCallUuid = new Map<string, string>();

  // Used for transfer URLs and GetInput action URLs.
  private callIdToWebhookUrl = new Map<string, string>();
  private callUuidToWebhookUrl = new Map<string, string>();

  // One-shot payloads handed to the next xml-speak / xml-listen callback.
  private pendingSpeakByCallId = new Map<string, PendingSpeak>();
  private pendingListenByCallId = new Map<string, PendingListen>();

  /**
   * @param config - Plivo credentials (authId + authToken), both required.
   * @param options - Optional verification / ring-timeout overrides.
   * @throws Error when either credential is missing.
   */
  constructor(config: PlivoConfig, options: PlivoProviderOptions = {}) {
    if (!config.authId) {
      throw new Error("Plivo Auth ID is required");
    }
    if (!config.authToken) {
      throw new Error("Plivo Auth Token is required");
    }

    this.authId = config.authId;
    this.authToken = config.authToken;
    this.baseUrl = `https://api.plivo.com/v1/Account/${this.authId}`;
    this.options = options;
  }

  /**
   * Issue an authenticated (HTTP Basic) request against the Plivo REST API.
   *
   * @returns Parsed JSON body, or `undefined` for empty responses and — when
   *   `allowNotFound` is set — for 404s.
   * @throws Error for any other non-2xx response, including the body text.
   */
  private async apiRequest<T = unknown>(params: {
    method: "GET" | "POST" | "DELETE";
    endpoint: string;
    body?: Record<string, unknown>;
    allowNotFound?: boolean;
  }): Promise<T> {
    const { method, endpoint, body, allowNotFound } = params;
    const response = await fetch(`${this.baseUrl}${endpoint}`, {
      method,
      headers: {
        Authorization: `Basic ${Buffer.from(`${this.authId}:${this.authToken}`).toString("base64")}`,
        "Content-Type": "application/json",
      },
      body: body ? JSON.stringify(body) : undefined,
    });

    if (!response.ok) {
      // 404 is expected when hanging up already-finished calls.
      if (allowNotFound && response.status === 404) {
        return undefined as T;
      }
      const errorText = await response.text();
      throw new Error(`Plivo API error: ${response.status} ${errorText}`);
    }

    const text = await response.text();
    return text ? (JSON.parse(text) as T) : (undefined as T);
  }

  /**
   * Verify a Plivo webhook signature, delegating to the shared
   * verifyPlivoWebhook helper; failures are logged but returned, not thrown.
   */
  verifyWebhook(ctx: WebhookContext): WebhookVerificationResult {
    const result = verifyPlivoWebhook(ctx, this.authToken, {
      publicUrl: this.options.publicUrl,
      skipVerification: this.options.skipVerification,
    });

    if (!result.ok) {
      console.warn(`[plivo] Webhook verification failed: ${result.reason}`);
    }

    return { ok: result.ok, reason: result.reason };
  }

  /**
   * Parse a Plivo webhook (form-encoded) into normalized events and the XML
   * body Plivo expects back.
   *
   * Flow routing (via the `flow` query parameter):
   * - `xml-speak` / `xml-listen`: emit no events; return the queued Speak /
   *   GetInput XML (one-shot, then fall back to keep-alive XML).
   * - `answer` / `getinput`: emit events and keep the call alive with a
   *   long `<Wait>`.
   * - anything else (e.g. hangup): emit events with an empty response.
   */
  parseWebhookEvent(ctx: WebhookContext): ProviderWebhookParseResult {
    const flow =
      typeof ctx.query?.flow === "string" ? ctx.query.flow.trim() : "";

    const parsed = this.parseBody(ctx.rawBody);
    if (!parsed) {
      return { events: [], statusCode: 400 };
    }

    // Keep providerCallId mapping for later call control.
    const callUuid = parsed.get("CallUUID") || undefined;
    if (callUuid) {
      const webhookBase = PlivoProvider.baseWebhookUrlFromCtx(ctx);
      if (webhookBase) {
        this.callUuidToWebhookUrl.set(callUuid, webhookBase);
      }
    }

    // Special flows that exist only to return Plivo XML (no events).
    if (flow === "xml-speak") {
      const callId = this.getCallIdFromQuery(ctx);
      const pending = callId ? this.pendingSpeakByCallId.get(callId) : undefined;
      // One-shot: consume the queued speak payload.
      if (callId) this.pendingSpeakByCallId.delete(callId);

      const xml = pending
        ? PlivoProvider.xmlSpeak(pending.text, pending.locale)
        : PlivoProvider.xmlKeepAlive();
      return {
        events: [],
        providerResponseBody: xml,
        providerResponseHeaders: { "Content-Type": "text/xml" },
        statusCode: 200,
      };
    }

    if (flow === "xml-listen") {
      const callId = this.getCallIdFromQuery(ctx);
      const pending = callId
        ? this.pendingListenByCallId.get(callId)
        : undefined;
      // One-shot: consume the queued listen options.
      if (callId) this.pendingListenByCallId.delete(callId);

      // GetInput posts its result back to the `getinput` flow.
      const actionUrl = this.buildActionUrl(ctx, {
        flow: "getinput",
        callId,
      });

      const xml =
        actionUrl && callId
          ? PlivoProvider.xmlGetInputSpeech({
              actionUrl,
              language: pending?.language,
            })
          : PlivoProvider.xmlKeepAlive();

      return {
        events: [],
        providerResponseBody: xml,
        providerResponseHeaders: { "Content-Type": "text/xml" },
        statusCode: 200,
      };
    }

    // Normal events.
    const callIdFromQuery = this.getCallIdFromQuery(ctx);
    const event = this.normalizeEvent(parsed, callIdFromQuery);

    return {
      events: event ? [event] : [],
      providerResponseBody:
        flow === "answer" || flow === "getinput"
          ? PlivoProvider.xmlKeepAlive()
          : PlivoProvider.xmlEmpty(),
      providerResponseHeaders: { "Content-Type": "text/xml" },
      statusCode: 200,
    };
  }

  /**
   * Translate Plivo's form parameters into a NormalizedEvent.
   *
   * Precedence: DTMF digits, then a speech transcript, then CallStatus
   * lifecycle mapping, then the Event=StartApp answered fallback; returns
   * null when nothing matches. Also records the RequestUUID -> CallUUID
   * mapping as a side effect.
   */
  private normalizeEvent(
    params: URLSearchParams,
    callIdOverride?: string,
  ): NormalizedEvent | null {
    const callUuid = params.get("CallUUID") || "";
    const requestUuid = params.get("RequestUUID") || "";

    // Learn the request->call UUID mapping for later hangup/transfer calls.
    if (requestUuid && callUuid) {
      this.requestUuidToCallUuid.set(requestUuid, callUuid);
    }

    const direction = params.get("Direction");
    const from = params.get("From") || undefined;
    const to = params.get("To") || undefined;
    const callStatus = params.get("CallStatus");

    const baseEvent = {
      id: crypto.randomUUID(),
      // Prefer our own callId (round-tripped via the query string) over
      // Plivo's identifiers.
      callId: callIdOverride || callUuid || requestUuid,
      providerCallId: callUuid || requestUuid || undefined,
      timestamp: Date.now(),
      direction:
        direction === "inbound"
          ? ("inbound" as const)
          : direction === "outbound"
            ? ("outbound" as const)
            : undefined,
      from,
      to,
    };

    const digits = params.get("Digits");
    if (digits) {
      return { ...baseEvent, type: "call.dtmf", digits };
    }

    const transcript = PlivoProvider.extractTranscript(params);
    if (transcript) {
      return {
        ...baseEvent,
        type: "call.speech",
        transcript,
        isFinal: true,
      };
    }

    // Call lifecycle.
    if (callStatus === "ringing") {
      return { ...baseEvent, type: "call.ringing" };
    }

    if (callStatus === "in-progress") {
      return { ...baseEvent, type: "call.answered" };
    }

    if (
      callStatus === "completed" ||
      callStatus === "busy" ||
      callStatus === "no-answer" ||
      callStatus === "failed"
    ) {
      return {
        ...baseEvent,
        type: "call.ended",
        reason:
          callStatus === "completed"
            ? "completed"
            : callStatus === "busy"
              ? "busy"
              : callStatus === "no-answer"
                ? "no-answer"
                : "failed",
      };
    }

    // Plivo will call our answer_url when the call is answered; if we don't have
    // a CallStatus for some reason, treat it as answered so the call can proceed.
    if (params.get("Event") === "StartApp" && callUuid) {
      return { ...baseEvent, type: "call.answered" };
    }

    return null;
  }

  /**
   * Start an outbound call via POST /Call/.
   *
   * Builds answer/hangup callback URLs that round-trip provider + callId in
   * the query string, and remembers the webhook base for later transfers.
   *
   * @returns Plivo's request UUID as the providerCallId (the real CallUUID
   *   is learned later from webhook events).
   * @throws Error when the API response lacks a request_uuid.
   */
  async initiateCall(input: InitiateCallInput): Promise<InitiateCallResult> {
    const webhookUrl = new URL(input.webhookUrl);
    webhookUrl.searchParams.set("provider", "plivo");
    webhookUrl.searchParams.set("callId", input.callId);

    const answerUrl = new URL(webhookUrl);
    answerUrl.searchParams.set("flow", "answer");

    const hangupUrl = new URL(webhookUrl);
    hangupUrl.searchParams.set("flow", "hangup");

    this.callIdToWebhookUrl.set(input.callId, input.webhookUrl);

    const ringTimeoutSec = this.options.ringTimeoutSec ?? 30;

    const result = await this.apiRequest<PlivoCreateCallResponse>({
      method: "POST",
      endpoint: "/Call/",
      body: {
        from: PlivoProvider.normalizeNumber(input.from),
        to: PlivoProvider.normalizeNumber(input.to),
        answer_url: answerUrl.toString(),
        answer_method: "POST",
        hangup_url: hangupUrl.toString(),
        hangup_method: "POST",
        // Plivo's API uses `hangup_on_ring` for outbound ring timeout.
        hangup_on_ring: ringTimeoutSec,
      },
    });

    // Plivo may return request_uuid as a string or an array of strings.
    const requestUuid = Array.isArray(result.request_uuid)
      ? result.request_uuid[0]
      : result.request_uuid;
    if (!requestUuid) {
      throw new Error("Plivo call create returned no request_uuid");
    }

    return { providerCallId: requestUuid, status: "initiated" };
  }

  /**
   * Hang up a call. Resolves providerCallId (a request UUID) to the live
   * CallUUID when known; otherwise tries both DELETE endpoints best-effort,
   * tolerating 404s for calls that already ended.
   */
  async hangupCall(input: HangupCallInput): Promise<void> {
    const callUuid = this.requestUuidToCallUuid.get(input.providerCallId);
    if (callUuid) {
      await this.apiRequest({
        method: "DELETE",
        endpoint: `/Call/${callUuid}/`,
        allowNotFound: true,
      });
      return;
    }

    // Best-effort: try hangup (call UUID), then cancel (request UUID).
    await this.apiRequest({
      method: "DELETE",
      endpoint: `/Call/${input.providerCallId}/`,
      allowNotFound: true,
    });
    await this.apiRequest({
      method: "DELETE",
      endpoint: `/Request/${input.providerCallId}/`,
      allowNotFound: true,
    });
  }

  /**
   * Speak text on an active call: queue the text, then transfer the A-leg to
   * the `xml-speak` flow so the next callback returns the Speak XML.
   *
   * @throws Error when no webhook URL or CallUUID is known for the call.
   */
  async playTts(input: PlayTtsInput): Promise<void> {
    const callUuid = this.requestUuidToCallUuid.get(input.providerCallId) ??
      input.providerCallId;
    const webhookBase =
      this.callUuidToWebhookUrl.get(callUuid) ||
      this.callIdToWebhookUrl.get(input.callId);
    if (!webhookBase) {
      throw new Error("Missing webhook URL for this call (provider state missing)");
    }

    if (!callUuid) {
      throw new Error("Missing Plivo CallUUID for playTts");
    }

    const transferUrl = new URL(webhookBase);
    transferUrl.searchParams.set("provider", "plivo");
    transferUrl.searchParams.set("flow", "xml-speak");
    transferUrl.searchParams.set("callId", input.callId);

    // Queue the payload; parseWebhookEvent consumes it on the callback.
    this.pendingSpeakByCallId.set(input.callId, {
      text: input.text,
      locale: input.locale,
    });

    await this.apiRequest({
      method: "POST",
      endpoint: `/Call/${callUuid}/`,
      body: {
        legs: "aleg",
        aleg_url: transferUrl.toString(),
        aleg_method: "POST",
      },
    });
  }

  /**
   * Start speech capture: queue the language, then transfer the A-leg to the
   * `xml-listen` flow so the next callback returns GetInput XML.
   *
   * @throws Error when no webhook URL or CallUUID is known for the call.
   */
  async startListening(input: StartListeningInput): Promise<void> {
    const callUuid = this.requestUuidToCallUuid.get(input.providerCallId) ??
      input.providerCallId;
    const webhookBase =
      this.callUuidToWebhookUrl.get(callUuid) ||
      this.callIdToWebhookUrl.get(input.callId);
    if (!webhookBase) {
      throw new Error("Missing webhook URL for this call (provider state missing)");
    }

    if (!callUuid) {
      throw new Error("Missing Plivo CallUUID for startListening");
    }

    const transferUrl = new URL(webhookBase);
    transferUrl.searchParams.set("provider", "plivo");
    transferUrl.searchParams.set("flow", "xml-listen");
    transferUrl.searchParams.set("callId", input.callId);

    this.pendingListenByCallId.set(input.callId, {
      language: input.language,
    });

    await this.apiRequest({
      method: "POST",
      endpoint: `/Call/${callUuid}/`,
      body: {
        legs: "aleg",
        aleg_url: transferUrl.toString(),
        aleg_method: "POST",
      },
    });
  }

  async stopListening(_input: StopListeningInput): Promise<void> {
    // GetInput ends automatically when speech ends.
  }

  /** Pass SIP URIs through untouched; strip everything but digits and '+' otherwise. */
  private static normalizeNumber(numberOrSip: string): string {
    const trimmed = numberOrSip.trim();
    if (trimmed.toLowerCase().startsWith("sip:")) return trimmed;
    return trimmed.replace(/[^\d+]/g, "");
  }

  /** Empty Plivo XML response (used for flows that need no instruction). */
  private static xmlEmpty(): string {
    return `<?xml version="1.0" encoding="UTF-8"?><Response></Response>`;
  }

  /** Hold the call open for 300s so we can transfer it later. */
  private static xmlKeepAlive(): string {
    return `<?xml version="1.0" encoding="UTF-8"?>
<Response>
  <Wait length="300" />
</Response>`;
  }

  /** Speak the given text (XML-escaped), then keep the call alive. */
  private static xmlSpeak(text: string, locale?: string): string {
    const language = locale || "en-US";
    return `<?xml version="1.0" encoding="UTF-8"?>
<Response>
  <Speak language="${escapeXml(language)}">${escapeXml(text)}</Speak>
  <Wait length="300" />
</Response>`;
  }

  /** Capture speech via GetInput, posting the result to actionUrl, then keep alive. */
  private static xmlGetInputSpeech(params: {
    actionUrl: string;
    language?: string;
  }): string {
    const language = params.language || "en-US";
    return `<?xml version="1.0" encoding="UTF-8"?>
<Response>
  <GetInput inputType="speech" method="POST" action="${escapeXml(params.actionUrl)}" language="${escapeXml(language)}" executionTimeout="30" speechEndTimeout="1" redirect="false">
  </GetInput>
  <Wait length="300" />
</Response>`;
  }

  /** Extract a non-empty, trimmed callId from the webhook query string. */
  private getCallIdFromQuery(ctx: WebhookContext): string | undefined {
    const callId =
      typeof ctx.query?.callId === "string" && ctx.query.callId.trim()
        ? ctx.query.callId.trim()
        : undefined;
    return callId || undefined;
  }

  /** Build a webhook URL pointing back at this provider with the given flow/callId. */
  private buildActionUrl(
    ctx: WebhookContext,
    opts: { flow: string; callId?: string },
  ): string | null {
    const base = PlivoProvider.baseWebhookUrlFromCtx(ctx);
    if (!base) return null;

    const u = new URL(base);
    u.searchParams.set("provider", "plivo");
    u.searchParams.set("flow", opts.flow);
    if (opts.callId) u.searchParams.set("callId", opts.callId);
    return u.toString();
  }

  /** Origin + path of the incoming webhook URL (query dropped), or null on failure. */
  private static baseWebhookUrlFromCtx(ctx: WebhookContext): string | null {
    try {
      const u = new URL(reconstructWebhookUrl(ctx));
      return `${u.origin}${u.pathname}`;
    } catch {
      return null;
    }
  }

  /** Parse a form-encoded body; null on failure. */
  private parseBody(rawBody: string): URLSearchParams | null {
    try {
      return new URLSearchParams(rawBody);
    } catch {
      return null;
    }
  }

  /**
   * Find a speech transcript among the parameter names Plivo may use,
   * returning the first non-blank match, trimmed.
   */
  private static extractTranscript(params: URLSearchParams): string | null {
    const candidates = [
      "Speech",
      "Transcription",
      "TranscriptionText",
      "SpeechResult",
      "RecognizedSpeech",
      "Text",
    ] as const;

    for (const key of candidates) {
      const value = params.get(key);
      if (value && value.trim()) return value.trim();
    }
    return null;
  }
}
|
||||
|
||||
// Shape of the POST /Call/ response we care about; request_uuid may be a
// string or an array of strings (see initiateCall).
type PlivoCreateCallResponse = {
  api_id?: string;
  message?: string;
  request_uuid?: string | string[];
};
|
||||
@@ -0,0 +1,311 @@
|
||||
/**
|
||||
* OpenAI Realtime STT Provider
|
||||
*
|
||||
* Uses the OpenAI Realtime API for streaming transcription with:
|
||||
* - Direct mu-law audio support (no conversion needed)
|
||||
* - Built-in server-side VAD for turn detection
|
||||
* - Low-latency streaming transcription
|
||||
* - Partial transcript callbacks for real-time UI updates
|
||||
*/
|
||||
|
||||
import WebSocket from "ws";
|
||||
|
||||
/**
 * Configuration for OpenAI Realtime STT.
 */
export interface RealtimeSTTConfig {
  /** OpenAI API key (required). */
  apiKey: string;
  /** Model to use (default: gpt-4o-transcribe) */
  model?: string;
  /** Silence duration in ms before considering speech ended (default: 800) */
  silenceDurationMs?: number;
  /** VAD threshold 0-1 (default: 0.5) */
  vadThreshold?: number;
}
|
||||
|
||||
/**
 * Session for streaming audio and receiving transcripts.
 */
export interface RealtimeSTTSession {
  /** Connect to the transcription service */
  connect(): Promise<void>;
  /** Send mu-law audio data (8kHz mono) */
  sendAudio(audio: Buffer): void;
  /** Wait for next complete transcript (after VAD detects end of speech); default timeout 30s */
  waitForTranscript(timeoutMs?: number): Promise<string>;
  /** Set callback for partial transcripts (streaming) */
  onPartial(callback: (partial: string) => void): void;
  /** Set callback for final transcripts */
  onTranscript(callback: (transcript: string) => void): void;
  /** Set callback when speech starts (VAD) */
  onSpeechStart(callback: () => void): void;
  /** Close the session */
  close(): void;
  /** Check if session is connected */
  isConnected(): boolean;
}
|
||||
|
||||
/**
|
||||
* Provider factory for OpenAI Realtime STT sessions.
|
||||
*/
|
||||
export class OpenAIRealtimeSTTProvider {
|
||||
readonly name = "openai-realtime";
|
||||
private apiKey: string;
|
||||
private model: string;
|
||||
private silenceDurationMs: number;
|
||||
private vadThreshold: number;
|
||||
|
||||
constructor(config: RealtimeSTTConfig) {
|
||||
if (!config.apiKey) {
|
||||
throw new Error("OpenAI API key required for Realtime STT");
|
||||
}
|
||||
this.apiKey = config.apiKey;
|
||||
this.model = config.model || "gpt-4o-transcribe";
|
||||
this.silenceDurationMs = config.silenceDurationMs || 800;
|
||||
this.vadThreshold = config.vadThreshold || 0.5;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new realtime transcription session.
|
||||
*/
|
||||
createSession(): RealtimeSTTSession {
|
||||
return new OpenAIRealtimeSTTSession(
|
||||
this.apiKey,
|
||||
this.model,
|
||||
this.silenceDurationMs,
|
||||
this.vadThreshold,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * WebSocket-based session for real-time speech-to-text.
 *
 * Connects to the OpenAI Realtime API transcription endpoint, streams
 * base64-encoded mu-law audio, and surfaces partial/final transcripts and
 * VAD speech-start notifications via callbacks. Unintentional disconnects
 * trigger exponential-backoff reconnection (up to MAX_RECONNECT_ATTEMPTS).
 */
class OpenAIRealtimeSTTSession implements RealtimeSTTSession {
  private static readonly MAX_RECONNECT_ATTEMPTS = 5;
  private static readonly RECONNECT_DELAY_MS = 1000;

  private ws: WebSocket | null = null;
  private connected = false;
  // True only after close() — distinguishes intentional shutdown from drops.
  private closed = false;
  private reconnectAttempts = 0;
  // Accumulates streaming deltas until a completed transcript arrives.
  private pendingTranscript = "";
  private onTranscriptCallback: ((transcript: string) => void) | null = null;
  private onPartialCallback: ((partial: string) => void) | null = null;
  private onSpeechStartCallback: (() => void) | null = null;

  constructor(
    private readonly apiKey: string,
    private readonly model: string,
    private readonly silenceDurationMs: number,
    private readonly vadThreshold: number,
  ) {}

  /** Open the WebSocket; resets closed/reconnect state so a session can be reused. */
  async connect(): Promise<void> {
    this.closed = false;
    this.reconnectAttempts = 0;
    return this.doConnect();
  }

  /**
   * Establish the WebSocket and configure the transcription session on open.
   * Resolves on open; rejects on pre-open error or after a 10s timeout.
   */
  private async doConnect(): Promise<void> {
    return new Promise((resolve, reject) => {
      const url = "wss://api.openai.com/v1/realtime?intent=transcription";

      this.ws = new WebSocket(url, {
        headers: {
          Authorization: `Bearer ${this.apiKey}`,
          "OpenAI-Beta": "realtime=v1",
        },
      });

      this.ws.on("open", () => {
        console.log("[RealtimeSTT] WebSocket connected");
        this.connected = true;
        this.reconnectAttempts = 0;

        // Configure the transcription session
        this.sendEvent({
          type: "transcription_session.update",
          session: {
            // Telephony audio is sent as-is: 8kHz mu-law, no conversion.
            input_audio_format: "g711_ulaw",
            input_audio_transcription: {
              model: this.model,
            },
            turn_detection: {
              type: "server_vad",
              threshold: this.vadThreshold,
              prefix_padding_ms: 300,
              silence_duration_ms: this.silenceDurationMs,
            },
          },
        });

        resolve();
      });

      this.ws.on("message", (data: Buffer) => {
        try {
          const event = JSON.parse(data.toString());
          this.handleEvent(event);
        } catch (e) {
          console.error("[RealtimeSTT] Failed to parse event:", e);
        }
      });

      this.ws.on("error", (error) => {
        console.error("[RealtimeSTT] WebSocket error:", error);
        // Only reject before open; later errors are followed by 'close'.
        if (!this.connected) reject(error);
      });

      this.ws.on("close", (code, reason) => {
        console.log(
          `[RealtimeSTT] WebSocket closed (code: ${code}, reason: ${reason?.toString() || "none"})`,
        );
        this.connected = false;

        // Attempt reconnection if not intentionally closed
        if (!this.closed) {
          void this.attemptReconnect();
        }
      });

      // NOTE(review): this timer is never cleared on success — after a
      // successful open the late reject is a no-op, but the timer stays
      // pending for up to 10s; consider clearTimeout in the 'open' handler.
      setTimeout(() => {
        if (!this.connected) {
          reject(new Error("Realtime STT connection timeout"));
        }
      }, 10000);
    });
  }

  /**
   * Reconnect with exponential backoff (1s, 2s, 4s, ...), giving up after
   * MAX_RECONNECT_ATTEMPTS or if close() is called meanwhile.
   */
  private async attemptReconnect(): Promise<void> {
    if (this.closed) {
      return;
    }

    if (
      this.reconnectAttempts >= OpenAIRealtimeSTTSession.MAX_RECONNECT_ATTEMPTS
    ) {
      console.error(
        `[RealtimeSTT] Max reconnect attempts (${OpenAIRealtimeSTTSession.MAX_RECONNECT_ATTEMPTS}) reached`,
      );
      return;
    }

    this.reconnectAttempts++;
    const delay =
      OpenAIRealtimeSTTSession.RECONNECT_DELAY_MS *
      2 ** (this.reconnectAttempts - 1);
    console.log(
      `[RealtimeSTT] Reconnecting ${this.reconnectAttempts}/${OpenAIRealtimeSTTSession.MAX_RECONNECT_ATTEMPTS} in ${delay}ms...`,
    );

    await new Promise((resolve) => setTimeout(resolve, delay));

    // Re-check: close() may have been called during the backoff sleep.
    if (this.closed) {
      return;
    }

    try {
      await this.doConnect();
      console.log("[RealtimeSTT] Reconnected successfully");
    } catch (error) {
      console.error("[RealtimeSTT] Reconnect failed:", error);
    }
  }

  /** Dispatch a server event to the appropriate callback / internal state. */
  private handleEvent(event: {
    type: string;
    delta?: string;
    transcript?: string;
    error?: unknown;
  }): void {
    switch (event.type) {
      // Informational lifecycle events — logged only.
      case "transcription_session.created":
      case "transcription_session.updated":
      case "input_audio_buffer.speech_stopped":
      case "input_audio_buffer.committed":
        console.log(`[RealtimeSTT] ${event.type}`);
        break;

      // Streaming partial: accumulate and report the running transcript.
      case "conversation.item.input_audio_transcription.delta":
        if (event.delta) {
          this.pendingTranscript += event.delta;
          this.onPartialCallback?.(this.pendingTranscript);
        }
        break;

      // Final transcript for the turn; resets the partial accumulator.
      case "conversation.item.input_audio_transcription.completed":
        if (event.transcript) {
          console.log(`[RealtimeSTT] Transcript: ${event.transcript}`);
          this.onTranscriptCallback?.(event.transcript);
        }
        this.pendingTranscript = "";
        break;

      // Server VAD detected speech onset; start a fresh partial.
      case "input_audio_buffer.speech_started":
        console.log("[RealtimeSTT] Speech started");
        this.pendingTranscript = "";
        this.onSpeechStartCallback?.();
        break;

      case "error":
        console.error("[RealtimeSTT] Error:", event.error);
        break;
    }
  }

  /** Send a JSON event if the socket is open; silently dropped otherwise. */
  private sendEvent(event: unknown): void {
    if (this.ws?.readyState === WebSocket.OPEN) {
      this.ws.send(JSON.stringify(event));
    }
  }

  /** Stream mu-law audio to the input buffer (dropped while disconnected). */
  sendAudio(muLawData: Buffer): void {
    if (!this.connected) return;
    this.sendEvent({
      type: "input_audio_buffer.append",
      audio: muLawData.toString("base64"),
    });
  }

  onPartial(callback: (partial: string) => void): void {
    this.onPartialCallback = callback;
  }

  onTranscript(callback: (transcript: string) => void): void {
    this.onTranscriptCallback = callback;
  }

  onSpeechStart(callback: () => void): void {
    this.onSpeechStartCallback = callback;
  }

  /**
   * Resolve with the next final transcript, or reject after timeoutMs.
   *
   * NOTE(review): this temporarily replaces (and afterwards clears) any
   * callback registered via onTranscript(); the two APIs are mutually
   * exclusive per turn — confirm callers never rely on both at once.
   */
  async waitForTranscript(timeoutMs = 30000): Promise<string> {
    return new Promise((resolve, reject) => {
      const timeout = setTimeout(() => {
        this.onTranscriptCallback = null;
        reject(new Error("Transcript timeout"));
      }, timeoutMs);

      this.onTranscriptCallback = (transcript) => {
        clearTimeout(timeout);
        this.onTranscriptCallback = null;
        resolve(transcript);
      };
    });
  }

  /** Intentional shutdown: suppresses reconnection and drops the socket. */
  close(): void {
    this.closed = true;
    if (this.ws) {
      this.ws.close();
      this.ws = null;
    }
    this.connected = false;
  }

  isConnected(): boolean {
    return this.connected;
  }
}
|
||||
@@ -0,0 +1,364 @@
|
||||
import crypto from "node:crypto";
|
||||
|
||||
import type { TelnyxConfig } from "../config.js";
|
||||
import type {
|
||||
EndReason,
|
||||
HangupCallInput,
|
||||
InitiateCallInput,
|
||||
InitiateCallResult,
|
||||
NormalizedEvent,
|
||||
PlayTtsInput,
|
||||
ProviderWebhookParseResult,
|
||||
StartListeningInput,
|
||||
StopListeningInput,
|
||||
WebhookContext,
|
||||
WebhookVerificationResult,
|
||||
} from "../types.js";
|
||||
import type { VoiceCallProvider } from "./base.js";
|
||||
|
||||
/**
|
||||
* Telnyx Voice API provider implementation.
|
||||
*
|
||||
* Uses Telnyx Call Control API v2 for managing calls.
|
||||
* @see https://developers.telnyx.com/docs/api/v2/call-control
|
||||
*/
|
||||
export class TelnyxProvider implements VoiceCallProvider {
|
||||
readonly name = "telnyx" as const;
|
||||
|
||||
private readonly apiKey: string;
|
||||
private readonly connectionId: string;
|
||||
private readonly publicKey: string | undefined;
|
||||
private readonly baseUrl = "https://api.telnyx.com/v2";
|
||||
|
||||
constructor(config: TelnyxConfig) {
|
||||
if (!config.apiKey) {
|
||||
throw new Error("Telnyx API key is required");
|
||||
}
|
||||
if (!config.connectionId) {
|
||||
throw new Error("Telnyx connection ID is required");
|
||||
}
|
||||
|
||||
this.apiKey = config.apiKey;
|
||||
this.connectionId = config.connectionId;
|
||||
this.publicKey = config.publicKey;
|
||||
}
|
||||
|
||||
/**
|
||||
* Make an authenticated request to the Telnyx API.
|
||||
*/
|
||||
private async apiRequest<T = unknown>(
|
||||
endpoint: string,
|
||||
body: Record<string, unknown>,
|
||||
options?: { allowNotFound?: boolean },
|
||||
): Promise<T> {
|
||||
const response = await fetch(`${this.baseUrl}${endpoint}`, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
Authorization: `Bearer ${this.apiKey}`,
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
body: JSON.stringify(body),
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
if (options?.allowNotFound && response.status === 404) {
|
||||
return undefined as T;
|
||||
}
|
||||
const errorText = await response.text();
|
||||
throw new Error(`Telnyx API error: ${response.status} ${errorText}`);
|
||||
}
|
||||
|
||||
const text = await response.text();
|
||||
return text ? (JSON.parse(text) as T) : (undefined as T);
|
||||
}
|
||||
|
||||
/**
 * Verify Telnyx webhook signature using Ed25519.
 *
 * Checks the `telnyx-signature-ed25519` header against the payload
 * `"{timestamp}|{rawBody}"` using the configured base64 public key, then
 * rejects events whose timestamp is more than 5 minutes from local time
 * (in either direction, since `Math.abs` is used).
 *
 * @param ctx - Incoming webhook request (headers + raw body).
 * @returns `{ ok: true }` on success, otherwise `{ ok: false, reason }`.
 */
verifyWebhook(ctx: WebhookContext): WebhookVerificationResult {
  if (!this.publicKey) {
    // No public key configured, skip verification (not recommended for production)
    return { ok: true };
  }

  const signature = ctx.headers["telnyx-signature-ed25519"];
  const timestamp = ctx.headers["telnyx-timestamp"];

  if (!signature || !timestamp) {
    return { ok: false, reason: "Missing signature or timestamp header" };
  }

  // Header values may arrive as arrays; use the first entry in that case.
  const signatureStr = Array.isArray(signature) ? signature[0] : signature;
  const timestampStr = Array.isArray(timestamp) ? timestamp[0] : timestamp;

  if (!signatureStr || !timestampStr) {
    return { ok: false, reason: "Empty signature or timestamp" };
  }

  try {
    // Telnyx signs "{timestamp}|{rawBody}" with its Ed25519 key.
    const signedPayload = `${timestampStr}|${ctx.rawBody}`;
    const signatureBuffer = Buffer.from(signatureStr, "base64");
    const publicKeyBuffer = Buffer.from(this.publicKey, "base64");

    const isValid = crypto.verify(
      null, // Ed25519 doesn't use a digest
      Buffer.from(signedPayload),
      {
        key: publicKeyBuffer,
        format: "der",
        type: "spki",
      },
      signatureBuffer,
    );

    if (!isValid) {
      return { ok: false, reason: "Invalid signature" };
    }

    // Check timestamp is within 5 minutes
    const eventTime = parseInt(timestampStr, 10) * 1000;
    const now = Date.now();
    if (Math.abs(now - eventTime) > 5 * 60 * 1000) {
      return { ok: false, reason: "Timestamp too old" };
    }

    return { ok: true };
  } catch (err) {
    // Malformed base64 / key material ends up here rather than throwing upward.
    return {
      ok: false,
      reason: `Verification error: ${err instanceof Error ? err.message : String(err)}`,
    };
  }
}
|
||||
|
||||
/**
 * Parse Telnyx webhook event into normalized format.
 *
 * @param ctx - Incoming webhook request; `rawBody` must be JSON.
 * @returns Zero or one normalized events with HTTP 200, or 400 on parse failure.
 */
parseWebhookEvent(ctx: WebhookContext): ProviderWebhookParseResult {
  try {
    const payload = JSON.parse(ctx.rawBody);
    const data = payload.data;

    // Events without a type carry nothing actionable; acknowledge and drop.
    if (!data || !data.event_type) {
      return { events: [], statusCode: 200 };
    }

    const normalized = this.normalizeEvent(data);
    return {
      events: normalized ? [normalized] : [],
      statusCode: 200,
    };
  } catch {
    // Unparseable body: tell Telnyx the request was bad.
    return { events: [], statusCode: 400 };
  }
}
|
||||
|
||||
/**
 * Convert Telnyx event to normalized event format.
 *
 * @param data - Raw Telnyx webhook event (`payload.data`).
 * @returns A normalized event, or null for event types this plugin ignores.
 */
private normalizeEvent(data: TelnyxEvent): NormalizedEvent | null {
  // Decode client_state from Base64 (we encode it in initiateCall)
  let callId = "";
  if (data.payload?.client_state) {
    try {
      callId = Buffer.from(data.payload.client_state, "base64").toString(
        "utf8",
      );
    } catch {
      // Fallback if not valid Base64
      callId = data.payload.client_state;
    }
  }
  // No client_state (e.g. inbound calls): fall back to Telnyx's own id.
  if (!callId) {
    callId = data.payload?.call_control_id || "";
  }

  const baseEvent = {
    // Prefer Telnyx's event UUID; mint one locally when absent.
    id: data.id || crypto.randomUUID(),
    callId,
    providerCallId: data.payload?.call_control_id,
    timestamp: Date.now(),
  };

  switch (data.event_type) {
    case "call.initiated":
      return { ...baseEvent, type: "call.initiated" };

    case "call.ringing":
      return { ...baseEvent, type: "call.ringing" };

    case "call.answered":
      return { ...baseEvent, type: "call.answered" };

    case "call.bridged":
      return { ...baseEvent, type: "call.active" };

    case "call.speak.started":
      return {
        ...baseEvent,
        type: "call.speaking",
        text: data.payload?.text || "",
      };

    case "call.transcription":
      return {
        ...baseEvent,
        type: "call.speech",
        transcript: data.payload?.transcription || "",
        // A missing is_final flag is treated as a final transcript.
        isFinal: data.payload?.is_final ?? true,
        confidence: data.payload?.confidence,
      };

    case "call.hangup":
      return {
        ...baseEvent,
        type: "call.ended",
        reason: this.mapHangupCause(data.payload?.hangup_cause),
      };

    case "call.dtmf.received":
      return {
        ...baseEvent,
        type: "call.dtmf",
        digits: data.payload?.digit || "",
      };

    default:
      // All other Telnyx event types are intentionally dropped.
      return null;
  }
}
|
||||
|
||||
/**
 * Map Telnyx hangup cause to normalized end reason.
 * Unknown causes are logged and reported as "completed".
 * @see https://developers.telnyx.com/docs/api/v2/call-control/Call-Commands#hangup-causes
 */
private mapHangupCause(cause?: string): EndReason {
  const causeTable = new Map<string, EndReason>([
    ["normal_clearing", "completed"],
    ["normal_unspecified", "completed"],
    ["originator_cancel", "hangup-bot"],
    ["call_rejected", "busy"],
    ["user_busy", "busy"],
    ["no_answer", "no-answer"],
    ["no_user_response", "no-answer"],
    ["destination_out_of_order", "failed"],
    ["network_out_of_order", "failed"],
    ["service_unavailable", "failed"],
    ["recovery_on_timer_expire", "failed"],
    ["machine_detected", "voicemail"],
    ["fax_detected", "voicemail"],
    ["user_hangup", "hangup-user"],
    ["subscriber_absent", "hangup-user"],
  ]);

  const mapped = cause === undefined ? undefined : causeTable.get(cause);
  if (mapped !== undefined) {
    return mapped;
  }

  // Unknown cause - log it for debugging and return completed
  if (cause) {
    console.warn(`[telnyx] Unknown hangup cause: ${cause}`);
  }
  return "completed";
}
|
||||
|
||||
/**
|
||||
* Initiate an outbound call via Telnyx API.
|
||||
*/
|
||||
async initiateCall(input: InitiateCallInput): Promise<InitiateCallResult> {
|
||||
const result = await this.apiRequest<TelnyxCallResponse>("/calls", {
|
||||
connection_id: this.connectionId,
|
||||
to: input.to,
|
||||
from: input.from,
|
||||
webhook_url: input.webhookUrl,
|
||||
webhook_url_method: "POST",
|
||||
client_state: Buffer.from(input.callId).toString("base64"),
|
||||
timeout_secs: 30,
|
||||
});
|
||||
|
||||
return {
|
||||
providerCallId: result.data.call_control_id,
|
||||
status: "initiated",
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Hang up a call via Telnyx API.
|
||||
*/
|
||||
async hangupCall(input: HangupCallInput): Promise<void> {
|
||||
await this.apiRequest(
|
||||
`/calls/${input.providerCallId}/actions/hangup`,
|
||||
{ command_id: crypto.randomUUID() },
|
||||
{ allowNotFound: true },
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Play TTS audio via Telnyx speak action.
|
||||
*/
|
||||
async playTts(input: PlayTtsInput): Promise<void> {
|
||||
await this.apiRequest(`/calls/${input.providerCallId}/actions/speak`, {
|
||||
command_id: crypto.randomUUID(),
|
||||
payload: input.text,
|
||||
voice: input.voice || "female",
|
||||
language: input.locale || "en-US",
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Start transcription (STT) via Telnyx.
|
||||
*/
|
||||
async startListening(input: StartListeningInput): Promise<void> {
|
||||
await this.apiRequest(
|
||||
`/calls/${input.providerCallId}/actions/transcription_start`,
|
||||
{
|
||||
command_id: crypto.randomUUID(),
|
||||
language: input.language || "en",
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Stop transcription via Telnyx.
|
||||
*/
|
||||
async stopListening(input: StopListeningInput): Promise<void> {
|
||||
await this.apiRequest(
|
||||
`/calls/${input.providerCallId}/actions/transcription_stop`,
|
||||
{ command_id: crypto.randomUUID() },
|
||||
{ allowNotFound: true },
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// Telnyx-specific types
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
/** Raw Telnyx webhook event shape (the `data` object of the webhook body). */
interface TelnyxEvent {
  /** Telnyx event UUID; used as the normalized event id when present. */
  id?: string;
  /** e.g. "call.answered", "call.transcription", "call.hangup". */
  event_type: string;
  payload?: {
    /** Telnyx identifier used to address Call Control commands. */
    call_control_id?: string;
    /** Base64-encoded state we set in initiateCall (carries our callId). */
    client_state?: string;
    /** Text being spoken (call.speak.started). */
    text?: string;
    /** Transcribed speech (call.transcription). */
    transcription?: string;
    /** Whether the transcript is final (call.transcription). */
    is_final?: boolean;
    /** Transcription confidence (call.transcription). */
    confidence?: number;
    /** Hangup cause string (call.hangup); see mapHangupCause. */
    hangup_cause?: string;
    /** Received DTMF digit (call.dtmf.received). */
    digit?: string;
    // Telnyx sends many more fields; keep them accessible but untyped.
    [key: string]: unknown;
  };
}
|
||||
|
||||
/** Response body of POST /calls (Telnyx Call Control v2 "Dial"). */
interface TelnyxCallResponse {
  data: {
    /** Identifier used for all subsequent call commands. */
    call_control_id: string;
    call_leg_id: string;
    call_session_id: string;
    is_alive: boolean;
    record_type: string;
  };
}
|
||||
@@ -0,0 +1,264 @@
|
||||
/**
|
||||
* OpenAI TTS Provider
|
||||
*
|
||||
* Generates speech audio using OpenAI's text-to-speech API.
|
||||
* Handles audio format conversion for telephony (mu-law 8kHz).
|
||||
*
|
||||
* Best practices from OpenAI docs:
|
||||
* - Use gpt-4o-mini-tts for intelligent realtime applications (supports instructions)
|
||||
* - Use tts-1 for lower latency, tts-1-hd for higher quality
|
||||
* - Use marin or cedar voices for best quality
|
||||
* - Use pcm or wav format for fastest response times
|
||||
*
|
||||
* @see https://platform.openai.com/docs/guides/text-to-speech
|
||||
*/
|
||||
|
||||
/**
 * OpenAI TTS configuration.
 * All fields are optional; see OpenAITTSProvider's constructor for defaults.
 */
export interface OpenAITTSConfig {
  /** OpenAI API key (uses OPENAI_API_KEY env if not set) */
  apiKey?: string;
  /**
   * TTS model:
   * - gpt-4o-mini-tts: newest, supports instructions for tone/style control (recommended)
   * - tts-1: lower latency
   * - tts-1-hd: higher quality
   */
  model?: string;
  /**
   * Voice to use. For best quality, use marin or cedar.
   * All 13 voices: alloy, ash, ballad, coral, echo, fable, nova, onyx, sage, shimmer, verse, marin, cedar
   * Note: tts-1/tts-1-hd only support: alloy, ash, coral, echo, fable, onyx, nova, sage, shimmer
   */
  voice?: string;
  /** Speed multiplier (0.25 to 4.0) */
  speed?: number;
  /**
   * Instructions for speech style (only works with gpt-4o-mini-tts model).
   * Examples: "Speak in a cheerful tone", "Talk like a sympathetic customer service agent"
   */
  instructions?: string;
}
|
||||
|
||||
/**
 * Supported OpenAI TTS voices (all 13 built-in voices).
 * For best quality, use marin or cedar.
 * Note: tts-1 and tts-1-hd support a smaller set.
 */
export const OPENAI_TTS_VOICES = [
  "alloy",
  "ash",
  "ballad",
  "coral",
  "echo",
  "fable",
  "nova",
  "onyx",
  "sage",
  "shimmer",
  "verse",
  "marin",
  "cedar",
] as const;

/** Union of the voice name literals above. */
export type OpenAITTSVoice = (typeof OPENAI_TTS_VOICES)[number];
|
||||
|
||||
/**
|
||||
* OpenAI TTS Provider for generating speech audio.
|
||||
*/
|
||||
export class OpenAITTSProvider {
|
||||
private apiKey: string;
|
||||
private model: string;
|
||||
private voice: OpenAITTSVoice;
|
||||
private speed: number;
|
||||
private instructions?: string;
|
||||
|
||||
constructor(config: OpenAITTSConfig = {}) {
|
||||
this.apiKey = config.apiKey || process.env.OPENAI_API_KEY || "";
|
||||
// Default to gpt-4o-mini-tts for intelligent realtime applications
|
||||
this.model = config.model || "gpt-4o-mini-tts";
|
||||
// Default to coral - good balance of quality and natural tone
|
||||
this.voice = (config.voice as OpenAITTSVoice) || "coral";
|
||||
this.speed = config.speed || 1.0;
|
||||
this.instructions = config.instructions;
|
||||
|
||||
if (!this.apiKey) {
|
||||
throw new Error(
|
||||
"OpenAI API key required (set OPENAI_API_KEY or pass apiKey)",
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate speech audio from text.
|
||||
* Returns raw PCM audio data (24kHz, mono, 16-bit).
|
||||
*/
|
||||
async synthesize(text: string, instructions?: string): Promise<Buffer> {
|
||||
// Build request body
|
||||
const body: Record<string, unknown> = {
|
||||
model: this.model,
|
||||
input: text,
|
||||
voice: this.voice,
|
||||
response_format: "pcm", // Raw PCM audio (24kHz, mono, 16-bit signed LE)
|
||||
speed: this.speed,
|
||||
};
|
||||
|
||||
// Add instructions if using gpt-4o-mini-tts model
|
||||
const effectiveInstructions = instructions || this.instructions;
|
||||
if (effectiveInstructions && this.model.includes("gpt-4o-mini-tts")) {
|
||||
body.instructions = effectiveInstructions;
|
||||
}
|
||||
|
||||
const response = await fetch("https://api.openai.com/v1/audio/speech", {
|
||||
method: "POST",
|
||||
headers: {
|
||||
Authorization: `Bearer ${this.apiKey}`,
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
body: JSON.stringify(body),
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
const error = await response.text();
|
||||
throw new Error(`OpenAI TTS failed: ${response.status} - ${error}`);
|
||||
}
|
||||
|
||||
const arrayBuffer = await response.arrayBuffer();
|
||||
return Buffer.from(arrayBuffer);
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate speech and convert to mu-law format for Twilio.
|
||||
* Twilio Media Streams expect 8kHz mono mu-law audio.
|
||||
*/
|
||||
async synthesizeForTwilio(text: string): Promise<Buffer> {
|
||||
// Get raw PCM from OpenAI (24kHz, 16-bit signed LE, mono)
|
||||
const pcm24k = await this.synthesize(text);
|
||||
|
||||
// Resample from 24kHz to 8kHz
|
||||
const pcm8k = resample24kTo8k(pcm24k);
|
||||
|
||||
// Encode to mu-law
|
||||
return pcmToMulaw(pcm8k);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Resample 24kHz PCM to 8kHz using linear interpolation.
|
||||
* Input/output: 16-bit signed little-endian mono.
|
||||
*/
|
||||
function resample24kTo8k(input: Buffer): Buffer {
|
||||
const inputSamples = input.length / 2;
|
||||
const outputSamples = Math.floor(inputSamples / 3);
|
||||
const output = Buffer.alloc(outputSamples * 2);
|
||||
|
||||
for (let i = 0; i < outputSamples; i++) {
|
||||
// Calculate position in input (3:1 ratio)
|
||||
const srcPos = i * 3;
|
||||
const srcIdx = srcPos * 2;
|
||||
|
||||
if (srcIdx + 3 < input.length) {
|
||||
// Linear interpolation between samples
|
||||
const s0 = input.readInt16LE(srcIdx);
|
||||
const s1 = input.readInt16LE(srcIdx + 2);
|
||||
const frac = srcPos % 1 || 0;
|
||||
const sample = Math.round(s0 + frac * (s1 - s0));
|
||||
output.writeInt16LE(clamp16(sample), i * 2);
|
||||
} else {
|
||||
// Last sample
|
||||
output.writeInt16LE(input.readInt16LE(srcIdx), i * 2);
|
||||
}
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
/**
|
||||
* Clamp value to 16-bit signed integer range.
|
||||
*/
|
||||
function clamp16(value: number): number {
|
||||
return Math.max(-32768, Math.min(32767, value));
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert 16-bit PCM to 8-bit mu-law.
|
||||
* Standard G.711 mu-law encoding for telephony.
|
||||
*/
|
||||
function pcmToMulaw(pcm: Buffer): Buffer {
|
||||
const samples = pcm.length / 2;
|
||||
const mulaw = Buffer.alloc(samples);
|
||||
|
||||
for (let i = 0; i < samples; i++) {
|
||||
const sample = pcm.readInt16LE(i * 2);
|
||||
mulaw[i] = linearToMulaw(sample);
|
||||
}
|
||||
|
||||
return mulaw;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert a single 16-bit linear sample to 8-bit mu-law.
|
||||
* Implements ITU-T G.711 mu-law encoding.
|
||||
*/
|
||||
function linearToMulaw(sample: number): number {
|
||||
const BIAS = 132;
|
||||
const CLIP = 32635;
|
||||
|
||||
// Get sign bit
|
||||
const sign = sample < 0 ? 0x80 : 0;
|
||||
if (sample < 0) sample = -sample;
|
||||
|
||||
// Clip to prevent overflow
|
||||
if (sample > CLIP) sample = CLIP;
|
||||
|
||||
// Add bias and find segment
|
||||
sample += BIAS;
|
||||
let exponent = 7;
|
||||
for (
|
||||
let expMask = 0x4000;
|
||||
(sample & expMask) === 0 && exponent > 0;
|
||||
exponent--, expMask >>= 1
|
||||
) {
|
||||
// Find the segment (exponent)
|
||||
}
|
||||
|
||||
// Extract mantissa bits
|
||||
const mantissa = (sample >> (exponent + 3)) & 0x0f;
|
||||
|
||||
// Combine into mu-law byte (inverted for transmission)
|
||||
return ~(sign | (exponent << 4) | mantissa) & 0xff;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert 8-bit mu-law to 16-bit linear PCM.
|
||||
* Useful for decoding incoming audio.
|
||||
*/
|
||||
export function mulawToLinear(mulaw: number): number {
|
||||
// mu-law is transmitted inverted
|
||||
mulaw = ~mulaw & 0xff;
|
||||
|
||||
const sign = mulaw & 0x80;
|
||||
const exponent = (mulaw >> 4) & 0x07;
|
||||
const mantissa = mulaw & 0x0f;
|
||||
|
||||
let sample = ((mantissa << 3) + 132) << exponent;
|
||||
sample -= 132;
|
||||
|
||||
return sign ? -sample : sample;
|
||||
}
|
||||
|
||||
/**
|
||||
* Chunk audio buffer into 20ms frames for streaming.
|
||||
* At 8kHz mono, 20ms = 160 samples = 160 bytes (mu-law).
|
||||
*/
|
||||
export function chunkAudio(
|
||||
audio: Buffer,
|
||||
chunkSize = 160,
|
||||
): Generator<Buffer, void, unknown> {
|
||||
return (function* () {
|
||||
for (let i = 0; i < audio.length; i += chunkSize) {
|
||||
yield audio.subarray(i, Math.min(i + chunkSize, audio.length));
|
||||
}
|
||||
})();
|
||||
}
|
||||
@@ -0,0 +1,64 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
|
||||
import type { WebhookContext } from "../types.js";
|
||||
import { TwilioProvider } from "./twilio.js";
|
||||
|
||||
const STREAM_URL = "wss://example.ngrok.app/voice/stream";
|
||||
|
||||
function createProvider(): TwilioProvider {
|
||||
return new TwilioProvider(
|
||||
{ accountSid: "AC123", authToken: "secret" },
|
||||
{ publicUrl: "https://example.ngrok.app", streamPath: "/voice/stream" },
|
||||
);
|
||||
}
|
||||
|
||||
function createContext(
|
||||
rawBody: string,
|
||||
query?: WebhookContext["query"],
|
||||
): WebhookContext {
|
||||
return {
|
||||
headers: {},
|
||||
rawBody,
|
||||
url: "https://example.ngrok.app/voice/twilio",
|
||||
method: "POST",
|
||||
query,
|
||||
};
|
||||
}
|
||||
|
||||
describe("TwilioProvider", () => {
|
||||
it("returns streaming TwiML for outbound conversation calls before in-progress", () => {
|
||||
const provider = createProvider();
|
||||
const ctx = createContext("CallStatus=initiated&Direction=outbound-api", {
|
||||
callId: "call-1",
|
||||
});
|
||||
|
||||
const result = provider.parseWebhookEvent(ctx);
|
||||
|
||||
expect(result.providerResponseBody).toContain(STREAM_URL);
|
||||
expect(result.providerResponseBody).toContain("<Connect>");
|
||||
});
|
||||
|
||||
it("returns empty TwiML for status callbacks", () => {
|
||||
const provider = createProvider();
|
||||
const ctx = createContext("CallStatus=ringing&Direction=outbound-api", {
|
||||
callId: "call-1",
|
||||
type: "status",
|
||||
});
|
||||
|
||||
const result = provider.parseWebhookEvent(ctx);
|
||||
|
||||
expect(result.providerResponseBody).toBe(
|
||||
'<?xml version="1.0" encoding="UTF-8"?><Response></Response>',
|
||||
);
|
||||
});
|
||||
|
||||
it("returns streaming TwiML for inbound calls", () => {
|
||||
const provider = createProvider();
|
||||
const ctx = createContext("CallStatus=ringing&Direction=inbound");
|
||||
|
||||
const result = provider.parseWebhookEvent(ctx);
|
||||
|
||||
expect(result.providerResponseBody).toContain(STREAM_URL);
|
||||
expect(result.providerResponseBody).toContain("<Connect>");
|
||||
});
|
||||
});
|
||||
@@ -0,0 +1,595 @@
|
||||
import crypto from "node:crypto";
|
||||
|
||||
import type { TwilioConfig } from "../config.js";
|
||||
import type { MediaStreamHandler } from "../media-stream.js";
|
||||
import type {
|
||||
HangupCallInput,
|
||||
InitiateCallInput,
|
||||
InitiateCallResult,
|
||||
NormalizedEvent,
|
||||
PlayTtsInput,
|
||||
ProviderWebhookParseResult,
|
||||
StartListeningInput,
|
||||
StopListeningInput,
|
||||
WebhookContext,
|
||||
WebhookVerificationResult,
|
||||
} from "../types.js";
|
||||
import { escapeXml, mapVoiceToPolly } from "../voice-mapping.js";
|
||||
import { chunkAudio } from "../telephony-audio.js";
|
||||
import type { TelephonyTtsProvider } from "../telephony-tts.js";
|
||||
import type { VoiceCallProvider } from "./base.js";
|
||||
import { twilioApiRequest } from "./twilio/api.js";
|
||||
import { verifyTwilioProviderWebhook } from "./twilio/webhook.js";
|
||||
|
||||
/**
|
||||
* Twilio Voice API provider implementation.
|
||||
*
|
||||
* Uses Twilio Programmable Voice API with Media Streams for real-time
|
||||
* bidirectional audio streaming.
|
||||
*
|
||||
* @see https://www.twilio.com/docs/voice
|
||||
* @see https://www.twilio.com/docs/voice/media-streams
|
||||
*/
|
||||
/** Behavior overrides for TwilioProvider (webhook verification + streaming). */
export interface TwilioProviderOptions {
  /** Allow ngrok free tier compatibility mode (loopback only, less secure) */
  allowNgrokFreeTierLoopbackBypass?: boolean;
  /** Override public URL for signature verification */
  publicUrl?: string;
  /** Path for media stream WebSocket (e.g., /voice/stream) */
  streamPath?: string;
  /** Skip webhook signature verification (development only) */
  skipVerification?: boolean;
}
|
||||
|
||||
export class TwilioProvider implements VoiceCallProvider {
|
||||
/** Provider discriminator used by the plugin's provider registry. */
readonly name = "twilio" as const;

/** Twilio Account SID (credential). */
private readonly accountSid: string;
/** Twilio Auth Token (credential; also signs/verifies webhooks). */
private readonly authToken: string;
/** REST base URL, scoped to this account. */
private readonly baseUrl: string;
/** Map of provider call SID -> webhook URL registered for that call. */
private readonly callWebhookUrls = new Map<string, string>();
/** Behavior overrides supplied at construction. */
private readonly options: TwilioProviderOptions;

/** Current public webhook URL (set when tunnel starts or from config) */
private currentPublicUrl: string | null = null;

/** Optional telephony TTS provider for streaming TTS */
private ttsProvider: TelephonyTtsProvider | null = null;

/** Optional media stream handler for sending audio */
private mediaStreamHandler: MediaStreamHandler | null = null;

/** Map of call SID to stream SID for media streams */
private callStreamMap = new Map<string, string>();

/** Storage for TwiML content (for notify mode with URL-based TwiML) */
private readonly twimlStorage = new Map<string, string>();
/** Track notify-mode calls to avoid streaming on follow-up callbacks */
private readonly notifyCalls = new Set<string>();
|
||||
/**
|
||||
* Delete stored TwiML for a given `callId`.
|
||||
*
|
||||
* We keep TwiML in-memory only long enough to satisfy the initial Twilio
|
||||
* webhook request (notify mode). Subsequent webhooks should not reuse it.
|
||||
*/
|
||||
private deleteStoredTwiml(callId: string): void {
|
||||
this.twimlStorage.delete(callId);
|
||||
this.notifyCalls.delete(callId);
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete stored TwiML for a call, addressed by Twilio's provider call SID.
|
||||
*
|
||||
* This is used when we only have `providerCallId` (e.g. hangup).
|
||||
*/
|
||||
private deleteStoredTwimlForProviderCall(providerCallId: string): void {
|
||||
const webhookUrl = this.callWebhookUrls.get(providerCallId);
|
||||
if (!webhookUrl) return;
|
||||
|
||||
const callIdMatch = webhookUrl.match(/callId=([^&]+)/);
|
||||
if (!callIdMatch) return;
|
||||
|
||||
this.deleteStoredTwiml(callIdMatch[1]);
|
||||
}
|
||||
|
||||
/**
 * @param config - Twilio credentials; `accountSid` and `authToken` required.
 * @param options - Webhook/stream behavior overrides.
 * @throws Error when a required credential is missing.
 */
constructor(config: TwilioConfig, options: TwilioProviderOptions = {}) {
  const { accountSid, authToken } = config;

  if (!accountSid) {
    throw new Error("Twilio Account SID is required");
  }
  if (!authToken) {
    throw new Error("Twilio Auth Token is required");
  }

  this.accountSid = accountSid;
  this.authToken = authToken;
  this.baseUrl = `https://api.twilio.com/2010-04-01/Accounts/${this.accountSid}`;
  this.options = options;
  // A configured public URL seeds verification/streaming until a tunnel URL arrives.
  this.currentPublicUrl = options.publicUrl || null;
}
|
||||
|
||||
/** Update the public webhook URL (e.g. when a tunnel comes up). */
setPublicUrl(url: string): void {
  this.currentPublicUrl = url;
}

/** Current public webhook URL, or null when none is known yet. */
getPublicUrl(): string | null {
  return this.currentPublicUrl;
}

/** Attach the telephony TTS provider used for streaming speech. */
setTTSProvider(provider: TelephonyTtsProvider): void {
  this.ttsProvider = provider;
}

/** Attach the media-stream handler used to push audio to Twilio. */
setMediaStreamHandler(handler: MediaStreamHandler): void {
  this.mediaStreamHandler = handler;
}

/** Associate a Twilio call SID with its media-stream SID. */
registerCallStream(callSid: string, streamSid: string): void {
  this.callStreamMap.set(callSid, streamSid);
}

/** Remove the call SID -> stream SID association (stream closed). */
unregisterCallStream(callSid: string): void {
  this.callStreamMap.delete(callSid);
}
|
||||
|
||||
/**
|
||||
* Clear TTS queue for a call (barge-in).
|
||||
* Used when user starts speaking to interrupt current TTS playback.
|
||||
*/
|
||||
clearTtsQueue(callSid: string): void {
|
||||
const streamSid = this.callStreamMap.get(callSid);
|
||||
if (streamSid && this.mediaStreamHandler) {
|
||||
this.mediaStreamHandler.clearTtsQueue(streamSid);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Make an authenticated request to the Twilio API.
|
||||
*/
|
||||
private async apiRequest<T = unknown>(
|
||||
endpoint: string,
|
||||
params: Record<string, string | string[]>,
|
||||
options?: { allowNotFound?: boolean },
|
||||
): Promise<T> {
|
||||
return await twilioApiRequest<T>({
|
||||
baseUrl: this.baseUrl,
|
||||
accountSid: this.accountSid,
|
||||
authToken: this.authToken,
|
||||
endpoint,
|
||||
body: params,
|
||||
allowNotFound: options?.allowNotFound,
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Verify Twilio webhook signature using HMAC-SHA1.
 *
 * Delegates to the shared verifier, which handles reverse proxy scenarios
 * (Tailscale, nginx, ngrok) by reconstructing the public URL from
 * forwarding headers.
 *
 * @see https://www.twilio.com/docs/usage/webhooks/webhooks-security
 */
verifyWebhook(ctx: WebhookContext): WebhookVerificationResult {
  const verificationInput = {
    ctx,
    authToken: this.authToken,
    currentPublicUrl: this.currentPublicUrl,
    options: this.options,
  };
  return verifyTwilioProviderWebhook(verificationInput);
}
|
||||
|
||||
/**
 * Parse Twilio webhook event into normalized format.
 * Always answers with TwiML (Twilio requires an XML response body).
 */
parseWebhookEvent(ctx: WebhookContext): ProviderWebhookParseResult {
  try {
    const form = new URLSearchParams(ctx.rawBody);

    // Our own callId rides along as a query parameter on the webhook URL.
    const rawCallId = ctx.query?.callId;
    const callIdFromQuery =
      typeof rawCallId === "string" && rawCallId.trim()
        ? rawCallId.trim()
        : undefined;

    const event = this.normalizeEvent(form, callIdFromQuery);

    // For Twilio, we must return TwiML. Most actions are driven by Calls API
    // updates, so the webhook response is typically a pause to keep the call alive.
    const twiml = this.generateTwimlResponse(ctx);

    return {
      events: event ? [event] : [],
      providerResponseBody: twiml,
      providerResponseHeaders: { "Content-Type": "application/xml" },
      statusCode: 200,
    };
  } catch {
    return { events: [], statusCode: 400 };
  }
}
|
||||
|
||||
/**
|
||||
* Parse Twilio direction to normalized format.
|
||||
*/
|
||||
private static parseDirection(
|
||||
direction: string | null,
|
||||
): "inbound" | "outbound" | undefined {
|
||||
if (direction === "inbound") return "inbound";
|
||||
if (direction === "outbound-api" || direction === "outbound-dial")
|
||||
return "outbound";
|
||||
return undefined;
|
||||
}
|
||||
|
||||
/**
 * Convert Twilio webhook params to normalized event format.
 *
 * @param params - Form-encoded webhook body from Twilio.
 * @param callIdOverride - Our callId taken from the webhook URL query; when
 *   absent the Twilio CallSid doubles as the callId.
 * @returns A normalized event, or null when the params carry nothing we track.
 */
private normalizeEvent(
  params: URLSearchParams,
  callIdOverride?: string,
): NormalizedEvent | null {
  const callSid = params.get("CallSid") || "";

  const baseEvent = {
    id: crypto.randomUUID(),
    callId: callIdOverride || callSid,
    providerCallId: callSid,
    timestamp: Date.now(),
    direction: TwilioProvider.parseDirection(params.get("Direction")),
    from: params.get("From") || undefined,
    to: params.get("To") || undefined,
  };

  // Handle speech result (from <Gather>)
  const speechResult = params.get("SpeechResult");
  if (speechResult) {
    return {
      ...baseEvent,
      type: "call.speech",
      transcript: speechResult,
      isFinal: true,
      // Missing Confidence defaults to 0.9 (treated as high confidence).
      confidence: parseFloat(params.get("Confidence") || "0.9"),
    };
  }

  // Handle DTMF
  const digits = params.get("Digits");
  if (digits) {
    return { ...baseEvent, type: "call.dtmf", digits };
  }

  // Handle call status changes
  const callStatus = params.get("CallStatus");
  switch (callStatus) {
    case "initiated":
      return { ...baseEvent, type: "call.initiated" };
    case "ringing":
      return { ...baseEvent, type: "call.ringing" };
    case "in-progress":
      return { ...baseEvent, type: "call.answered" };
    case "completed":
    case "busy":
    case "no-answer":
    case "failed":
      // Terminal state: drop any notify-mode TwiML still stored for this call.
      if (callIdOverride) {
        this.deleteStoredTwiml(callIdOverride);
      }
      return { ...baseEvent, type: "call.ended", reason: callStatus };
    case "canceled":
      // Canceled before answer is reported as a bot-initiated hangup.
      if (callIdOverride) {
        this.deleteStoredTwiml(callIdOverride);
      }
      return { ...baseEvent, type: "call.ended", reason: "hangup-bot" };
    default:
      return null;
  }
}
|
||||
|
||||
/** Minimal valid TwiML; returned when no call action should be taken. */
private static readonly EMPTY_TWIML =
  '<?xml version="1.0" encoding="UTF-8"?><Response></Response>';

/** Keep-alive TwiML: holds the call open 30s while API-driven actions run. */
private static readonly PAUSE_TWIML = `<?xml version="1.0" encoding="UTF-8"?>
<Response>
  <Pause length="30"/>
</Response>`;
|
||||
|
||||
/**
 * Generate TwiML response for webhook.
 * When a call is answered, connects to media stream for bidirectional audio.
 *
 * Decision order:
 *  1. Stored notify-mode TwiML for this callId -> serve once, then delete.
 *  2. Known notify-mode call -> empty TwiML (never stream).
 *  3. Outbound conversation call -> streaming TwiML immediately.
 *  4. Status callback -> empty TwiML.
 *  5. Inbound call -> streaming TwiML.
 *  6. Outbound but not yet in-progress -> empty; otherwise streaming TwiML.
 */
private generateTwimlResponse(ctx?: WebhookContext): string {
  if (!ctx) return TwilioProvider.EMPTY_TWIML;

  const params = new URLSearchParams(ctx.rawBody);
  const type =
    typeof ctx.query?.type === "string" ? ctx.query.type.trim() : undefined;
  const isStatusCallback = type === "status";
  const callStatus = params.get("CallStatus");
  const direction = params.get("Direction");
  const isOutbound = direction?.startsWith("outbound") ?? false;
  const callIdFromQuery =
    typeof ctx.query?.callId === "string" && ctx.query.callId.trim()
      ? ctx.query.callId.trim()
      : undefined;

  // Avoid logging webhook params/TwiML (may contain PII).

  // Handle initial TwiML request (when Twilio first initiates the call)
  // Check if we have stored TwiML for this call (notify mode)
  if (callIdFromQuery && !isStatusCallback) {
    const storedTwiml = this.twimlStorage.get(callIdFromQuery);
    if (storedTwiml) {
      // Clean up after serving (one-time use)
      this.deleteStoredTwiml(callIdFromQuery);
      return storedTwiml;
    }
    // Notify-mode calls must never fall through to the streaming branches.
    if (this.notifyCalls.has(callIdFromQuery)) {
      return TwilioProvider.EMPTY_TWIML;
    }

    // Conversation mode: return streaming TwiML immediately for outbound calls.
    if (isOutbound) {
      const streamUrl = this.getStreamUrl();
      return streamUrl
        ? this.getStreamConnectXml(streamUrl)
        : TwilioProvider.PAUSE_TWIML;
    }
  }

  // Status callbacks should not receive TwiML.
  if (isStatusCallback) {
    return TwilioProvider.EMPTY_TWIML;
  }

  // Handle subsequent webhook requests (status callbacks, etc.)
  // For inbound calls, answer immediately with stream
  if (direction === "inbound") {
    const streamUrl = this.getStreamUrl();
    return streamUrl
      ? this.getStreamConnectXml(streamUrl)
      : TwilioProvider.PAUSE_TWIML;
  }

  // For outbound calls, only connect to stream when call is in-progress
  if (callStatus !== "in-progress") {
    return TwilioProvider.EMPTY_TWIML;
  }

  const streamUrl = this.getStreamUrl();
  return streamUrl
    ? this.getStreamConnectXml(streamUrl)
    : TwilioProvider.PAUSE_TWIML;
}
|
||||
|
||||
/**
|
||||
* Get the WebSocket URL for media streaming.
|
||||
* Derives from the public URL origin + stream path.
|
||||
*/
|
||||
private getStreamUrl(): string | null {
|
||||
if (!this.currentPublicUrl || !this.options.streamPath) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// Extract just the origin (host) from the public URL, ignoring any path
|
||||
const url = new URL(this.currentPublicUrl);
|
||||
const origin = url.origin;
|
||||
|
||||
// Convert https:// to wss:// for WebSocket
|
||||
const wsOrigin = origin
|
||||
.replace(/^https:\/\//, "wss://")
|
||||
.replace(/^http:\/\//, "ws://");
|
||||
|
||||
// Append the stream path
|
||||
const path = this.options.streamPath.startsWith("/")
|
||||
? this.options.streamPath
|
||||
: `/${this.options.streamPath}`;
|
||||
|
||||
return `${wsOrigin}${path}`;
|
||||
}
|
||||
|
||||
/**
 * Generate TwiML to connect a call to a WebSocket media stream.
 * This enables bidirectional audio streaming for real-time STT/TTS.
 *
 * The URL is passed through escapeXml before interpolation so query
 * strings containing `&` or quotes cannot break the XML document.
 *
 * @param streamUrl - WebSocket URL (wss://...) for the media stream
 * @returns A complete TwiML <Response> document with <Connect><Stream>.
 */
getStreamConnectXml(streamUrl: string): string {
  return `<?xml version="1.0" encoding="UTF-8"?>
<Response>
<Connect>
<Stream url="${escapeXml(streamUrl)}" />
</Connect>
</Response>`;
}
|
||||
|
||||
/**
|
||||
* Initiate an outbound call via Twilio API.
|
||||
* If inlineTwiml is provided, uses that directly (for notify mode).
|
||||
* Otherwise, uses webhook URL for dynamic TwiML.
|
||||
*/
|
||||
async initiateCall(input: InitiateCallInput): Promise<InitiateCallResult> {
|
||||
const url = new URL(input.webhookUrl);
|
||||
url.searchParams.set("callId", input.callId);
|
||||
|
||||
// Create separate URL for status callbacks (required by Twilio)
|
||||
const statusUrl = new URL(input.webhookUrl);
|
||||
statusUrl.searchParams.set("callId", input.callId);
|
||||
statusUrl.searchParams.set("type", "status"); // Differentiate from TwiML requests
|
||||
|
||||
// Store TwiML content if provided (for notify mode)
|
||||
// We now serve it from the webhook endpoint instead of sending inline
|
||||
if (input.inlineTwiml) {
|
||||
this.twimlStorage.set(input.callId, input.inlineTwiml);
|
||||
this.notifyCalls.add(input.callId);
|
||||
}
|
||||
|
||||
// Build request params - always use URL-based TwiML.
|
||||
// Twilio silently ignores `StatusCallback` when using the inline `Twiml` parameter.
|
||||
const params: Record<string, string | string[]> = {
|
||||
To: input.to,
|
||||
From: input.from,
|
||||
Url: url.toString(), // TwiML serving endpoint
|
||||
StatusCallback: statusUrl.toString(), // Separate status callback endpoint
|
||||
StatusCallbackEvent: ["initiated", "ringing", "answered", "completed"],
|
||||
Timeout: "30",
|
||||
};
|
||||
|
||||
const result = await this.apiRequest<TwilioCallResponse>(
|
||||
"/Calls.json",
|
||||
params,
|
||||
);
|
||||
|
||||
this.callWebhookUrls.set(result.sid, url.toString());
|
||||
|
||||
return {
|
||||
providerCallId: result.sid,
|
||||
status: result.status === "queued" ? "queued" : "initiated",
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Hang up a call via Twilio API.
|
||||
*/
|
||||
async hangupCall(input: HangupCallInput): Promise<void> {
|
||||
this.deleteStoredTwimlForProviderCall(input.providerCallId);
|
||||
|
||||
this.callWebhookUrls.delete(input.providerCallId);
|
||||
|
||||
await this.apiRequest(
|
||||
`/Calls/${input.providerCallId}.json`,
|
||||
{ Status: "completed" },
|
||||
{ allowNotFound: true },
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Play TTS audio via Twilio.
 *
 * Two modes:
 * 1. Core TTS + Media Streams: If TTS provider and media stream are available,
 *    generates audio via core TTS and streams it through WebSocket (preferred).
 * 2. TwiML <Say>: Falls back to Twilio's native TTS with Polly voices.
 *    Note: This may not work on all Twilio accounts.
 *
 * @param input - Provider call id plus the text/voice/locale to speak.
 * @throws Error when falling back to <Say> but no webhook URL was recorded
 *         for this call (provider state was never initialized).
 */
async playTts(input: PlayTtsInput): Promise<void> {
  // Try telephony TTS via media stream first (if configured)
  const streamSid = this.callStreamMap.get(input.providerCallId);
  if (this.ttsProvider && this.mediaStreamHandler && streamSid) {
    try {
      await this.playTtsViaStream(input.text, streamSid);
      return;
    } catch (err) {
      console.warn(
        `[voice-call] Telephony TTS failed, falling back to Twilio <Say>:`,
        err instanceof Error ? err.message : err,
      );
      // Fall through to TwiML <Say> fallback
    }
  }

  // Fall back to TwiML <Say> (may not work on all accounts)
  const webhookUrl = this.callWebhookUrls.get(input.providerCallId);
  if (!webhookUrl) {
    throw new Error(
      "Missing webhook URL for this call (provider state not initialized)",
    );
  }

  console.warn(
    "[voice-call] Using TwiML <Say> fallback - telephony TTS not configured or media stream not active",
  );

  // <Say> speaks the text; the trailing <Gather> listens for caller speech
  // and POSTs it back to our webhook (per <Gather>'s `action` attribute).
  const pollyVoice = mapVoiceToPolly(input.voice);
  const twiml = `<?xml version="1.0" encoding="UTF-8"?>
<Response>
<Say voice="${pollyVoice}" language="${input.locale || "en-US"}">${escapeXml(input.text)}</Say>
<Gather input="speech" speechTimeout="auto" action="${escapeXml(webhookUrl)}" method="POST">
<Say>.</Say>
</Gather>
</Response>`;

  // Push the new TwiML onto the live call.
  await this.apiRequest(`/Calls/${input.providerCallId}.json`, {
    Twiml: twiml,
  });
}
|
||||
|
||||
/**
|
||||
* Play TTS via core TTS and Twilio Media Streams.
|
||||
* Generates audio with core TTS, converts to mu-law, and streams via WebSocket.
|
||||
* Uses a queue to serialize playback and prevent overlapping audio.
|
||||
*/
|
||||
private async playTtsViaStream(
|
||||
text: string,
|
||||
streamSid: string,
|
||||
): Promise<void> {
|
||||
if (!this.ttsProvider || !this.mediaStreamHandler) {
|
||||
throw new Error("TTS provider and media stream handler required");
|
||||
}
|
||||
|
||||
// Stream audio in 20ms chunks (160 bytes at 8kHz mu-law)
|
||||
const CHUNK_SIZE = 160;
|
||||
const CHUNK_DELAY_MS = 20;
|
||||
|
||||
const handler = this.mediaStreamHandler;
|
||||
const ttsProvider = this.ttsProvider;
|
||||
await handler.queueTts(streamSid, async (signal) => {
|
||||
// Generate audio with core TTS (returns mu-law at 8kHz)
|
||||
const muLawAudio = await ttsProvider.synthesizeForTelephony(text);
|
||||
for (const chunk of chunkAudio(muLawAudio, CHUNK_SIZE)) {
|
||||
if (signal.aborted) break;
|
||||
handler.sendAudio(streamSid, chunk);
|
||||
|
||||
// Pace the audio to match real-time playback
|
||||
await new Promise((resolve) => setTimeout(resolve, CHUNK_DELAY_MS));
|
||||
if (signal.aborted) break;
|
||||
}
|
||||
|
||||
if (!signal.aborted) {
|
||||
// Send a mark to track when audio finishes
|
||||
handler.sendMark(streamSid, `tts-${Date.now()}`);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Start listening for speech via Twilio <Gather>.
 *
 * Pushes a <Gather input="speech"> TwiML document onto the live call;
 * Twilio POSTs the recognized speech to this call's webhook URL.
 *
 * @param input - Provider call id plus an optional BCP-47 language tag
 *                (defaults to "en-US").
 * @throws Error when no webhook URL was recorded for this call.
 */
async startListening(input: StartListeningInput): Promise<void> {
  const webhookUrl = this.callWebhookUrls.get(input.providerCallId);
  if (!webhookUrl) {
    throw new Error(
      "Missing webhook URL for this call (provider state not initialized)",
    );
  }

  const twiml = `<?xml version="1.0" encoding="UTF-8"?>
<Response>
<Gather input="speech" speechTimeout="auto" language="${input.language || "en-US"}" action="${escapeXml(webhookUrl)}" method="POST">
</Gather>
</Response>`;

  // Update the in-progress call with the listening TwiML.
  await this.apiRequest(`/Calls/${input.providerCallId}.json`, {
    Twiml: twiml,
  });
}
|
||||
|
||||
/**
 * Stop listening - for Twilio this is a no-op as <Gather> auto-ends.
 *
 * @param _input - Unused; present to satisfy the provider interface.
 */
async stopListening(_input: StopListeningInput): Promise<void> {
  // Twilio's <Gather> automatically stops on speech end
  // No explicit action needed
}
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// Twilio-specific types
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
/**
 * Shape of Twilio's Call resource as consumed here. Only `sid` and
 * `status` are read by this provider; the remaining fields mirror the
 * API response for completeness.
 */
interface TwilioCallResponse {
  /** Call SID — Twilio's unique identifier, used as the provider call id. */
  sid: string;
  /** Call lifecycle status (e.g. "queued"). */
  status: string;
  direction: string;
  from: string;
  to: string;
  uri: string;
}
|
||||
@@ -0,0 +1,45 @@
|
||||
export async function twilioApiRequest<T = unknown>(params: {
|
||||
baseUrl: string;
|
||||
accountSid: string;
|
||||
authToken: string;
|
||||
endpoint: string;
|
||||
body: URLSearchParams | Record<string, string | string[]>;
|
||||
allowNotFound?: boolean;
|
||||
}): Promise<T> {
|
||||
const bodyParams =
|
||||
params.body instanceof URLSearchParams
|
||||
? params.body
|
||||
: Object.entries(params.body).reduce<URLSearchParams>(
|
||||
(acc, [key, value]) => {
|
||||
if (Array.isArray(value)) {
|
||||
for (const entry of value) {
|
||||
acc.append(key, entry);
|
||||
}
|
||||
} else if (typeof value === "string") {
|
||||
acc.append(key, value);
|
||||
}
|
||||
return acc;
|
||||
},
|
||||
new URLSearchParams(),
|
||||
);
|
||||
|
||||
const response = await fetch(`${params.baseUrl}${params.endpoint}`, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
Authorization: `Basic ${Buffer.from(`${params.accountSid}:${params.authToken}`).toString("base64")}`,
|
||||
"Content-Type": "application/x-www-form-urlencoded",
|
||||
},
|
||||
body: bodyParams,
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
if (params.allowNotFound && response.status === 404) {
|
||||
return undefined as T;
|
||||
}
|
||||
const errorText = await response.text();
|
||||
throw new Error(`Twilio API error: ${response.status} ${errorText}`);
|
||||
}
|
||||
|
||||
const text = await response.text();
|
||||
return text ? (JSON.parse(text) as T) : (undefined as T);
|
||||
}
|
||||
@@ -0,0 +1,30 @@
|
||||
import type { WebhookContext, WebhookVerificationResult } from "../../types.js";
|
||||
import { verifyTwilioWebhook } from "../../webhook-security.js";
|
||||
|
||||
import type { TwilioProviderOptions } from "../twilio.js";
|
||||
|
||||
export function verifyTwilioProviderWebhook(params: {
|
||||
ctx: WebhookContext;
|
||||
authToken: string;
|
||||
currentPublicUrl?: string | null;
|
||||
options: TwilioProviderOptions;
|
||||
}): WebhookVerificationResult {
|
||||
const result = verifyTwilioWebhook(params.ctx, params.authToken, {
|
||||
publicUrl: params.currentPublicUrl || undefined,
|
||||
allowNgrokFreeTierLoopbackBypass:
|
||||
params.options.allowNgrokFreeTierLoopbackBypass ?? false,
|
||||
skipVerification: params.options.skipVerification,
|
||||
});
|
||||
|
||||
if (!result.ok) {
|
||||
console.warn(`[twilio] Webhook verification failed: ${result.reason}`);
|
||||
if (result.verificationUrl) {
|
||||
console.warn(`[twilio] Verification URL: ${result.verificationUrl}`);
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
ok: result.ok,
|
||||
reason: result.reason,
|
||||
};
|
||||
}
|
||||
@@ -0,0 +1,171 @@
|
||||
/**
|
||||
* Voice call response generator - uses the embedded Pi agent for tool support.
|
||||
* Routes voice responses through the same agent infrastructure as messaging.
|
||||
*/
|
||||
|
||||
import crypto from "node:crypto";
|
||||
|
||||
import { loadCoreAgentDeps, type CoreConfig } from "./core-bridge.js";
|
||||
|
||||
import type { VoiceCallConfig } from "./config.js";
|
||||
|
||||
/** Inputs required to generate one voice reply during an active call. */
export type VoiceResponseParams = {
  /** Voice call config */
  voiceConfig: VoiceCallConfig;
  /** Core Moltbot config */
  coreConfig: CoreConfig;
  /** Call ID for session tracking */
  callId: string;
  /** Caller's phone number */
  from: string;
  /** Conversation transcript */
  transcript: Array<{ speaker: "user" | "bot"; text: string }>;
  /** Latest user message */
  userMessage: string;
};
|
||||
|
||||
/** Outcome of a voice response generation attempt. */
export type VoiceResponseResult = {
  /** Generated reply text, or null when nothing usable was produced. */
  text: string | null;
  /** Failure description, present when generation failed or was aborted. */
  error?: string;
};
|
||||
|
||||
/** Session-store record keyed by `voice:<normalized phone number>`. */
type SessionEntry = {
  /** Stable session UUID reused across calls from the same caller. */
  sessionId: string;
  /** Epoch millis recorded when the entry was created (not refreshed here). */
  updatedAt: number;
};
|
||||
|
||||
/**
 * Generate a voice response using the embedded Pi agent with full tool support.
 * Uses the same agent infrastructure as messaging for consistent behavior.
 *
 * Sessions are keyed by the caller's digits-only phone number so the same
 * caller resumes the same agent session across calls. Errors never throw:
 * they are reported via the `error` field of the result.
 *
 * @param params - Configs, call id, caller number, transcript, and message.
 * @returns The agent's reply text, or null plus an error description.
 */
export async function generateVoiceResponse(
  params: VoiceResponseParams,
): Promise<VoiceResponseResult> {
  const { voiceConfig, callId, from, transcript, userMessage, coreConfig } =
    params;

  if (!coreConfig) {
    return { text: null, error: "Core config unavailable for voice response" };
  }

  // Lazily load the core agent machinery; failure is reported, not thrown.
  let deps: Awaited<ReturnType<typeof loadCoreAgentDeps>>;
  try {
    deps = await loadCoreAgentDeps();
  } catch (err) {
    return {
      text: null,
      error:
        err instanceof Error
          ? err.message
          : "Unable to load core agent dependencies",
    };
  }
  const cfg = coreConfig;

  // Build voice-specific session key based on phone number
  const normalizedPhone = from.replace(/\D/g, "");
  const sessionKey = `voice:${normalizedPhone}`;
  const agentId = "main";

  // Resolve paths
  const storePath = deps.resolveStorePath(cfg.session?.store, { agentId });
  const agentDir = deps.resolveAgentDir(cfg, agentId);
  const workspaceDir = deps.resolveAgentWorkspaceDir(cfg, agentId);

  // Ensure workspace exists
  await deps.ensureAgentWorkspace({ dir: workspaceDir });

  // Load or create session entry
  const sessionStore = deps.loadSessionStore(storePath);
  const now = Date.now();
  let sessionEntry = sessionStore[sessionKey] as SessionEntry | undefined;

  if (!sessionEntry) {
    // First call from this number: mint a session and persist it.
    sessionEntry = {
      sessionId: crypto.randomUUID(),
      updatedAt: now,
    };
    sessionStore[sessionKey] = sessionEntry;
    await deps.saveSessionStore(storePath, sessionStore);
  }

  const sessionId = sessionEntry.sessionId;
  const sessionFile = deps.resolveSessionFilePath(sessionId, sessionEntry, {
    agentId,
  });

  // Resolve model from config; "provider/model" splits on the first slash,
  // a bare model name falls back to the default provider.
  const modelRef =
    voiceConfig.responseModel ||
    `${deps.DEFAULT_PROVIDER}/${deps.DEFAULT_MODEL}`;
  const slashIndex = modelRef.indexOf("/");
  const provider =
    slashIndex === -1 ? deps.DEFAULT_PROVIDER : modelRef.slice(0, slashIndex);
  const model = slashIndex === -1 ? modelRef : modelRef.slice(slashIndex + 1);

  // Resolve thinking level
  const thinkLevel = deps.resolveThinkingDefault({ cfg, provider, model });

  // Resolve agent identity for personalized prompt
  const identity = deps.resolveAgentIdentity(cfg, agentId);
  const agentName = identity?.name?.trim() || "assistant";

  // Build system prompt with conversation history
  const basePrompt =
    voiceConfig.responseSystemPrompt ??
    `You are ${agentName}, a helpful voice assistant on a phone call. Keep responses brief and conversational (1-2 sentences max). Be natural and friendly. The caller's phone number is ${from}. You have access to tools - use them when helpful.`;

  let extraSystemPrompt = basePrompt;
  if (transcript.length > 0) {
    // Inline the transcript so the agent sees the call so far.
    const history = transcript
      .map(
        (entry) =>
          `${entry.speaker === "bot" ? "You" : "Caller"}: ${entry.text}`,
      )
      .join("\n");
    extraSystemPrompt = `${basePrompt}\n\nConversation so far:\n${history}`;
  }

  // Resolve timeout
  const timeoutMs =
    voiceConfig.responseTimeoutMs ?? deps.resolveAgentTimeoutMs({ cfg });
  const runId = `voice:${callId}:${Date.now()}`;

  try {
    const result = await deps.runEmbeddedPiAgent({
      sessionId,
      sessionKey,
      messageProvider: "voice",
      sessionFile,
      workspaceDir,
      config: cfg,
      prompt: userMessage,
      provider,
      model,
      thinkLevel,
      verboseLevel: "off",
      timeoutMs,
      runId,
      lane: "voice",
      extraSystemPrompt,
      agentDir,
    });

    // Extract text from payloads, dropping error payloads and empty text.
    const texts = (result.payloads ?? [])
      .filter((p) => p.text && !p.isError)
      .map((p) => p.text?.trim())
      .filter(Boolean);

    const text = texts.join(" ") || null;

    if (!text && result.meta.aborted) {
      return { text: null, error: "Response generation was aborted" };
    }

    return { text };
  } catch (err) {
    console.error(`[voice-call] Response generation failed:`, err);
    return { text: null, error: String(err) };
  }
}
|
||||
217
docker-compose/ez-assistant/extensions/voice-call/src/runtime.ts
Normal file
217
docker-compose/ez-assistant/extensions/voice-call/src/runtime.ts
Normal file
@@ -0,0 +1,217 @@
|
||||
import type { CoreConfig } from "./core-bridge.js";
|
||||
import type { VoiceCallConfig } from "./config.js";
|
||||
import { resolveVoiceCallConfig, validateProviderConfig } from "./config.js";
|
||||
import { CallManager } from "./manager.js";
|
||||
import type { VoiceCallProvider } from "./providers/base.js";
|
||||
import { MockProvider } from "./providers/mock.js";
|
||||
import { PlivoProvider } from "./providers/plivo.js";
|
||||
import { TelnyxProvider } from "./providers/telnyx.js";
|
||||
import { TwilioProvider } from "./providers/twilio.js";
|
||||
import type { TelephonyTtsRuntime } from "./telephony-tts.js";
|
||||
import { createTelephonyTtsProvider } from "./telephony-tts.js";
|
||||
import { startTunnel, type TunnelResult } from "./tunnel.js";
|
||||
import {
|
||||
cleanupTailscaleExposure,
|
||||
setupTailscaleExposure,
|
||||
VoiceCallWebhookServer,
|
||||
} from "./webhook.js";
|
||||
|
||||
/** Live handles for a started voice-call runtime. */
export type VoiceCallRuntime = {
  /** Fully resolved plugin config. */
  config: VoiceCallConfig;
  /** Telephony provider instance selected by config.provider. */
  provider: VoiceCallProvider;
  manager: CallManager;
  webhookServer: VoiceCallWebhookServer;
  /** URL the provider delivers webhooks to (public if available, else local). */
  webhookUrl: string;
  /** Externally reachable URL, when configured or tunneled; otherwise null. */
  publicUrl: string | null;
  /** Tears down the tunnel, tailscale exposure, and webhook server. */
  stop: () => Promise<void>;
};
|
||||
|
||||
/** Minimal leveled logger; the console methods are used when omitted. */
type Logger = {
  info: (message: string) => void;
  warn: (message: string) => void;
  error: (message: string) => void;
  debug: (message: string) => void;
};
|
||||
|
||||
function isLoopbackBind(bind: string | undefined): boolean {
|
||||
if (!bind) return false;
|
||||
return bind === "127.0.0.1" || bind === "::1" || bind === "localhost";
|
||||
}
|
||||
|
||||
/**
 * Construct the telephony provider instance selected by `config.provider`.
 *
 * The ngrok free-tier signature bypass is only granted when ALL of: the
 * tunnel provider is ngrok, the server binds to loopback, and one of the
 * bypass flags (current or legacy name) is set.
 *
 * @throws Error for an unrecognized provider value.
 */
function resolveProvider(config: VoiceCallConfig): VoiceCallProvider {
  // Gate the bypass to loopback binds behind an ngrok tunnel; the legacy
  // `allowNgrokFreeTier` flag is still honored for backward compatibility.
  const allowNgrokFreeTierLoopbackBypass =
    config.tunnel?.provider === "ngrok" &&
    isLoopbackBind(config.serve?.bind) &&
    (config.tunnel?.allowNgrokFreeTierLoopbackBypass ||
      config.tunnel?.allowNgrokFreeTier ||
      false);

  switch (config.provider) {
    case "telnyx":
      return new TelnyxProvider({
        apiKey: config.telnyx?.apiKey,
        connectionId: config.telnyx?.connectionId,
        publicKey: config.telnyx?.publicKey,
      });
    case "twilio":
      return new TwilioProvider(
        {
          accountSid: config.twilio?.accountSid,
          authToken: config.twilio?.authToken,
        },
        {
          allowNgrokFreeTierLoopbackBypass,
          publicUrl: config.publicUrl,
          skipVerification: config.skipSignatureVerification,
          // Stream path is only forwarded when streaming is enabled.
          streamPath: config.streaming?.enabled
            ? config.streaming.streamPath
            : undefined,
        },
      );
    case "plivo":
      return new PlivoProvider(
        {
          authId: config.plivo?.authId,
          authToken: config.plivo?.authToken,
        },
        {
          publicUrl: config.publicUrl,
          skipVerification: config.skipSignatureVerification,
          // Plivo takes the ring timeout in whole seconds (minimum 1).
          ringTimeoutSec: Math.max(1, Math.floor(config.ringTimeoutMs / 1000)),
        },
      );
    case "mock":
      // In-process fake for local development and tests.
      return new MockProvider();
    default:
      throw new Error(
        `Unsupported voice-call provider: ${String(config.provider)}`,
      );
  }
}
|
||||
|
||||
/**
 * Start the voice-call runtime: resolve and validate config, create the
 * provider and call manager, start the webhook server, expose it publicly
 * (priority: config.publicUrl > tunnel > legacy tailscale), and wire
 * optional Twilio media-stream TTS.
 *
 * @returns Handles to the running pieces plus a `stop()` that tears down
 *          the tunnel, tailscale exposure, and webhook server.
 * @throws Error when the plugin is disabled or provider config is invalid.
 */
export async function createVoiceCallRuntime(params: {
  config: VoiceCallConfig;
  coreConfig: CoreConfig;
  ttsRuntime?: TelephonyTtsRuntime;
  logger?: Logger;
}): Promise<VoiceCallRuntime> {
  const { config: rawConfig, coreConfig, ttsRuntime, logger } = params;
  // Default to console-backed logging when no logger is injected.
  const log = logger ?? {
    info: console.log,
    warn: console.warn,
    error: console.error,
    debug: console.debug,
  };

  const config = resolveVoiceCallConfig(rawConfig);

  if (!config.enabled) {
    throw new Error(
      "Voice call disabled. Enable the plugin entry in config.",
    );
  }

  const validation = validateProviderConfig(config);
  if (!validation.valid) {
    throw new Error(`Invalid voice-call config: ${validation.errors.join("; ")}`);
  }

  const provider = resolveProvider(config);
  const manager = new CallManager(config);
  const webhookServer = new VoiceCallWebhookServer(
    config,
    manager,
    provider,
    coreConfig,
  );

  const localUrl = await webhookServer.start();

  // Determine public URL - priority: config.publicUrl > tunnel > legacy tailscale
  let publicUrl: string | null = config.publicUrl ?? null;
  let tunnelResult: TunnelResult | null = null;

  if (!publicUrl && config.tunnel?.provider && config.tunnel.provider !== "none") {
    try {
      tunnelResult = await startTunnel({
        provider: config.tunnel.provider,
        port: config.serve.port,
        path: config.serve.path,
        ngrokAuthToken: config.tunnel.ngrokAuthToken,
        ngrokDomain: config.tunnel.ngrokDomain,
      });
      publicUrl = tunnelResult?.publicUrl ?? null;
    } catch (err) {
      // Tunnel failure is non-fatal; the runtime falls back to other URLs.
      log.error(
        `[voice-call] Tunnel setup failed: ${
          err instanceof Error ? err.message : String(err)
        }`,
      );
    }
  }

  if (!publicUrl && config.tailscale?.mode !== "off") {
    publicUrl = await setupTailscaleExposure(config);
  }

  // Providers receive the public URL when one exists, else the local one.
  const webhookUrl = publicUrl ?? localUrl;

  if (publicUrl && provider.name === "twilio") {
    (provider as TwilioProvider).setPublicUrl(publicUrl);
  }

  // Twilio-only: wire streaming TTS and the media stream handler.
  if (provider.name === "twilio" && config.streaming?.enabled) {
    const twilioProvider = provider as TwilioProvider;
    if (ttsRuntime?.textToSpeechTelephony) {
      try {
        const ttsProvider = createTelephonyTtsProvider({
          coreConfig,
          ttsOverride: config.tts,
          runtime: ttsRuntime,
        });
        twilioProvider.setTTSProvider(ttsProvider);
        log.info("[voice-call] Telephony TTS provider configured");
      } catch (err) {
        // TTS init failure degrades to Twilio's own <Say> fallback.
        log.warn(
          `[voice-call] Failed to initialize telephony TTS: ${
            err instanceof Error ? err.message : String(err)
          }`,
        );
      }
    } else {
      log.warn("[voice-call] Telephony TTS unavailable; streaming TTS disabled");
    }

    const mediaHandler = webhookServer.getMediaStreamHandler();
    if (mediaHandler) {
      twilioProvider.setMediaStreamHandler(mediaHandler);
      log.info("[voice-call] Media stream handler wired to provider");
    }
  }

  manager.initialize(provider, webhookUrl);

  // Teardown in reverse order of setup: tunnel, tailscale, server.
  const stop = async () => {
    if (tunnelResult) {
      await tunnelResult.stop();
    }
    await cleanupTailscaleExposure(config);
    await webhookServer.stop();
  };

  log.info("[voice-call] Runtime initialized");
  log.info(`[voice-call] Webhook URL: ${webhookUrl}`);
  if (publicUrl) {
    log.info(`[voice-call] Public URL: ${publicUrl}`);
  }

  return {
    config,
    provider,
    manager,
    webhookServer,
    webhookUrl,
    publicUrl,
    stop,
  };
}
|
||||
@@ -0,0 +1,88 @@
|
||||
const TELEPHONY_SAMPLE_RATE = 8000;
|
||||
|
||||
function clamp16(value: number): number {
|
||||
return Math.max(-32768, Math.min(32767, value));
|
||||
}
|
||||
|
||||
/**
|
||||
* Resample 16-bit PCM (little-endian mono) to 8kHz using linear interpolation.
|
||||
*/
|
||||
export function resamplePcmTo8k(input: Buffer, inputSampleRate: number): Buffer {
|
||||
if (inputSampleRate === TELEPHONY_SAMPLE_RATE) return input;
|
||||
const inputSamples = Math.floor(input.length / 2);
|
||||
if (inputSamples === 0) return Buffer.alloc(0);
|
||||
|
||||
const ratio = inputSampleRate / TELEPHONY_SAMPLE_RATE;
|
||||
const outputSamples = Math.floor(inputSamples / ratio);
|
||||
const output = Buffer.alloc(outputSamples * 2);
|
||||
|
||||
for (let i = 0; i < outputSamples; i++) {
|
||||
const srcPos = i * ratio;
|
||||
const srcIndex = Math.floor(srcPos);
|
||||
const frac = srcPos - srcIndex;
|
||||
|
||||
const s0 = input.readInt16LE(srcIndex * 2);
|
||||
const s1Index = Math.min(srcIndex + 1, inputSamples - 1);
|
||||
const s1 = input.readInt16LE(s1Index * 2);
|
||||
|
||||
const sample = Math.round(s0 + frac * (s1 - s0));
|
||||
output.writeInt16LE(clamp16(sample), i * 2);
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert 16-bit PCM to 8-bit mu-law (G.711).
|
||||
*/
|
||||
export function pcmToMulaw(pcm: Buffer): Buffer {
|
||||
const samples = Math.floor(pcm.length / 2);
|
||||
const mulaw = Buffer.alloc(samples);
|
||||
|
||||
for (let i = 0; i < samples; i++) {
|
||||
const sample = pcm.readInt16LE(i * 2);
|
||||
mulaw[i] = linearToMulaw(sample);
|
||||
}
|
||||
|
||||
return mulaw;
|
||||
}
|
||||
|
||||
export function convertPcmToMulaw8k(
|
||||
pcm: Buffer,
|
||||
inputSampleRate: number,
|
||||
): Buffer {
|
||||
const pcm8k = resamplePcmTo8k(pcm, inputSampleRate);
|
||||
return pcmToMulaw(pcm8k);
|
||||
}
|
||||
|
||||
/**
|
||||
* Chunk audio buffer into 20ms frames for streaming (8kHz mono mu-law).
|
||||
*/
|
||||
export function chunkAudio(
|
||||
audio: Buffer,
|
||||
chunkSize = 160,
|
||||
): Generator<Buffer, void, unknown> {
|
||||
return (function* () {
|
||||
for (let i = 0; i < audio.length; i += chunkSize) {
|
||||
yield audio.subarray(i, Math.min(i + chunkSize, audio.length));
|
||||
}
|
||||
})();
|
||||
}
|
||||
|
||||
function linearToMulaw(sample: number): number {
|
||||
const BIAS = 132;
|
||||
const CLIP = 32635;
|
||||
|
||||
const sign = sample < 0 ? 0x80 : 0;
|
||||
if (sample < 0) sample = -sample;
|
||||
if (sample > CLIP) sample = CLIP;
|
||||
|
||||
sample += BIAS;
|
||||
let exponent = 7;
|
||||
for (let expMask = 0x4000; (sample & expMask) === 0 && exponent > 0; exponent--) {
|
||||
expMask >>= 1;
|
||||
}
|
||||
|
||||
const mantissa = (sample >> (exponent + 3)) & 0x0f;
|
||||
return ~(sign | (exponent << 4) | mantissa) & 0xff;
|
||||
}
|
||||
@@ -0,0 +1,95 @@
|
||||
import type { CoreConfig } from "./core-bridge.js";
|
||||
import type { VoiceCallTtsConfig } from "./config.js";
|
||||
import { convertPcmToMulaw8k } from "./telephony-audio.js";
|
||||
|
||||
/**
 * Slice of the core runtime used for telephony TTS. The returned
 * `audioBuffer` is consumed as 16-bit PCM at `sampleRate` Hz (it is fed
 * to convertPcmToMulaw8k by the provider built around this runtime).
 */
export type TelephonyTtsRuntime = {
  textToSpeechTelephony: (params: {
    text: string;
    cfg: CoreConfig;
    prefsPath?: string;
  }) => Promise<{
    /** True when synthesis succeeded and audioBuffer/sampleRate are set. */
    success: boolean;
    /** Raw PCM audio on success. */
    audioBuffer?: Buffer;
    /** Sample rate of audioBuffer in Hz. */
    sampleRate?: number;
    /** Name of the TTS backend that produced the audio, when reported. */
    provider?: string;
    /** Failure description when success is false. */
    error?: string;
  }>;
};
|
||||
|
||||
/** Synthesizes text into 8kHz mu-law audio ready for a telephony stream. */
export type TelephonyTtsProvider = {
  /** Resolves with mu-law bytes; rejects when synthesis fails. */
  synthesizeForTelephony: (text: string) => Promise<Buffer>;
};
|
||||
|
||||
export function createTelephonyTtsProvider(params: {
|
||||
coreConfig: CoreConfig;
|
||||
ttsOverride?: VoiceCallTtsConfig;
|
||||
runtime: TelephonyTtsRuntime;
|
||||
}): TelephonyTtsProvider {
|
||||
const { coreConfig, ttsOverride, runtime } = params;
|
||||
const mergedConfig = applyTtsOverride(coreConfig, ttsOverride);
|
||||
|
||||
return {
|
||||
synthesizeForTelephony: async (text: string) => {
|
||||
const result = await runtime.textToSpeechTelephony({
|
||||
text,
|
||||
cfg: mergedConfig,
|
||||
});
|
||||
|
||||
if (!result.success || !result.audioBuffer || !result.sampleRate) {
|
||||
throw new Error(result.error ?? "TTS conversion failed");
|
||||
}
|
||||
|
||||
return convertPcmToMulaw8k(result.audioBuffer, result.sampleRate);
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
function applyTtsOverride(
|
||||
coreConfig: CoreConfig,
|
||||
override?: VoiceCallTtsConfig,
|
||||
): CoreConfig {
|
||||
if (!override) return coreConfig;
|
||||
|
||||
const base = coreConfig.messages?.tts;
|
||||
const merged = mergeTtsConfig(base, override);
|
||||
if (!merged) return coreConfig;
|
||||
|
||||
return {
|
||||
...coreConfig,
|
||||
messages: {
|
||||
...(coreConfig.messages ?? {}),
|
||||
tts: merged,
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
function mergeTtsConfig(
|
||||
base?: VoiceCallTtsConfig,
|
||||
override?: VoiceCallTtsConfig,
|
||||
): VoiceCallTtsConfig | undefined {
|
||||
if (!base && !override) return undefined;
|
||||
if (!override) return base;
|
||||
if (!base) return override;
|
||||
return deepMerge(base, override);
|
||||
}
|
||||
|
||||
function deepMerge<T>(base: T, override: T): T {
|
||||
if (!isPlainObject(base) || !isPlainObject(override)) {
|
||||
return override;
|
||||
}
|
||||
const result: Record<string, unknown> = { ...base };
|
||||
for (const [key, value] of Object.entries(override)) {
|
||||
if (value === undefined) continue;
|
||||
const existing = (base as Record<string, unknown>)[key];
|
||||
if (isPlainObject(existing) && isPlainObject(value)) {
|
||||
result[key] = deepMerge(existing, value);
|
||||
} else {
|
||||
result[key] = value;
|
||||
}
|
||||
}
|
||||
return result as T;
|
||||
}
|
||||
|
||||
function isPlainObject(value: unknown): value is Record<string, unknown> {
|
||||
return Boolean(value) && typeof value === "object" && !Array.isArray(value);
|
||||
}
|
||||
331
docker-compose/ez-assistant/extensions/voice-call/src/tunnel.ts
Normal file
331
docker-compose/ez-assistant/extensions/voice-call/src/tunnel.ts
Normal file
@@ -0,0 +1,331 @@
|
||||
import { spawn } from "node:child_process";
|
||||
|
||||
import { getTailscaleDnsName } from "./webhook.js";
|
||||
|
||||
/**
 * Tunnel configuration for exposing the webhook server.
 */
export interface TunnelConfig {
  /** Tunnel provider: ngrok, tailscale-serve, or tailscale-funnel; "none" disables tunneling. */
  provider: "ngrok" | "tailscale-serve" | "tailscale-funnel" | "none";
  /** Local port to tunnel */
  port: number;
  /** Path prefix for the tunnel (e.g., /voice/webhook) */
  path: string;
  /** ngrok auth token (optional, enables longer sessions) */
  ngrokAuthToken?: string;
  /** ngrok custom domain (paid feature) */
  ngrokDomain?: string;
}
|
||||
|
||||
/**
 * Result of starting a tunnel.
 */
export interface TunnelResult {
  /** The public URL (includes the configured path prefix). */
  publicUrl: string;
  /** Function to stop the tunnel and release its process/resources. */
  stop: () => Promise<void>;
  /** Tunnel provider name (e.g. "ngrok"). */
  provider: string;
}
|
||||
|
||||
/**
 * Start an ngrok tunnel to expose the local webhook server.
 *
 * Uses the ngrok CLI which must be installed: https://ngrok.com/download
 *
 * Spawns `ngrok http <port>` with JSON logging on stdout and resolves once a
 * public URL appears in the log stream. Rejects on ERR_NGROK stderr output,
 * on spawn failure, on unexpected exit, or after a 30s startup timeout.
 *
 * @example
 * const tunnel = await startNgrokTunnel({ port: 3334, path: '/voice/webhook' });
 * console.log('Public URL:', tunnel.publicUrl);
 * // Later: await tunnel.stop();
 */
export async function startNgrokTunnel(config: {
  port: number;
  path: string;
  authToken?: string;
  domain?: string;
}): Promise<TunnelResult> {
  // Set auth token if provided (persists it via `ngrok config add-authtoken`)
  if (config.authToken) {
    await runNgrokCommand(["config", "add-authtoken", config.authToken]);
  }

  // Build ngrok command args; JSON log format lets us parse the public URL
  const args = [
    "http",
    String(config.port),
    "--log",
    "stdout",
    "--log-format",
    "json",
  ];

  // Add custom domain if provided (paid ngrok feature)
  if (config.domain) {
    args.push("--domain", config.domain);
  }

  return new Promise((resolve, reject) => {
    const proc = spawn("ngrok", args, {
      stdio: ["ignore", "pipe", "pipe"],
    });

    // Guard so we settle the promise exactly once across all event handlers.
    let resolved = false;
    let publicUrl: string | null = null;
    // Accumulates partial stdout chunks until a full newline-delimited line arrives.
    let outputBuffer = "";

    const timeout = setTimeout(() => {
      if (!resolved) {
        resolved = true;
        proc.kill("SIGTERM");
        reject(new Error("ngrok startup timed out (30s)"));
      }
    }, 30000);

    const processLine = (line: string) => {
      try {
        const log = JSON.parse(line);

        // ngrok logs the public URL in a 'started tunnel' message
        // NOTE(review): message shape varies across ngrok versions — confirm
        // against the installed CLI if startup times out despite a live tunnel.
        if (log.msg === "started tunnel" && log.url) {
          publicUrl = log.url;
        }

        // Also check for the URL field directly
        if (log.addr && log.url && !publicUrl) {
          publicUrl = log.url;
        }

        // Check for ready state
        if (publicUrl && !resolved) {
          resolved = true;
          clearTimeout(timeout);

          // Add path to the public URL (result is the full webhook URL)
          const fullUrl = publicUrl + config.path;

          console.log(`[voice-call] ngrok tunnel active: ${fullUrl}`);

          resolve({
            publicUrl: fullUrl,
            provider: "ngrok",
            stop: async () => {
              proc.kill("SIGTERM");
              await new Promise<void>((res) => {
                proc.on("close", () => res());
                // Fallback timeout; NOTE(review): this timer is never cleared,
                // so stop() can keep the event loop alive for up to 2s.
                setTimeout(res, 2000);
              });
            },
          });
        }
      } catch {
        // Not JSON, might be startup message — ignore and keep scanning.
      }
    };

    proc.stdout.on("data", (data: Buffer) => {
      outputBuffer += data.toString();
      const lines = outputBuffer.split("\n");
      // Last element is an incomplete line (or "") — keep it for the next chunk.
      outputBuffer = lines.pop() || "";

      for (const line of lines) {
        if (line.trim()) {
          processLine(line);
        }
      }
    });

    proc.stderr.on("data", (data: Buffer) => {
      const msg = data.toString();
      // Check for common errors (ngrok error codes are prefixed ERR_NGROK_*)
      if (msg.includes("ERR_NGROK")) {
        if (!resolved) {
          resolved = true;
          clearTimeout(timeout);
          reject(new Error(`ngrok error: ${msg}`));
        }
      }
    });

    // Spawn failure (e.g. ngrok binary not on PATH).
    proc.on("error", (err) => {
      if (!resolved) {
        resolved = true;
        clearTimeout(timeout);
        reject(new Error(`Failed to start ngrok: ${err.message}`));
      }
    });

    // Process exited before we ever saw a public URL.
    proc.on("close", (code) => {
      if (!resolved) {
        resolved = true;
        clearTimeout(timeout);
        reject(new Error(`ngrok exited unexpectedly with code ${code}`));
      }
    });
  });
}
|
||||
|
||||
/**
|
||||
* Run an ngrok command and wait for completion.
|
||||
*/
|
||||
async function runNgrokCommand(args: string[]): Promise<string> {
|
||||
return new Promise((resolve, reject) => {
|
||||
const proc = spawn("ngrok", args, {
|
||||
stdio: ["ignore", "pipe", "pipe"],
|
||||
});
|
||||
|
||||
let stdout = "";
|
||||
let stderr = "";
|
||||
|
||||
proc.stdout.on("data", (data) => {
|
||||
stdout += data.toString();
|
||||
});
|
||||
proc.stderr.on("data", (data) => {
|
||||
stderr += data.toString();
|
||||
});
|
||||
|
||||
proc.on("close", (code) => {
|
||||
if (code === 0) {
|
||||
resolve(stdout);
|
||||
} else {
|
||||
reject(new Error(`ngrok command failed: ${stderr || stdout}`));
|
||||
}
|
||||
});
|
||||
|
||||
proc.on("error", reject);
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if ngrok is installed and available.
|
||||
*/
|
||||
export async function isNgrokAvailable(): Promise<boolean> {
|
||||
return new Promise((resolve) => {
|
||||
const proc = spawn("ngrok", ["version"], {
|
||||
stdio: ["ignore", "pipe", "pipe"],
|
||||
});
|
||||
|
||||
proc.on("close", (code) => {
|
||||
resolve(code === 0);
|
||||
});
|
||||
|
||||
proc.on("error", () => {
|
||||
resolve(false);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Start a Tailscale serve/funnel tunnel.
 *
 * Runs `tailscale serve|funnel --bg --yes --set-path <path> <localUrl>` and,
 * on success, derives the public URL from the node's Tailscale DNS name.
 * Throws if the DNS name cannot be determined; rejects on non-zero exit,
 * spawn failure, or a 10s timeout.
 */
export async function startTailscaleTunnel(config: {
  mode: "serve" | "funnel";
  port: number;
  path: string;
}): Promise<TunnelResult> {
  // Get Tailscale DNS name (helper defined elsewhere in this file)
  const dnsName = await getTailscaleDnsName();
  if (!dnsName) {
    throw new Error("Could not get Tailscale DNS name. Is Tailscale running?");
  }

  // Normalize the path to always start with "/"
  const path = config.path.startsWith("/") ? config.path : `/${config.path}`;
  const localUrl = `http://127.0.0.1:${config.port}${path}`;

  return new Promise((resolve, reject) => {
    // --bg detaches the serve/funnel so it survives this short-lived process;
    // --yes auto-accepts the funnel enablement prompt.
    const proc = spawn(
      "tailscale",
      [config.mode, "--bg", "--yes", "--set-path", path, localUrl],
      { stdio: ["ignore", "pipe", "pipe"] },
    );

    // NOTE(review): unlike startNgrokTunnel there is no `resolved` guard here;
    // after this timeout fires, a late "close" event would call resolve() on an
    // already-rejected promise (harmless — a promise settles once — but worth
    // confirming this is intentional).
    const timeout = setTimeout(() => {
      proc.kill("SIGKILL");
      reject(new Error(`Tailscale ${config.mode} timed out`));
    }, 10000);

    proc.on("close", (code) => {
      clearTimeout(timeout);
      if (code === 0) {
        // Funnel/serve URLs are HTTPS on the node's MagicDNS name.
        const publicUrl = `https://${dnsName}${path}`;
        console.log(
          `[voice-call] Tailscale ${config.mode} active: ${publicUrl}`,
        );

        resolve({
          publicUrl,
          provider: `tailscale-${config.mode}`,
          stop: async () => {
            await stopTailscaleTunnel(config.mode, path);
          },
        });
      } else {
        reject(new Error(`Tailscale ${config.mode} failed with code ${code}`));
      }
    });

    // Spawn failure (tailscale binary not on PATH).
    proc.on("error", (err) => {
      clearTimeout(timeout);
      reject(err);
    });
  });
}
|
||||
|
||||
/**
|
||||
* Stop a Tailscale serve/funnel tunnel.
|
||||
*/
|
||||
async function stopTailscaleTunnel(
|
||||
mode: "serve" | "funnel",
|
||||
path: string,
|
||||
): Promise<void> {
|
||||
return new Promise((resolve) => {
|
||||
const proc = spawn("tailscale", [mode, "off", path], {
|
||||
stdio: "ignore",
|
||||
});
|
||||
|
||||
const timeout = setTimeout(() => {
|
||||
proc.kill("SIGKILL");
|
||||
resolve();
|
||||
}, 5000);
|
||||
|
||||
proc.on("close", () => {
|
||||
clearTimeout(timeout);
|
||||
resolve();
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Start a tunnel based on configuration.
|
||||
*/
|
||||
export async function startTunnel(
|
||||
config: TunnelConfig,
|
||||
): Promise<TunnelResult | null> {
|
||||
switch (config.provider) {
|
||||
case "ngrok":
|
||||
return startNgrokTunnel({
|
||||
port: config.port,
|
||||
path: config.path,
|
||||
authToken: config.ngrokAuthToken,
|
||||
domain: config.ngrokDomain,
|
||||
});
|
||||
|
||||
case "tailscale-serve":
|
||||
return startTailscaleTunnel({
|
||||
mode: "serve",
|
||||
port: config.port,
|
||||
path: config.path,
|
||||
});
|
||||
|
||||
case "tailscale-funnel":
|
||||
return startTailscaleTunnel({
|
||||
mode: "funnel",
|
||||
port: config.port,
|
||||
path: config.path,
|
||||
});
|
||||
|
||||
default:
|
||||
return null;
|
||||
}
|
||||
}
|
||||
273
docker-compose/ez-assistant/extensions/voice-call/src/types.ts
Normal file
273
docker-compose/ez-assistant/extensions/voice-call/src/types.ts
Normal file
@@ -0,0 +1,273 @@
|
||||
import { z } from "zod";
|
||||
|
||||
import type { CallMode } from "./config.js";
|
||||
|
||||
// -----------------------------------------------------------------------------
// Provider Identifiers
// -----------------------------------------------------------------------------

/** Telephony backends supported by this plugin ("mock" is for local dev). */
export const ProviderNameSchema = z.enum(["telnyx", "twilio", "plivo", "mock"]);
export type ProviderName = z.infer<typeof ProviderNameSchema>;

// -----------------------------------------------------------------------------
// Core Call Identifiers
// -----------------------------------------------------------------------------

/** Internal call identifier (UUID) */
export type CallId = string;

/** Provider-specific call identifier (e.g. Twilio CallSid, Plivo CallUUID) */
export type ProviderCallId = string;
|
||||
|
||||
// -----------------------------------------------------------------------------
// Call Lifecycle States
// -----------------------------------------------------------------------------

// NOTE(review): the terminal-state list appears three times below
// (CallStateSchema's tail, TerminalStates, EndReasonSchema) and must be kept
// in sync manually when a state is added or removed.

/** Every state a call can be in, terminal and non-terminal alike. */
export const CallStateSchema = z.enum([
  // Non-terminal states
  "initiated",
  "ringing",
  "answered",
  "active",
  "speaking",
  "listening",
  // Terminal states
  "completed",
  "hangup-user",
  "hangup-bot",
  "timeout",
  "error",
  "failed",
  "no-answer",
  "busy",
  "voicemail",
]);
export type CallState = z.infer<typeof CallStateSchema>;

/** States from which a call can never transition again. */
export const TerminalStates = new Set<CallState>([
  "completed",
  "hangup-user",
  "hangup-bot",
  "timeout",
  "error",
  "failed",
  "no-answer",
  "busy",
  "voicemail",
]);

/** Why a call ended — mirrors the terminal subset of CallStateSchema. */
export const EndReasonSchema = z.enum([
  "completed",
  "hangup-user",
  "hangup-bot",
  "timeout",
  "error",
  "failed",
  "no-answer",
  "busy",
  "voicemail",
]);
export type EndReason = z.infer<typeof EndReasonSchema>;
|
||||
|
||||
// -----------------------------------------------------------------------------
// Normalized Call Events
// -----------------------------------------------------------------------------

/** Fields shared by every normalized event, regardless of provider. */
const BaseEventSchema = z.object({
  id: z.string(),
  callId: z.string(),
  providerCallId: z.string().optional(),
  // Milliseconds since epoch — presumably; TODO confirm against producers.
  timestamp: z.number(),
  // Optional fields for inbound call detection
  direction: z.enum(["inbound", "outbound"]).optional(),
  from: z.string().optional(),
  to: z.string().optional(),
});

/**
 * Provider-agnostic call event, discriminated on "type".
 * Provider webhook payloads are translated into these before any call-state
 * logic runs.
 */
export const NormalizedEventSchema = z.discriminatedUnion("type", [
  BaseEventSchema.extend({
    type: z.literal("call.initiated"),
  }),
  BaseEventSchema.extend({
    type: z.literal("call.ringing"),
  }),
  BaseEventSchema.extend({
    type: z.literal("call.answered"),
  }),
  BaseEventSchema.extend({
    type: z.literal("call.active"),
  }),
  BaseEventSchema.extend({
    type: z.literal("call.speaking"),
    // Text the bot is speaking (TTS output), not user speech.
    text: z.string(),
  }),
  BaseEventSchema.extend({
    type: z.literal("call.speech"),
    // User speech recognized by the provider's STT.
    transcript: z.string(),
    isFinal: z.boolean(),
    confidence: z.number().min(0).max(1).optional(),
  }),
  BaseEventSchema.extend({
    type: z.literal("call.silence"),
    durationMs: z.number(),
  }),
  BaseEventSchema.extend({
    type: z.literal("call.dtmf"),
    digits: z.string(),
  }),
  BaseEventSchema.extend({
    type: z.literal("call.ended"),
    reason: EndReasonSchema,
  }),
  BaseEventSchema.extend({
    type: z.literal("call.error"),
    error: z.string(),
    retryable: z.boolean().optional(),
  }),
]);
export type NormalizedEvent = z.infer<typeof NormalizedEventSchema>;
|
||||
|
||||
// -----------------------------------------------------------------------------
// Call Direction
// -----------------------------------------------------------------------------

export const CallDirectionSchema = z.enum(["outbound", "inbound"]);
export type CallDirection = z.infer<typeof CallDirectionSchema>;

// -----------------------------------------------------------------------------
// Call Record
// -----------------------------------------------------------------------------

/** One utterance in a call transcript, attributed to bot or user. */
export const TranscriptEntrySchema = z.object({
  timestamp: z.number(),
  speaker: z.enum(["bot", "user"]),
  text: z.string(),
  // Interim STT results carry isFinal=false; defaults to true when omitted.
  isFinal: z.boolean().default(true),
});
export type TranscriptEntry = z.infer<typeof TranscriptEntrySchema>;

/** Persistent record of a single call's lifecycle and transcript. */
export const CallRecordSchema = z.object({
  callId: z.string(),
  providerCallId: z.string().optional(),
  provider: ProviderNameSchema,
  direction: CallDirectionSchema,
  state: CallStateSchema,
  from: z.string(),
  to: z.string(),
  sessionKey: z.string().optional(),
  // Timestamps are numbers (presumably ms since epoch — TODO confirm).
  startedAt: z.number(),
  answeredAt: z.number().optional(),
  endedAt: z.number().optional(),
  endReason: EndReasonSchema.optional(),
  transcript: z.array(TranscriptEntrySchema).default([]),
  // Event IDs already applied to this record — used for webhook deduplication,
  // presumably; verify against the event-processing code.
  processedEventIds: z.array(z.string()).default([]),
  metadata: z.record(z.string(), z.unknown()).optional(),
});
export type CallRecord = z.infer<typeof CallRecordSchema>;
|
||||
|
||||
// -----------------------------------------------------------------------------
// Webhook Types
// -----------------------------------------------------------------------------

/** Outcome of verifying an incoming webhook's authenticity. */
export type WebhookVerificationResult = {
  ok: boolean;
  // Human-readable explanation when ok is false (or for bypass modes).
  reason?: string;
};

/** Transport-level details of an incoming webhook request. */
export type WebhookContext = {
  headers: Record<string, string | string[] | undefined>;
  // Raw, unparsed request body — required for signature verification.
  rawBody: string;
  url: string;
  method: "GET" | "POST" | "PUT" | "DELETE" | "PATCH";
  query?: Record<string, string | string[] | undefined>;
  // Peer address of the TCP connection (used for loopback-only bypasses).
  remoteAddress?: string;
};

/** What a provider adapter extracted from a webhook, plus the HTTP reply to send. */
export type ProviderWebhookParseResult = {
  events: NormalizedEvent[];
  // Body/headers/status the provider expects back (e.g. TwiML for Twilio).
  providerResponseBody?: string;
  providerResponseHeaders?: Record<string, string>;
  statusCode?: number;
};
|
||||
|
||||
// -----------------------------------------------------------------------------
// Provider Method Types
// -----------------------------------------------------------------------------

/** Arguments for placing an outbound call through a provider. */
export type InitiateCallInput = {
  callId: CallId;
  from: string;
  to: string;
  webhookUrl: string;
  // Opaque key/value state echoed back by the provider in webhooks.
  clientState?: Record<string, string>;
  /** Inline TwiML to execute (skips webhook, used for notify mode) */
  inlineTwiml?: string;
};

/** Provider acknowledgement of an initiated call. */
export type InitiateCallResult = {
  providerCallId: ProviderCallId;
  status: "initiated" | "queued";
};

/** Arguments for terminating an in-progress call. */
export type HangupCallInput = {
  callId: CallId;
  providerCallId: ProviderCallId;
  reason: EndReason;
};

/** Arguments for speaking text to the remote party via TTS. */
export type PlayTtsInput = {
  callId: CallId;
  providerCallId: ProviderCallId;
  text: string;
  voice?: string;
  locale?: string;
};

/** Arguments for starting speech recognition on the call audio. */
export type StartListeningInput = {
  callId: CallId;
  providerCallId: ProviderCallId;
  language?: string;
};

/** Arguments for stopping speech recognition. */
export type StopListeningInput = {
  callId: CallId;
  providerCallId: ProviderCallId;
};

// -----------------------------------------------------------------------------
// Outbound Call Options
// -----------------------------------------------------------------------------

export type OutboundCallOptions = {
  /** Message to speak when call connects */
  message?: string;
  /** Call mode (overrides config default) */
  mode?: CallMode;
};

// -----------------------------------------------------------------------------
// Tool Result Types
// -----------------------------------------------------------------------------

/** Result returned by the "initiate call" agent tool. */
export type InitiateCallToolResult = {
  success: boolean;
  callId?: string;
  status?: "initiated" | "queued" | "no-answer" | "busy" | "failed";
  error?: string;
};

/** Result returned by the "continue call" agent tool. */
export type ContinueCallToolResult = {
  success: boolean;
  // What the user said during the continued turn.
  transcript?: string;
  error?: string;
};

/** Result returned by the "speak to user" agent tool. */
export type SpeakToUserToolResult = {
  success: boolean;
  error?: string;
};

/** Result returned by the "end call" agent tool. */
export type EndCallToolResult = {
  success: boolean;
  error?: string;
};
|
||||
@@ -0,0 +1,12 @@
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
|
||||
export function resolveUserPath(input: string): string {
|
||||
const trimmed = input.trim();
|
||||
if (!trimmed) return trimmed;
|
||||
if (trimmed.startsWith("~")) {
|
||||
const expanded = trimmed.replace(/^~(?=$|[\\/])/, os.homedir());
|
||||
return path.resolve(expanded);
|
||||
}
|
||||
return path.resolve(trimmed);
|
||||
}
|
||||
@@ -0,0 +1,65 @@
|
||||
/**
|
||||
* Voice mapping and XML utilities for voice call providers.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Escape XML special characters for TwiML and other XML responses.
|
||||
*/
|
||||
export function escapeXml(text: string): string {
|
||||
return text
|
||||
.replace(/&/g, "&")
|
||||
.replace(/</g, "<")
|
||||
.replace(/>/g, ">")
|
||||
.replace(/"/g, """)
|
||||
.replace(/'/g, "'");
|
||||
}
|
||||
|
||||
/**
|
||||
* Map of OpenAI voice names to similar Twilio Polly voices.
|
||||
*/
|
||||
const OPENAI_TO_POLLY_MAP: Record<string, string> = {
|
||||
alloy: "Polly.Joanna", // neutral, warm
|
||||
echo: "Polly.Matthew", // male, warm
|
||||
fable: "Polly.Amy", // British, expressive
|
||||
onyx: "Polly.Brian", // deep male
|
||||
nova: "Polly.Salli", // female, friendly
|
||||
shimmer: "Polly.Kimberly", // female, clear
|
||||
};
|
||||
|
||||
/**
|
||||
* Default Polly voice when no mapping is found.
|
||||
*/
|
||||
export const DEFAULT_POLLY_VOICE = "Polly.Joanna";
|
||||
|
||||
/**
|
||||
* Map OpenAI voice names to Twilio Polly equivalents.
|
||||
* Falls through if already a valid Polly/Google voice.
|
||||
*
|
||||
* @param voice - OpenAI voice name (alloy, echo, etc.) or Polly voice name
|
||||
* @returns Polly voice name suitable for Twilio TwiML
|
||||
*/
|
||||
export function mapVoiceToPolly(voice: string | undefined): string {
|
||||
if (!voice) return DEFAULT_POLLY_VOICE;
|
||||
|
||||
// Already a Polly/Google voice - pass through
|
||||
if (voice.startsWith("Polly.") || voice.startsWith("Google.")) {
|
||||
return voice;
|
||||
}
|
||||
|
||||
// Map OpenAI voices to Polly equivalents
|
||||
return OPENAI_TO_POLLY_MAP[voice.toLowerCase()] || DEFAULT_POLLY_VOICE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if a voice name is a known OpenAI voice.
|
||||
*/
|
||||
export function isOpenAiVoice(voice: string): boolean {
|
||||
return voice.toLowerCase() in OPENAI_TO_POLLY_MAP;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get all supported OpenAI voice names.
|
||||
*/
|
||||
export function getOpenAiVoiceNames(): string[] {
|
||||
return Object.keys(OPENAI_TO_POLLY_MAP);
|
||||
}
|
||||
@@ -0,0 +1,260 @@
|
||||
import crypto from "node:crypto";
|
||||
|
||||
import { describe, expect, it } from "vitest";
|
||||
|
||||
import { verifyPlivoWebhook, verifyTwilioWebhook } from "./webhook-security.js";
|
||||
|
||||
function canonicalizeBase64(input: string): string {
|
||||
return Buffer.from(input, "base64").toString("base64");
|
||||
}
|
||||
|
||||
function plivoV2Signature(params: {
|
||||
authToken: string;
|
||||
urlNoQuery: string;
|
||||
nonce: string;
|
||||
}): string {
|
||||
const digest = crypto
|
||||
.createHmac("sha256", params.authToken)
|
||||
.update(params.urlNoQuery + params.nonce)
|
||||
.digest("base64");
|
||||
return canonicalizeBase64(digest);
|
||||
}
|
||||
|
||||
/**
 * Build a Plivo V3 signature for tests.
 *
 * Mirrors Plivo's V3 canonicalization: base URL without query, then sorted
 * query pairs ("k=v" joined by "&"), then sorted POST pairs concatenated as
 * "kv" with no separators, with "?" / "." separators inserted per the rules
 * below; finally HMAC-SHA256 of "<canonicalUrl>.<nonce>" in base64.
 */
function plivoV3Signature(params: {
  authToken: string;
  urlWithQuery: string;
  postBody: string;
  nonce: string;
}): string {
  const u = new URL(params.urlWithQuery);
  // Scheme + host (+port) + path, query stripped.
  const baseNoQuery = `${u.protocol}//${u.host}${u.pathname}`;
  const queryPairs: Array<[string, string]> = [];
  for (const [k, v] of u.searchParams.entries()) queryPairs.push([k, v]);

  // Group query values by key so repeated keys sort deterministically.
  const queryMap = new Map<string, string[]>();
  for (const [k, v] of queryPairs) {
    queryMap.set(k, (queryMap.get(k) ?? []).concat(v));
  }

  // Keys sorted, values sorted within each key: "k=v" pairs joined by "&".
  const sortedQuery = Array.from(queryMap.keys())
    .sort()
    .flatMap((k) =>
      [...(queryMap.get(k) ?? [])].sort().map((v) => `${k}=${v}`),
    )
    .join("&");

  // Same grouping/sorting for the urlencoded POST body…
  const postParams = new URLSearchParams(params.postBody);
  const postMap = new Map<string, string[]>();
  for (const [k, v] of postParams.entries()) {
    postMap.set(k, (postMap.get(k) ?? []).concat(v));
  }

  // …but POST pairs concatenate as "kv" with no separators at all.
  const sortedPost = Array.from(postMap.keys())
    .sort()
    .flatMap((k) => [...(postMap.get(k) ?? [])].sort().map((v) => `${k}${v}`))
    .join("");

  // Assembly rules: "?" is appended when there is ANY query or post data;
  // an extra "." separates query from post only when both are present.
  const hasPost = sortedPost.length > 0;
  let baseUrl = baseNoQuery;
  if (sortedQuery.length > 0 || hasPost) {
    baseUrl = `${baseNoQuery}?${sortedQuery}`;
  }
  if (sortedQuery.length > 0 && hasPost) {
    baseUrl = `${baseUrl}.`;
  }
  baseUrl = `${baseUrl}${sortedPost}`;

  // Final payload is "<canonicalUrl>.<nonce>".
  const digest = crypto
    .createHmac("sha256", params.authToken)
    .update(`${baseUrl}.${params.nonce}`)
    .digest("base64");
  return canonicalizeBase64(digest);
}
|
||||
|
||||
function twilioSignature(params: {
|
||||
authToken: string;
|
||||
url: string;
|
||||
postBody: string;
|
||||
}): string {
|
||||
let dataToSign = params.url;
|
||||
const sortedParams = Array.from(
|
||||
new URLSearchParams(params.postBody).entries(),
|
||||
).sort((a, b) => a[0].localeCompare(b[0]));
|
||||
|
||||
for (const [key, value] of sortedParams) {
|
||||
dataToSign += key + value;
|
||||
}
|
||||
|
||||
return crypto
|
||||
.createHmac("sha1", params.authToken)
|
||||
.update(dataToSign)
|
||||
.digest("base64");
|
||||
}
|
||||
|
||||
describe("verifyPlivoWebhook", () => {
  // V2: signature covers the *public* URL without its query string, so the
  // local ctx URL and the forwarded host headers must reconstruct it.
  it("accepts valid V2 signature", () => {
    const authToken = "test-auth-token";
    const nonce = "nonce-123";

    const ctxUrl = "http://local/voice/webhook?flow=answer&callId=abc";
    const verificationUrl = "https://example.com/voice/webhook";
    const signature = plivoV2Signature({
      authToken,
      urlNoQuery: verificationUrl,
      nonce,
    });

    const result = verifyPlivoWebhook(
      {
        headers: {
          host: "example.com",
          "x-forwarded-proto": "https",
          "x-plivo-signature-v2": signature,
          "x-plivo-signature-v2-nonce": nonce,
        },
        rawBody: "CallUUID=uuid&CallStatus=in-progress",
        url: ctxUrl,
        method: "POST",
        query: { flow: "answer", callId: "abc" },
      },
      authToken,
    );

    expect(result.ok).toBe(true);
    expect(result.version).toBe("v2");
  });

  // V3: Plivo may send several comma-separated signatures in one header;
  // verification must accept when ANY of them matches.
  it("accepts valid V3 signature (including multi-signature header)", () => {
    const authToken = "test-auth-token";
    const nonce = "nonce-456";

    const urlWithQuery = "https://example.com/voice/webhook?flow=answer&callId=abc";
    const postBody = "CallUUID=uuid&CallStatus=in-progress&From=%2B15550000000";

    const good = plivoV3Signature({
      authToken,
      urlWithQuery,
      postBody,
      nonce,
    });

    const result = verifyPlivoWebhook(
      {
        headers: {
          host: "example.com",
          "x-forwarded-proto": "https",
          // "bad" entry first to prove the matcher scans the whole list.
          "x-plivo-signature-v3": `bad, ${good}`,
          "x-plivo-signature-v3-nonce": nonce,
        },
        rawBody: postBody,
        url: urlWithQuery,
        method: "POST",
        query: { flow: "answer", callId: "abc" },
      },
      authToken,
    );

    expect(result.ok).toBe(true);
    expect(result.version).toBe("v3");
  });

  // Requests without any Plivo signature headers must be rejected outright.
  it("rejects missing signatures", () => {
    const result = verifyPlivoWebhook(
      {
        headers: { host: "example.com", "x-forwarded-proto": "https" },
        rawBody: "",
        url: "https://example.com/voice/webhook",
        method: "POST",
      },
      "token",
    );

    expect(result.ok).toBe(false);
    expect(result.reason).toMatch(/Missing Plivo signature headers/);
  });
});
|
||||
|
||||
describe("verifyTwilioWebhook", () => {
  // When the configured publicUrl has no query string, verification should
  // graft the incoming request's query onto it before checking the signature.
  it("uses request query when publicUrl omits it", () => {
    const authToken = "test-auth-token";
    const publicUrl = "https://example.com/voice/webhook";
    const urlWithQuery = `${publicUrl}?callId=abc`;
    const postBody = "CallSid=CS123&CallStatus=completed&From=%2B15550000000";

    const signature = twilioSignature({
      authToken,
      url: urlWithQuery,
      postBody,
    });

    const result = verifyTwilioWebhook(
      {
        headers: {
          host: "example.com",
          "x-forwarded-proto": "https",
          "x-twilio-signature": signature,
        },
        rawBody: postBody,
        url: "http://local/voice/webhook?callId=abc",
        method: "POST",
        query: { callId: "abc" },
      },
      authToken,
      { publicUrl },
    );

    expect(result.ok).toBe(true);
  });

  // The ngrok free-tier bypass must NOT apply to non-loopback peers: a remote
  // attacker spoofing ngrok forwarding headers still gets rejected.
  it("rejects invalid signatures even with ngrok free tier enabled", () => {
    const authToken = "test-auth-token";
    const postBody = "CallSid=CS123&CallStatus=completed&From=%2B15550000000";

    const result = verifyTwilioWebhook(
      {
        headers: {
          host: "127.0.0.1:3334",
          "x-forwarded-proto": "https",
          "x-forwarded-host": "attacker.ngrok-free.app",
          "x-twilio-signature": "invalid",
        },
        rawBody: postBody,
        url: "http://127.0.0.1:3334/voice/webhook",
        method: "POST",
        // Non-loopback peer — bypass must not engage.
        remoteAddress: "203.0.113.10",
      },
      authToken,
      { allowNgrokFreeTierLoopbackBypass: true },
    );

    expect(result.ok).toBe(false);
    expect(result.isNgrokFreeTier).toBe(true);
    expect(result.reason).toMatch(/Invalid signature/);
  });

  // Same invalid signature, but the connection originates from loopback (the
  // local ngrok agent), so the opt-in compatibility bypass accepts it.
  it("allows invalid signatures for ngrok free tier only on loopback", () => {
    const authToken = "test-auth-token";
    const postBody = "CallSid=CS123&CallStatus=completed&From=%2B15550000000";

    const result = verifyTwilioWebhook(
      {
        headers: {
          host: "127.0.0.1:3334",
          "x-forwarded-proto": "https",
          "x-forwarded-host": "local.ngrok-free.app",
          "x-twilio-signature": "invalid",
        },
        rawBody: postBody,
        url: "http://127.0.0.1:3334/voice/webhook",
        method: "POST",
        remoteAddress: "127.0.0.1",
      },
      authToken,
      { allowNgrokFreeTierLoopbackBypass: true },
    );

    expect(result.ok).toBe(true);
    expect(result.isNgrokFreeTier).toBe(true);
    expect(result.reason).toMatch(/compatibility mode/);
  });
});
|
||||
@@ -0,0 +1,469 @@
|
||||
import crypto from "node:crypto";
|
||||
|
||||
import type { WebhookContext } from "./types.js";
|
||||
|
||||
/**
|
||||
* Validate Twilio webhook signature using HMAC-SHA1.
|
||||
*
|
||||
* Twilio signs requests by concatenating the URL with sorted POST params,
|
||||
* then computing HMAC-SHA1 with the auth token.
|
||||
*
|
||||
* @see https://www.twilio.com/docs/usage/webhooks/webhooks-security
|
||||
*/
|
||||
export function validateTwilioSignature(
|
||||
authToken: string,
|
||||
signature: string | undefined,
|
||||
url: string,
|
||||
params: URLSearchParams,
|
||||
): boolean {
|
||||
if (!signature) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Build the string to sign: URL + sorted params (key+value pairs)
|
||||
let dataToSign = url;
|
||||
|
||||
// Sort params alphabetically and append key+value
|
||||
const sortedParams = Array.from(params.entries()).sort((a, b) =>
|
||||
a[0] < b[0] ? -1 : a[0] > b[0] ? 1 : 0,
|
||||
);
|
||||
|
||||
for (const [key, value] of sortedParams) {
|
||||
dataToSign += key + value;
|
||||
}
|
||||
|
||||
// HMAC-SHA1 with auth token, then base64 encode
|
||||
const expectedSignature = crypto
|
||||
.createHmac("sha1", authToken)
|
||||
.update(dataToSign)
|
||||
.digest("base64");
|
||||
|
||||
// Use timing-safe comparison to prevent timing attacks
|
||||
return timingSafeEqual(signature, expectedSignature);
|
||||
}
|
||||
|
||||
/**
|
||||
* Timing-safe string comparison to prevent timing attacks.
|
||||
*/
|
||||
function timingSafeEqual(a: string, b: string): boolean {
|
||||
if (a.length !== b.length) {
|
||||
// Still do comparison to maintain constant time
|
||||
const dummy = Buffer.from(a);
|
||||
crypto.timingSafeEqual(dummy, dummy);
|
||||
return false;
|
||||
}
|
||||
|
||||
const bufA = Buffer.from(a);
|
||||
const bufB = Buffer.from(b);
|
||||
return crypto.timingSafeEqual(bufA, bufB);
|
||||
}
|
||||
|
||||
/**
 * Reconstruct the public webhook URL from request headers.
 *
 * When behind a reverse proxy (Tailscale, nginx, ngrok), the original URL
 * used by Twilio differs from the local request URL. We use standard
 * forwarding headers to reconstruct it.
 *
 * Priority order:
 * 1. X-Forwarded-Proto + X-Forwarded-Host (standard proxy headers)
 * 2. X-Original-Host (nginx)
 * 3. Ngrok-Forwarded-Host (ngrok specific)
 * 4. Host header (direct connection)
 */
export function reconstructWebhookUrl(ctx: WebhookContext): string {
  const { headers } = ctx;

  // Default to https: providers are expected to deliver webhooks over TLS.
  const proto = getHeader(headers, "x-forwarded-proto") || "https";

  const forwardedHost =
    getHeader(headers, "x-forwarded-host") ||
    getHeader(headers, "x-original-host") ||
    getHeader(headers, "ngrok-forwarded-host") ||
    getHeader(headers, "host") ||
    "";

  // Extract path from the context URL (fallback to "/" on parse failure)
  let path = "/";
  try {
    const parsed = new URL(ctx.url);
    // Keep the query string — signatures cover it.
    path = parsed.pathname + parsed.search;
  } catch {
    // URL parsing failed
  }

  // Remove port from host (ngrok URLs don't have ports)
  // NOTE(review): this also strips legitimate ports on direct connections
  // (e.g. "127.0.0.1:3334"), and would mangle bracketed IPv6 hosts like
  // "[::1]:3334" — confirm this is acceptable for all deployment modes.
  const host = forwardedHost.split(":")[0] || forwardedHost;

  return `${proto}://${host}${path}`;
}
|
||||
|
||||
function buildTwilioVerificationUrl(
|
||||
ctx: WebhookContext,
|
||||
publicUrl?: string,
|
||||
): string {
|
||||
if (!publicUrl) {
|
||||
return reconstructWebhookUrl(ctx);
|
||||
}
|
||||
|
||||
try {
|
||||
const base = new URL(publicUrl);
|
||||
const requestUrl = new URL(ctx.url);
|
||||
base.pathname = requestUrl.pathname;
|
||||
base.search = requestUrl.search;
|
||||
return base.toString();
|
||||
} catch {
|
||||
return publicUrl;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a header value, handling both string and string[] types.
|
||||
*/
|
||||
function getHeader(
|
||||
headers: Record<string, string | string[] | undefined>,
|
||||
name: string,
|
||||
): string | undefined {
|
||||
const value = headers[name.toLowerCase()];
|
||||
if (Array.isArray(value)) {
|
||||
return value[0];
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
function isLoopbackAddress(address?: string): boolean {
|
||||
if (!address) return false;
|
||||
if (address === "127.0.0.1" || address === "::1") return true;
|
||||
if (address.startsWith("::ffff:127.")) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
 * Result of Twilio webhook verification with detailed info.
 */
export interface TwilioVerificationResult {
  /** True when the request should be trusted (valid signature, skip mode, or loopback bypass). */
  ok: boolean;
  /** Explanation for failures and for the non-signature acceptance paths. */
  reason?: string;
  /** The URL that was used for verification (for debugging) */
  verificationUrl?: string;
  /** Whether we're running behind ngrok free tier */
  isNgrokFreeTier?: boolean;
}
|
||||
|
||||
/**
 * Verify Twilio webhook with full context and detailed result.
 *
 * Handles the special case of ngrok free tier where signature validation
 * may fail due to URL discrepancies (ngrok adds interstitial page handling).
 *
 * Decision order: explicit skip → missing header rejection → HMAC check
 * against the reconstructed public URL → opt-in ngrok free-tier bypass
 * (loopback peers only) → rejection.
 */
export function verifyTwilioWebhook(
  ctx: WebhookContext,
  authToken: string,
  options?: {
    /** Override the public URL (e.g., from config) */
    publicUrl?: string;
    /** Allow ngrok free tier compatibility mode (loopback only, less secure) */
    allowNgrokFreeTierLoopbackBypass?: boolean;
    /** Skip verification entirely (only for development) */
    skipVerification?: boolean;
  },
): TwilioVerificationResult {
  // Allow skipping verification for development/testing
  if (options?.skipVerification) {
    return { ok: true, reason: "verification skipped (dev mode)" };
  }

  const signature = getHeader(ctx.headers, "x-twilio-signature");

  if (!signature) {
    return { ok: false, reason: "Missing X-Twilio-Signature header" };
  }

  // Reconstruct the URL Twilio used (configured publicUrl wins over headers)
  const verificationUrl = buildTwilioVerificationUrl(ctx, options?.publicUrl);

  // Parse the body as URL-encoded params
  const params = new URLSearchParams(ctx.rawBody);

  // Validate signature (constant-time HMAC-SHA1 comparison)
  const isValid = validateTwilioSignature(
    authToken,
    signature,
    verificationUrl,
    params,
  );

  if (isValid) {
    return { ok: true, verificationUrl };
  }

  // Check if this is ngrok free tier - the URL might have different format
  const isNgrokFreeTier =
    verificationUrl.includes(".ngrok-free.app") ||
    verificationUrl.includes(".ngrok.io");

  // Security: the bypass requires BOTH the explicit opt-in AND a loopback
  // peer, so a remote caller spoofing ngrok headers cannot use it.
  if (
    isNgrokFreeTier &&
    options?.allowNgrokFreeTierLoopbackBypass &&
    isLoopbackAddress(ctx.remoteAddress)
  ) {
    console.warn(
      "[voice-call] Twilio signature validation failed (ngrok free tier compatibility, loopback only)",
    );
    return {
      ok: true,
      reason: "ngrok free tier compatibility mode (loopback only)",
      verificationUrl,
      isNgrokFreeTier: true,
    };
  }

  return {
    ok: false,
    reason: `Invalid signature for URL: ${verificationUrl}`,
    verificationUrl,
    isNgrokFreeTier,
  };
}
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// Plivo webhook verification
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
/**
 * Result of Plivo webhook verification with detailed info.
 */
export interface PlivoVerificationResult {
  // True when a V3 or V2 signature validated, or verification was skipped.
  ok: boolean;
  // Human-readable explanation for failures and the dev-mode skip.
  reason?: string;
  // URL the signature was checked against (for debugging).
  verificationUrl?: string;
  /** Signature version used for verification */
  version?: "v3" | "v2";
}
|
||||
|
||||
function normalizeSignatureBase64(input: string): string {
|
||||
// Canonicalize base64 to match Plivo SDK behavior (decode then re-encode).
|
||||
return Buffer.from(input, "base64").toString("base64");
|
||||
}
|
||||
|
||||
function getBaseUrlNoQuery(url: string): string {
|
||||
const u = new URL(url);
|
||||
return `${u.protocol}//${u.host}${u.pathname}`;
|
||||
}
|
||||
|
||||
function timingSafeEqualString(a: string, b: string): boolean {
|
||||
if (a.length !== b.length) {
|
||||
const dummy = Buffer.from(a);
|
||||
crypto.timingSafeEqual(dummy, dummy);
|
||||
return false;
|
||||
}
|
||||
return crypto.timingSafeEqual(Buffer.from(a), Buffer.from(b));
|
||||
}
|
||||
|
||||
function validatePlivoV2Signature(params: {
|
||||
authToken: string;
|
||||
signature: string;
|
||||
nonce: string;
|
||||
url: string;
|
||||
}): boolean {
|
||||
const baseUrl = getBaseUrlNoQuery(params.url);
|
||||
const digest = crypto
|
||||
.createHmac("sha256", params.authToken)
|
||||
.update(baseUrl + params.nonce)
|
||||
.digest("base64");
|
||||
const expected = normalizeSignatureBase64(digest);
|
||||
const provided = normalizeSignatureBase64(params.signature);
|
||||
return timingSafeEqualString(expected, provided);
|
||||
}
|
||||
|
||||
/** Multi-valued webhook parameters: each key maps to every value it was sent with. */
type PlivoParamMap = Record<string, string[]>;
|
||||
|
||||
function toParamMapFromSearchParams(sp: URLSearchParams): PlivoParamMap {
|
||||
const map: PlivoParamMap = {};
|
||||
for (const [key, value] of sp.entries()) {
|
||||
if (!map[key]) map[key] = [];
|
||||
map[key].push(value);
|
||||
}
|
||||
return map;
|
||||
}
|
||||
|
||||
function sortedQueryString(params: PlivoParamMap): string {
|
||||
const parts: string[] = [];
|
||||
for (const key of Object.keys(params).sort()) {
|
||||
const values = [...params[key]].sort();
|
||||
for (const value of values) {
|
||||
parts.push(`${key}=${value}`);
|
||||
}
|
||||
}
|
||||
return parts.join("&");
|
||||
}
|
||||
|
||||
function sortedParamsString(params: PlivoParamMap): string {
|
||||
const parts: string[] = [];
|
||||
for (const key of Object.keys(params).sort()) {
|
||||
const values = [...params[key]].sort();
|
||||
for (const value of values) {
|
||||
parts.push(`${key}${value}`);
|
||||
}
|
||||
}
|
||||
return parts.join("");
|
||||
}
|
||||
|
||||
/**
 * Build the canonical base string Plivo's V3 signature scheme signs:
 * scheme://host/path, then "?" + the sorted query string (present whenever a
 * query or POST params exist), then "." (only when both a query and POST
 * params exist), then — for POST — the sorted key/value concatenation of the
 * body params. NOTE(review): this mirrors the Plivo server SDK's
 * construction; verify against the SDK before changing any ordering here.
 */
function constructPlivoV3BaseUrl(params: {
  method: "GET" | "POST";
  url: string;
  postParams: PlivoParamMap;
}): string {
  const hasPostParams = Object.keys(params.postParams).length > 0;
  const u = new URL(params.url);
  const baseNoQuery = `${u.protocol}//${u.host}${u.pathname}`;

  const queryMap = toParamMapFromSearchParams(u.searchParams);
  const queryString = sortedQueryString(queryMap);

  // In the Plivo V3 algorithm, the query portion is always sorted, and if we
  // have POST params we add a '.' separator after the query string.
  // A bare "?" is kept even when the query is empty but POST params exist.
  let baseUrl = baseNoQuery;
  if (queryString.length > 0 || hasPostParams) {
    baseUrl = `${baseNoQuery}?${queryString}`;
  }
  if (queryString.length > 0 && hasPostParams) {
    baseUrl = `${baseUrl}.`;
  }

  // GET requests sign only the URL portion.
  if (params.method === "GET") {
    return baseUrl;
  }

  // POST requests additionally sign the sorted, concatenated body params.
  return baseUrl + sortedParamsString(params.postParams);
}
|
||||
|
||||
function validatePlivoV3Signature(params: {
|
||||
authToken: string;
|
||||
signatureHeader: string;
|
||||
nonce: string;
|
||||
method: "GET" | "POST";
|
||||
url: string;
|
||||
postParams: PlivoParamMap;
|
||||
}): boolean {
|
||||
const baseUrl = constructPlivoV3BaseUrl({
|
||||
method: params.method,
|
||||
url: params.url,
|
||||
postParams: params.postParams,
|
||||
});
|
||||
|
||||
const hmacBase = `${baseUrl}.${params.nonce}`;
|
||||
const digest = crypto
|
||||
.createHmac("sha256", params.authToken)
|
||||
.update(hmacBase)
|
||||
.digest("base64");
|
||||
const expected = normalizeSignatureBase64(digest);
|
||||
|
||||
// Header can contain multiple signatures separated by commas.
|
||||
const provided = params.signatureHeader
|
||||
.split(",")
|
||||
.map((s) => s.trim())
|
||||
.filter(Boolean)
|
||||
.map((s) => normalizeSignatureBase64(s));
|
||||
|
||||
for (const sig of provided) {
|
||||
if (timingSafeEqualString(expected, sig)) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Verify Plivo webhooks using V3 signature if present; fall back to V2.
|
||||
*
|
||||
* Header names (case-insensitive; Node provides lower-case keys):
|
||||
* - V3: X-Plivo-Signature-V3 / X-Plivo-Signature-V3-Nonce
|
||||
* - V2: X-Plivo-Signature-V2 / X-Plivo-Signature-V2-Nonce
|
||||
*/
|
||||
export function verifyPlivoWebhook(
|
||||
ctx: WebhookContext,
|
||||
authToken: string,
|
||||
options?: {
|
||||
/** Override the public URL origin (host) used for verification */
|
||||
publicUrl?: string;
|
||||
/** Skip verification entirely (only for development) */
|
||||
skipVerification?: boolean;
|
||||
},
|
||||
): PlivoVerificationResult {
|
||||
if (options?.skipVerification) {
|
||||
return { ok: true, reason: "verification skipped (dev mode)" };
|
||||
}
|
||||
|
||||
const signatureV3 = getHeader(ctx.headers, "x-plivo-signature-v3");
|
||||
const nonceV3 = getHeader(ctx.headers, "x-plivo-signature-v3-nonce");
|
||||
const signatureV2 = getHeader(ctx.headers, "x-plivo-signature-v2");
|
||||
const nonceV2 = getHeader(ctx.headers, "x-plivo-signature-v2-nonce");
|
||||
|
||||
const reconstructed = reconstructWebhookUrl(ctx);
|
||||
let verificationUrl = reconstructed;
|
||||
if (options?.publicUrl) {
|
||||
try {
|
||||
const req = new URL(reconstructed);
|
||||
const base = new URL(options.publicUrl);
|
||||
base.pathname = req.pathname;
|
||||
base.search = req.search;
|
||||
verificationUrl = base.toString();
|
||||
} catch {
|
||||
verificationUrl = reconstructed;
|
||||
}
|
||||
}
|
||||
|
||||
if (signatureV3 && nonceV3) {
|
||||
const method =
|
||||
ctx.method === "GET" || ctx.method === "POST" ? ctx.method : null;
|
||||
|
||||
if (!method) {
|
||||
return {
|
||||
ok: false,
|
||||
version: "v3",
|
||||
verificationUrl,
|
||||
reason: `Unsupported HTTP method for Plivo V3 signature: ${ctx.method}`,
|
||||
};
|
||||
}
|
||||
|
||||
const postParams = toParamMapFromSearchParams(new URLSearchParams(ctx.rawBody));
|
||||
const ok = validatePlivoV3Signature({
|
||||
authToken,
|
||||
signatureHeader: signatureV3,
|
||||
nonce: nonceV3,
|
||||
method,
|
||||
url: verificationUrl,
|
||||
postParams,
|
||||
});
|
||||
return ok
|
||||
? { ok: true, version: "v3", verificationUrl }
|
||||
: {
|
||||
ok: false,
|
||||
version: "v3",
|
||||
verificationUrl,
|
||||
reason: "Invalid Plivo V3 signature",
|
||||
};
|
||||
}
|
||||
|
||||
if (signatureV2 && nonceV2) {
|
||||
const ok = validatePlivoV2Signature({
|
||||
authToken,
|
||||
signature: signatureV2,
|
||||
nonce: nonceV2,
|
||||
url: verificationUrl,
|
||||
});
|
||||
return ok
|
||||
? { ok: true, version: "v2", verificationUrl }
|
||||
: {
|
||||
ok: false,
|
||||
version: "v2",
|
||||
verificationUrl,
|
||||
reason: "Invalid Plivo V2 signature",
|
||||
};
|
||||
}
|
||||
|
||||
return {
|
||||
ok: false,
|
||||
reason: "Missing Plivo signature headers (V3 or V2)",
|
||||
verificationUrl,
|
||||
};
|
||||
}
|
||||
491
docker-compose/ez-assistant/extensions/voice-call/src/webhook.ts
Normal file
491
docker-compose/ez-assistant/extensions/voice-call/src/webhook.ts
Normal file
@@ -0,0 +1,491 @@
|
||||
import { spawn } from "node:child_process";
|
||||
import http from "node:http";
|
||||
import { URL } from "node:url";
|
||||
|
||||
import type { VoiceCallConfig } from "./config.js";
|
||||
import type { CoreConfig } from "./core-bridge.js";
|
||||
import type { CallManager } from "./manager.js";
|
||||
import type { MediaStreamConfig } from "./media-stream.js";
|
||||
import { MediaStreamHandler } from "./media-stream.js";
|
||||
import type { VoiceCallProvider } from "./providers/base.js";
|
||||
import { OpenAIRealtimeSTTProvider } from "./providers/stt-openai-realtime.js";
|
||||
import type { TwilioProvider } from "./providers/twilio.js";
|
||||
import type { NormalizedEvent, WebhookContext } from "./types.js";
|
||||
|
||||
/**
 * HTTP server for receiving voice call webhooks from providers.
 * Supports WebSocket upgrades for media streams when streaming is enabled.
 */
export class VoiceCallWebhookServer {
  // Underlying Node HTTP server; null until start() and after stop().
  private server: http.Server | null = null;
  // Plugin configuration (serve port/bind/path, streaming options).
  private config: VoiceCallConfig;
  // Call lifecycle owner: receives normalized events and drives speech.
  private manager: CallManager;
  // Active telephony provider, used for webhook verification and parsing.
  private provider: VoiceCallProvider;
  // Core config used by the auto-responder; null disables auto-response.
  private coreConfig: CoreConfig | null;

  /** Media stream handler for bidirectional audio (when streaming enabled) */
  private mediaStreamHandler: MediaStreamHandler | null = null;

  constructor(
    config: VoiceCallConfig,
    manager: CallManager,
    provider: VoiceCallProvider,
    coreConfig?: CoreConfig,
  ) {
    this.config = config;
    this.manager = manager;
    this.provider = provider;
    this.coreConfig = coreConfig ?? null;

    // Initialize media stream handler if streaming is enabled
    if (config.streaming?.enabled) {
      this.initializeMediaStreaming();
    }
  }

  /**
   * Get the media stream handler (for wiring to provider).
   */
  getMediaStreamHandler(): MediaStreamHandler | null {
    return this.mediaStreamHandler;
  }

  /**
   * Initialize media streaming with OpenAI Realtime STT.
   *
   * Builds the STT provider and wires the media-stream callbacks:
   * transcripts feed the call manager (and may trigger auto-response),
   * speech-start clears queued TTS (barge-in), connect/disconnect
   * register the stream with the Twilio provider for TTS routing.
   * No-op (with a warning) when no OpenAI API key is available.
   */
  private initializeMediaStreaming(): void {
    const apiKey =
      this.config.streaming?.openaiApiKey || process.env.OPENAI_API_KEY;

    if (!apiKey) {
      console.warn(
        "[voice-call] Streaming enabled but no OpenAI API key found",
      );
      return;
    }

    const sttProvider = new OpenAIRealtimeSTTProvider({
      apiKey,
      model: this.config.streaming?.sttModel,
      silenceDurationMs: this.config.streaming?.silenceDurationMs,
      vadThreshold: this.config.streaming?.vadThreshold,
    });

    const streamConfig: MediaStreamConfig = {
      sttProvider,
      onTranscript: (providerCallId, transcript) => {
        console.log(
          `[voice-call] Transcript for ${providerCallId}: ${transcript}`,
        );

        // Clear TTS queue on barge-in (user started speaking, interrupt current playback)
        if (this.provider.name === "twilio") {
          (this.provider as TwilioProvider).clearTtsQueue(providerCallId);
        }

        // Look up our internal call ID from the provider call ID
        const call = this.manager.getCallByProviderCallId(providerCallId);
        if (!call) {
          console.warn(
            `[voice-call] No active call found for provider ID: ${providerCallId}`,
          );
          return;
        }

        // Create a speech event and process it through the manager
        const event: NormalizedEvent = {
          id: `stream-transcript-${Date.now()}`,
          type: "call.speech",
          callId: call.callId,
          providerCallId,
          timestamp: Date.now(),
          transcript,
          isFinal: true,
        };
        this.manager.processEvent(event);

        // Auto-respond in conversation mode (inbound always, outbound if mode is conversation)
        const callMode = call.metadata?.mode as string | undefined;
        const shouldRespond =
          call.direction === "inbound" || callMode === "conversation";
        if (shouldRespond) {
          // Fire-and-forget: response generation must not block the stream.
          this.handleInboundResponse(call.callId, transcript).catch((err) => {
            console.warn(`[voice-call] Failed to auto-respond:`, err);
          });
        }
      },
      onSpeechStart: (providerCallId) => {
        // Barge-in: stop any queued TTS as soon as the caller starts talking.
        if (this.provider.name === "twilio") {
          (this.provider as TwilioProvider).clearTtsQueue(providerCallId);
        }
      },
      onPartialTranscript: (callId, partial) => {
        console.log(`[voice-call] Partial for ${callId}: ${partial}`);
      },
      // NOTE(review): `callId` here appears to be the provider's call ID
      // (it is passed to registerCallStream) — confirm against
      // MediaStreamConfig's contract.
      onConnect: (callId, streamSid) => {
        console.log(
          `[voice-call] Media stream connected: ${callId} -> ${streamSid}`,
        );
        // Register stream with provider for TTS routing
        if (this.provider.name === "twilio") {
          (this.provider as TwilioProvider).registerCallStream(
            callId,
            streamSid,
          );
        }

        // Speak initial message if one was provided when call was initiated
        // Use setTimeout to allow stream setup to complete
        setTimeout(() => {
          this.manager.speakInitialMessage(callId).catch((err) => {
            console.warn(`[voice-call] Failed to speak initial message:`, err);
          });
        }, 500);
      },
      onDisconnect: (callId) => {
        console.log(`[voice-call] Media stream disconnected: ${callId}`);
        if (this.provider.name === "twilio") {
          (this.provider as TwilioProvider).unregisterCallStream(callId);
        }
      },
    };

    this.mediaStreamHandler = new MediaStreamHandler(streamConfig);
    console.log("[voice-call] Media streaming initialized");
  }

  /**
   * Start the webhook server.
   *
   * @returns The local HTTP URL the webhook endpoint is listening on.
   */
  async start(): Promise<string> {
    const { port, bind, path: webhookPath } = this.config.serve;
    const streamPath = this.config.streaming?.streamPath || "/voice/stream";

    return new Promise((resolve, reject) => {
      this.server = http.createServer((req, res) => {
        // Any unhandled error in request handling becomes a 500 rather
        // than an unhandled rejection.
        this.handleRequest(req, res, webhookPath).catch((err) => {
          console.error("[voice-call] Webhook error:", err);
          res.statusCode = 500;
          res.end("Internal Server Error");
        });
      });

      // Handle WebSocket upgrades for media streams
      if (this.mediaStreamHandler) {
        this.server.on("upgrade", (request, socket, head) => {
          const url = new URL(
            request.url || "/",
            `http://${request.headers.host}`,
          );

          if (url.pathname === streamPath) {
            console.log("[voice-call] WebSocket upgrade for media stream");
            this.mediaStreamHandler?.handleUpgrade(request, socket, head);
          } else {
            // Reject upgrades for any path other than the stream path.
            socket.destroy();
          }
        });
      }

      this.server.on("error", reject);

      this.server.listen(port, bind, () => {
        const url = `http://${bind}:${port}${webhookPath}`;
        console.log(`[voice-call] Webhook server listening on ${url}`);
        if (this.mediaStreamHandler) {
          console.log(
            `[voice-call] Media stream WebSocket on ws://${bind}:${port}${streamPath}`,
          );
        }
        resolve(url);
      });
    });
  }

  /**
   * Stop the webhook server. Resolves immediately when not running.
   */
  async stop(): Promise<void> {
    return new Promise((resolve) => {
      if (this.server) {
        this.server.close(() => {
          this.server = null;
          resolve();
        });
      } else {
        resolve();
      }
    });
  }

  /**
   * Handle incoming HTTP request: route-check, verify the provider
   * signature, parse provider events, feed them to the manager, and send
   * the provider-specific response body/headers.
   */
  private async handleRequest(
    req: http.IncomingMessage,
    res: http.ServerResponse,
    webhookPath: string,
  ): Promise<void> {
    const url = new URL(req.url || "/", `http://${req.headers.host}`);

    // Check path
    if (!url.pathname.startsWith(webhookPath)) {
      res.statusCode = 404;
      res.end("Not Found");
      return;
    }

    // Only accept POST
    if (req.method !== "POST") {
      res.statusCode = 405;
      res.end("Method Not Allowed");
      return;
    }

    // Read body
    const body = await this.readBody(req);

    // Build webhook context
    const ctx: WebhookContext = {
      headers: req.headers as Record<string, string | string[] | undefined>,
      rawBody: body,
      // NOTE(review): hardcodes the http scheme for the reconstructed URL;
      // signature checks against an https public URL rely on the publicUrl
      // override — confirm this matches the verifier's expectations.
      url: `http://${req.headers.host}${req.url}`,
      method: "POST",
      query: Object.fromEntries(url.searchParams),
      remoteAddress: req.socket.remoteAddress ?? undefined,
    };

    // Verify signature
    const verification = this.provider.verifyWebhook(ctx);
    if (!verification.ok) {
      console.warn(
        `[voice-call] Webhook verification failed: ${verification.reason}`,
      );
      res.statusCode = 401;
      res.end("Unauthorized");
      return;
    }

    // Parse events
    const result = this.provider.parseWebhookEvent(ctx);

    // Process each event; one bad event must not prevent the rest.
    for (const event of result.events) {
      try {
        this.manager.processEvent(event);
      } catch (err) {
        console.error(
          `[voice-call] Error processing event ${event.type}:`,
          err,
        );
      }
    }

    // Send response
    res.statusCode = result.statusCode || 200;

    if (result.providerResponseHeaders) {
      for (const [key, value] of Object.entries(
        result.providerResponseHeaders,
      )) {
        res.setHeader(key, value);
      }
    }

    res.end(result.providerResponseBody || "OK");
  }

  /**
   * Read request body as string (UTF-8).
   */
  private readBody(req: http.IncomingMessage): Promise<string> {
    return new Promise((resolve, reject) => {
      const chunks: Buffer[] = [];
      req.on("data", (chunk) => chunks.push(chunk));
      req.on("end", () => resolve(Buffer.concat(chunks).toString("utf-8")));
      req.on("error", reject);
    });
  }

  /**
   * Handle auto-response for inbound calls using the agent system.
   * Supports tool calling for richer voice interactions.
   *
   * Silently returns (with a warning) when the call is unknown or no core
   * config is available; all other errors are logged, never rethrown.
   */
  private async handleInboundResponse(
    callId: string,
    userMessage: string,
  ): Promise<void> {
    console.log(
      `[voice-call] Auto-responding to inbound call ${callId}: "${userMessage}"`,
    );

    // Get call context for conversation history
    const call = this.manager.getCall(callId);
    if (!call) {
      console.warn(`[voice-call] Call ${callId} not found for auto-response`);
      return;
    }

    if (!this.coreConfig) {
      console.warn("[voice-call] Core config missing; skipping auto-response");
      return;
    }

    try {
      // Lazy import keeps the response-generator out of the startup path.
      const { generateVoiceResponse } = await import("./response-generator.js");

      const result = await generateVoiceResponse({
        voiceConfig: this.config,
        coreConfig: this.coreConfig,
        callId,
        from: call.from,
        transcript: call.transcript,
        userMessage,
      });

      if (result.error) {
        console.error(
          `[voice-call] Response generation error: ${result.error}`,
        );
        return;
      }

      if (result.text) {
        console.log(`[voice-call] AI response: "${result.text}"`);
        await this.manager.speak(callId, result.text);
      }
    } catch (err) {
      console.error(`[voice-call] Auto-response error:`, err);
    }
  }
}
|
||||
|
||||
/**
 * Identity of the local machine on its tailnet, extracted from
 * `tailscale status --json` (`Self.DNSName` / `Self.ID`).
 */
export type TailscaleSelfInfo = {
  // MagicDNS name with the trailing dot stripped; null when unavailable.
  dnsName: string | null;
  // Tailscale node ID; null when unavailable.
  nodeId: string | null;
};
|
||||
|
||||
/**
|
||||
* Run a tailscale command with timeout, collecting stdout.
|
||||
*/
|
||||
function runTailscaleCommand(
|
||||
args: string[],
|
||||
timeoutMs = 2500,
|
||||
): Promise<{ code: number; stdout: string }> {
|
||||
return new Promise((resolve) => {
|
||||
const proc = spawn("tailscale", args, {
|
||||
stdio: ["ignore", "pipe", "pipe"],
|
||||
});
|
||||
|
||||
let stdout = "";
|
||||
proc.stdout.on("data", (data) => {
|
||||
stdout += data;
|
||||
});
|
||||
|
||||
const timer = setTimeout(() => {
|
||||
proc.kill("SIGKILL");
|
||||
resolve({ code: -1, stdout: "" });
|
||||
}, timeoutMs);
|
||||
|
||||
proc.on("close", (code) => {
|
||||
clearTimeout(timer);
|
||||
resolve({ code: code ?? -1, stdout });
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
export async function getTailscaleSelfInfo(): Promise<TailscaleSelfInfo | null> {
|
||||
const { code, stdout } = await runTailscaleCommand(["status", "--json"]);
|
||||
if (code !== 0) return null;
|
||||
|
||||
try {
|
||||
const status = JSON.parse(stdout);
|
||||
return {
|
||||
dnsName: status.Self?.DNSName?.replace(/\.$/, "") || null,
|
||||
nodeId: status.Self?.ID || null,
|
||||
};
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
export async function getTailscaleDnsName(): Promise<string | null> {
|
||||
const info = await getTailscaleSelfInfo();
|
||||
return info?.dnsName ?? null;
|
||||
}
|
||||
|
||||
export async function setupTailscaleExposureRoute(opts: {
|
||||
mode: "serve" | "funnel";
|
||||
path: string;
|
||||
localUrl: string;
|
||||
}): Promise<string | null> {
|
||||
const dnsName = await getTailscaleDnsName();
|
||||
if (!dnsName) {
|
||||
console.warn("[voice-call] Could not get Tailscale DNS name");
|
||||
return null;
|
||||
}
|
||||
|
||||
const { code } = await runTailscaleCommand([
|
||||
opts.mode,
|
||||
"--bg",
|
||||
"--yes",
|
||||
"--set-path",
|
||||
opts.path,
|
||||
opts.localUrl,
|
||||
]);
|
||||
|
||||
if (code === 0) {
|
||||
const publicUrl = `https://${dnsName}${opts.path}`;
|
||||
console.log(`[voice-call] Tailscale ${opts.mode} active: ${publicUrl}`);
|
||||
return publicUrl;
|
||||
}
|
||||
|
||||
console.warn(`[voice-call] Tailscale ${opts.mode} failed`);
|
||||
return null;
|
||||
}
|
||||
|
||||
export async function cleanupTailscaleExposureRoute(opts: {
|
||||
mode: "serve" | "funnel";
|
||||
path: string;
|
||||
}): Promise<void> {
|
||||
await runTailscaleCommand([opts.mode, "off", opts.path]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Setup Tailscale serve/funnel for the webhook server.
|
||||
* This is a helper that shells out to `tailscale serve` or `tailscale funnel`.
|
||||
*/
|
||||
export async function setupTailscaleExposure(
|
||||
config: VoiceCallConfig,
|
||||
): Promise<string | null> {
|
||||
if (config.tailscale.mode === "off") {
|
||||
return null;
|
||||
}
|
||||
|
||||
const mode = config.tailscale.mode === "funnel" ? "funnel" : "serve";
|
||||
// Include the path suffix so tailscale forwards to the correct endpoint
|
||||
// (tailscale strips the mount path prefix when proxying)
|
||||
const localUrl = `http://127.0.0.1:${config.serve.port}${config.serve.path}`;
|
||||
return setupTailscaleExposureRoute({
|
||||
mode,
|
||||
path: config.tailscale.path,
|
||||
localUrl,
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Cleanup Tailscale serve/funnel.
|
||||
*/
|
||||
export async function cleanupTailscaleExposure(
|
||||
config: VoiceCallConfig,
|
||||
): Promise<void> {
|
||||
if (config.tailscale.mode === "off") {
|
||||
return;
|
||||
}
|
||||
|
||||
const mode = config.tailscale.mode === "funnel" ? "funnel" : "serve";
|
||||
await cleanupTailscaleExposureRoute({ mode, path: config.tailscale.path });
|
||||
}
|
||||
Reference in New Issue
Block a user