Add ez-assistant and kerberos service folders
@@ -0,0 +1,729 @@
import AVFAudio
import Foundation
import MoltbotKit
import MoltbotProtocol
import Observation
import OSLog
import Speech
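
/// Coordinates hands-free talk mode: listens via SFSpeechRecognizer, detects
/// end-of-utterance after a short silence window, sends the transcript to the
/// gateway as a chat message, and speaks the assistant's reply via ElevenLabs,
/// falling back to the system voice when no API key or voice resolves.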
@MainActor
@Observable
final class TalkModeManager: NSObject {
    private typealias SpeechRequest = SFSpeechAudioBufferRecognitionRequest

    private static let defaultModelIdFallback = "eleven_v3"

    var isEnabled: Bool = false
    var isListening: Bool = false
    var isSpeaking: Bool = false
    var statusText: String = "Off"

    private let audioEngine = AVAudioEngine()
    private var speechRecognizer: SFSpeechRecognizer?
    private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
    private var recognitionTask: SFSpeechRecognitionTask?
    private var silenceTask: Task<Void, Never>?

    private var lastHeard: Date?
    private var lastTranscript: String = ""
    private var lastSpokenText: String?
    private var lastInterruptedAtSeconds: Double?

    private var defaultVoiceId: String?
    private var currentVoiceId: String?
    private var defaultModelId: String?
    private var currentModelId: String?
    private var voiceOverrideActive = false
    private var modelOverrideActive = false
    private var defaultOutputFormat: String?
    private var apiKey: String?
    private var voiceAliases: [String: String] = [:]
    private var interruptOnSpeech: Bool = true
    private var mainSessionKey: String = "main"
    private var fallbackVoiceId: String?
    private var lastPlaybackWasPCM: Bool = false
    var pcmPlayer: PCMStreamingAudioPlaying = PCMStreamingAudioPlayer.shared
    var mp3Player: StreamingAudioPlaying = StreamingAudioPlayer.shared

    private var gateway: GatewayNodeSession?
    private let silenceWindow: TimeInterval = 0.7

    private var chatSubscribedSessionKeys = Set<String>()

    private let logger = Logger(subsystem: "bot.molt", category: "TalkMode")

    func attachGateway(_ gateway: GatewayNodeSession) {
        self.gateway = gateway
    }

    func updateMainSessionKey(_ sessionKey: String?) {
        let trimmed = (sessionKey ?? "").trimmingCharacters(in: .whitespacesAndNewlines)
        guard !trimmed.isEmpty else { return }
        if SessionKey.isCanonicalMainSessionKey(self.mainSessionKey) { return }
        self.mainSessionKey = trimmed
    }

    func setEnabled(_ enabled: Bool) {
        self.isEnabled = enabled
        if enabled {
            self.logger.info("enabled")
            Task { await self.start() }
        } else {
            self.logger.info("disabled")
            self.stop()
        }
    }

    func start() async {
        guard self.isEnabled else { return }
        if self.isListening { return }

        self.logger.info("start")
        self.statusText = "Requesting permissions…"
        let micOk = await Self.requestMicrophonePermission()
        guard micOk else {
            self.logger.warning("start blocked: microphone permission denied")
            self.statusText = "Microphone permission denied"
            return
        }
        let speechOk = await Self.requestSpeechPermission()
        guard speechOk else {
            self.logger.warning("start blocked: speech permission denied")
            self.statusText = "Speech recognition permission denied"
            return
        }

        await self.reloadConfig()
        do {
            try Self.configureAudioSession()
            try self.startRecognition()
            self.isListening = true
            self.statusText = "Listening"
            self.startSilenceMonitor()
            await self.subscribeChatIfNeeded(sessionKey: self.mainSessionKey)
            self.logger.info("listening")
        } catch {
            self.isListening = false
            self.statusText = "Start failed: \(error.localizedDescription)"
            self.logger.error("start failed: \(error.localizedDescription, privacy: .public)")
        }
    }

    func stop() {
        self.isEnabled = false
        self.isListening = false
        self.statusText = "Off"
        self.lastTranscript = ""
        self.lastHeard = nil
        self.silenceTask?.cancel()
        self.silenceTask = nil
        self.stopRecognition()
        self.stopSpeaking()
        self.lastInterruptedAtSeconds = nil
        TalkSystemSpeechSynthesizer.shared.stop()
        do {
            try AVAudioSession.sharedInstance().setActive(false, options: [.notifyOthersOnDeactivation])
        } catch {
            self.logger.warning("audio session deactivate failed: \(error.localizedDescription, privacy: .public)")
        }
        Task { await self.unsubscribeAllChats() }
    }

    func userTappedOrb() {
        self.stopSpeaking()
    }

    private func startRecognition() throws {
        #if targetEnvironment(simulator)
        throw NSError(domain: "TalkMode", code: 2, userInfo: [
            NSLocalizedDescriptionKey: "Talk mode is not supported on the iOS simulator",
        ])
        #endif

        self.stopRecognition()
        self.speechRecognizer = SFSpeechRecognizer()
        guard let recognizer = self.speechRecognizer else {
            throw NSError(domain: "TalkMode", code: 1, userInfo: [
                NSLocalizedDescriptionKey: "Speech recognizer unavailable",
            ])
        }

        self.recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
        self.recognitionRequest?.shouldReportPartialResults = true
        guard let request = self.recognitionRequest else { return }

        let input = self.audioEngine.inputNode
        let format = input.outputFormat(forBus: 0)
        guard format.sampleRate > 0, format.channelCount > 0 else {
            throw NSError(domain: "TalkMode", code: 3, userInfo: [
                NSLocalizedDescriptionKey: "Invalid audio input format",
            ])
        }
        input.removeTap(onBus: 0)
        let tapBlock = Self.makeAudioTapAppendCallback(request: request)
        input.installTap(onBus: 0, bufferSize: 2048, format: format, block: tapBlock)

        self.audioEngine.prepare()
        try self.audioEngine.start()

        self.recognitionTask = recognizer.recognitionTask(with: request) { [weak self] result, error in
            guard let self else { return }
            if let error {
                if !self.isSpeaking {
                    self.statusText = "Speech error: \(error.localizedDescription)"
                }
                self.logger.debug("speech recognition error: \(error.localizedDescription, privacy: .public)")
            }
            guard let result else { return }
            let transcript = result.bestTranscription.formattedString
            Task { @MainActor in
                await self.handleTranscript(transcript: transcript, isFinal: result.isFinal)
            }
        }
    }

    private func stopRecognition() {
        self.recognitionTask?.cancel()
        self.recognitionTask = nil
        self.recognitionRequest?.endAudio()
        self.recognitionRequest = nil
        self.audioEngine.inputNode.removeTap(onBus: 0)
        self.audioEngine.stop()
        self.speechRecognizer = nil
    }
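
    // The tap block runs on the realtime audio thread; keep it tiny and
    // nonisolated, forwarding buffers straight into the recognition request.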
    private nonisolated static func makeAudioTapAppendCallback(request: SpeechRequest) -> AVAudioNodeTapBlock {
        { buffer, _ in
            request.append(buffer)
        }
    }

    private func handleTranscript(transcript: String, isFinal: Bool) async {
        let trimmed = transcript.trimmingCharacters(in: .whitespacesAndNewlines)
        if self.isSpeaking, self.interruptOnSpeech {
            if self.shouldInterrupt(with: trimmed) {
                self.stopSpeaking()
            }
            return
        }

        guard self.isListening else { return }
        if !trimmed.isEmpty {
            self.lastTranscript = trimmed
            self.lastHeard = Date()
        }
        if isFinal {
            self.lastTranscript = trimmed
        }
    }
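
    // End-of-utterance detection: poll every 200 ms and finalize once nothing
    // new has been heard for `silenceWindow` (0.7 s).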
    private func startSilenceMonitor() {
        self.silenceTask?.cancel()
        self.silenceTask = Task { [weak self] in
            guard let self else { return }
            while self.isEnabled {
                try? await Task.sleep(nanoseconds: 200_000_000)
                await self.checkSilence()
            }
        }
    }

    private func checkSilence() async {
        guard self.isListening, !self.isSpeaking else { return }
        let transcript = self.lastTranscript.trimmingCharacters(in: .whitespacesAndNewlines)
        guard !transcript.isEmpty else { return }
        guard let lastHeard else { return }
        if Date().timeIntervalSince(lastHeard) < self.silenceWindow { return }
        await self.finalizeTranscript(transcript)
    }
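
    /// Sends the finished transcript to the gateway, waits for the chat run to
    /// complete, fetches the assistant's reply from history, speaks it, and
    /// then resumes listening.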
    private func finalizeTranscript(_ transcript: String) async {
        self.isListening = false
        self.statusText = "Thinking…"
        self.lastTranscript = ""
        self.lastHeard = nil
        self.stopRecognition()

        await self.reloadConfig()
        let prompt = self.buildPrompt(transcript: transcript)
        guard let gateway else {
            self.statusText = "Gateway not connected"
            self.logger.warning("finalize: gateway not connected")
            await self.start()
            return
        }

        do {
            let startedAt = Date().timeIntervalSince1970
            let sessionKey = self.mainSessionKey
            await self.subscribeChatIfNeeded(sessionKey: sessionKey)
            self.logger.info(
                "chat.send start sessionKey=\(sessionKey, privacy: .public) chars=\(prompt.count, privacy: .public)")
            let runId = try await self.sendChat(prompt, gateway: gateway)
            self.logger.info("chat.send ok runId=\(runId, privacy: .public)")
            let completion = await self.waitForChatCompletion(runId: runId, gateway: gateway, timeoutSeconds: 120)
            if completion == .timeout {
                self.logger.warning(
                    "chat completion timeout runId=\(runId, privacy: .public); attempting history fallback")
            } else if completion == .aborted {
                self.statusText = "Aborted"
                self.logger.warning("chat completion aborted runId=\(runId, privacy: .public)")
                await self.start()
                return
            } else if completion == .error {
                self.statusText = "Chat error"
                self.logger.warning("chat completion error runId=\(runId, privacy: .public)")
                await self.start()
                return
            }

            guard let assistantText = try await self.waitForAssistantText(
                gateway: gateway,
                since: startedAt,
                timeoutSeconds: completion == .final ? 12 : 25)
            else {
                self.statusText = "No reply"
                self.logger.warning("assistant text timeout runId=\(runId, privacy: .public)")
                await self.start()
                return
            }
            self.logger.info("assistant text ok chars=\(assistantText.count, privacy: .public)")
            await self.playAssistant(text: assistantText)
        } catch {
            self.statusText = "Talk failed: \(error.localizedDescription)"
            self.logger.error("finalize failed: \(error.localizedDescription, privacy: .public)")
        }

        await self.start()
    }

    private func subscribeChatIfNeeded(sessionKey: String) async {
        let key = sessionKey.trimmingCharacters(in: .whitespacesAndNewlines)
        guard !key.isEmpty else { return }
        guard let gateway else { return }
        guard !self.chatSubscribedSessionKeys.contains(key) else { return }

        let payload = "{\"sessionKey\":\"\(key)\"}"
        await gateway.sendEvent(event: "chat.subscribe", payloadJSON: payload)
        self.chatSubscribedSessionKeys.insert(key)
        self.logger.info("chat.subscribe ok sessionKey=\(key, privacy: .public)")
    }

    private func unsubscribeAllChats() async {
        guard let gateway else { return }
        let keys = self.chatSubscribedSessionKeys
        self.chatSubscribedSessionKeys.removeAll()
        for key in keys {
            let payload = "{\"sessionKey\":\"\(key)\"}"
            await gateway.sendEvent(event: "chat.unsubscribe", payloadJSON: payload)
        }
    }

    private func buildPrompt(transcript: String) -> String {
        let interrupted = self.lastInterruptedAtSeconds
        self.lastInterruptedAtSeconds = nil
        return TalkPromptBuilder.build(transcript: transcript, interruptedAtSeconds: interrupted)
    }

    private enum ChatCompletionState: CustomStringConvertible {
        case final
        case aborted
        case error
        case timeout

        var description: String {
            switch self {
            case .final: "final"
            case .aborted: "aborted"
            case .error: "error"
            case .timeout: "timeout"
            }
        }
    }

    private func sendChat(_ message: String, gateway: GatewayNodeSession) async throws -> String {
        struct SendResponse: Decodable { let runId: String }
        let payload: [String: Any] = [
            "sessionKey": self.mainSessionKey,
            "message": message,
            "thinking": "low",
            "timeoutMs": 30000,
            "idempotencyKey": UUID().uuidString,
        ]
        let data = try JSONSerialization.data(withJSONObject: payload)
        guard let json = String(bytes: data, encoding: .utf8) else {
            throw NSError(
                domain: "TalkModeManager",
                code: 1,
                userInfo: [NSLocalizedDescriptionKey: "Failed to encode chat payload"])
        }
        let res = try await gateway.request(method: "chat.send", paramsJSON: json, timeoutSeconds: 30)
        let decoded = try JSONDecoder().decode(SendResponse.self, from: res)
        return decoded.runId
    }
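
    // Races the gateway event stream against a timeout: the first task in the
    // group to return wins, and the loser is cancelled.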
    private func waitForChatCompletion(
        runId: String,
        gateway: GatewayNodeSession,
        timeoutSeconds: Int = 120) async -> ChatCompletionState
    {
        let stream = await gateway.subscribeServerEvents(bufferingNewest: 200)
        return await withTaskGroup(of: ChatCompletionState.self) { group in
            group.addTask { [runId] in
                for await evt in stream {
                    if Task.isCancelled { return .timeout }
                    guard evt.event == "chat", let payload = evt.payload else { continue }
                    guard let chatEvent = try? GatewayPayloadDecoding.decode(payload, as: ChatEvent.self) else {
                        continue
                    }
                    guard chatEvent.runid == runId else { continue }
                    if let state = chatEvent.state.value as? String {
                        switch state {
                        case "final": return .final
                        case "aborted": return .aborted
                        case "error": return .error
                        default: break
                        }
                    }
                }
                return .timeout
            }
            group.addTask {
                try? await Task.sleep(nanoseconds: UInt64(timeoutSeconds) * 1_000_000_000)
                return .timeout
            }
            let result = await group.next() ?? .timeout
            group.cancelAll()
            return result
        }
    }

    private func waitForAssistantText(
        gateway: GatewayNodeSession,
        since: Double,
        timeoutSeconds: Int) async throws -> String?
    {
        let deadline = Date().addingTimeInterval(TimeInterval(timeoutSeconds))
        while Date() < deadline {
            if let text = try await self.fetchLatestAssistantText(gateway: gateway, since: since) {
                return text
            }
            try? await Task.sleep(nanoseconds: 300_000_000)
        }
        return nil
    }

    private func fetchLatestAssistantText(gateway: GatewayNodeSession, since: Double? = nil) async throws -> String? {
        let res = try await gateway.request(
            method: "chat.history",
            paramsJSON: "{\"sessionKey\":\"\(self.mainSessionKey)\"}",
            timeoutSeconds: 15)
        guard let json = try JSONSerialization.jsonObject(with: res) as? [String: Any] else { return nil }
        guard let messages = json["messages"] as? [[String: Any]] else { return nil }
        for msg in messages.reversed() {
            guard (msg["role"] as? String) == "assistant" else { continue }
            if let since, let timestamp = msg["timestamp"] as? Double,
               TalkHistoryTimestamp.isAfter(timestamp, sinceSeconds: since) == false
            {
                continue
            }
            guard let content = msg["content"] as? [[String: Any]] else { continue }
            let text = content.compactMap { $0["text"] as? String }.joined(separator: "\n")
            let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
            if !trimmed.isEmpty { return trimmed }
        }
        return nil
    }
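
    /// Speaks the assistant reply: parses any inline TTS directive, streams
    /// audio from ElevenLabs (PCM first, retrying as MP3 if PCM playback
    /// fails), and falls back to the system synthesizer when no API key or
    /// voice resolves.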
    private func playAssistant(text: String) async {
        let parsed = TalkDirectiveParser.parse(text)
        let directive = parsed.directive
        let cleaned = parsed.stripped.trimmingCharacters(in: .whitespacesAndNewlines)
        guard !cleaned.isEmpty else { return }

        let requestedVoice = directive?.voiceId?.trimmingCharacters(in: .whitespacesAndNewlines)
        let resolvedVoice = self.resolveVoiceAlias(requestedVoice)
        if requestedVoice?.isEmpty == false, resolvedVoice == nil {
            self.logger.warning("unknown voice alias \(requestedVoice ?? "?", privacy: .public)")
        }
        if let voice = resolvedVoice {
            if directive?.once != true {
                self.currentVoiceId = voice
                self.voiceOverrideActive = true
            }
        }
        if let model = directive?.modelId {
            if directive?.once != true {
                self.currentModelId = model
                self.modelOverrideActive = true
            }
        }

        self.statusText = "Generating voice…"
        self.isSpeaking = true
        self.lastSpokenText = cleaned

        do {
            let started = Date()
            let language = ElevenLabsTTSClient.validatedLanguage(directive?.language)

            let resolvedKey =
                (self.apiKey?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? self.apiKey : nil) ??
                ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"]
            let apiKey = resolvedKey?.trimmingCharacters(in: .whitespacesAndNewlines)
            let preferredVoice = resolvedVoice ?? self.currentVoiceId ?? self.defaultVoiceId
            let voiceId: String? = if let apiKey, !apiKey.isEmpty {
                await self.resolveVoiceId(preferred: preferredVoice, apiKey: apiKey)
            } else {
                nil
            }
            let canUseElevenLabs = (voiceId?.isEmpty == false) && (apiKey?.isEmpty == false)

            if canUseElevenLabs, let voiceId, let apiKey {
                let desiredOutputFormat = (directive?.outputFormat ?? self.defaultOutputFormat)?
                    .trimmingCharacters(in: .whitespacesAndNewlines)
                let requestedOutputFormat = (desiredOutputFormat?.isEmpty == false) ? desiredOutputFormat : nil
                let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(requestedOutputFormat ?? "pcm_44100")
                if outputFormat == nil, let requestedOutputFormat {
                    self.logger.warning(
                        "talk output_format unsupported for local playback: \(requestedOutputFormat, privacy: .public)")
                }

                let modelId = directive?.modelId ?? self.currentModelId ?? self.defaultModelId
                func makeRequest(outputFormat: String?) -> ElevenLabsTTSRequest {
                    ElevenLabsTTSRequest(
                        text: cleaned,
                        modelId: modelId,
                        outputFormat: outputFormat,
                        speed: TalkTTSValidation.resolveSpeed(speed: directive?.speed, rateWPM: directive?.rateWPM),
                        stability: TalkTTSValidation.validatedStability(directive?.stability, modelId: modelId),
                        similarity: TalkTTSValidation.validatedUnit(directive?.similarity),
                        style: TalkTTSValidation.validatedUnit(directive?.style),
                        speakerBoost: directive?.speakerBoost,
                        seed: TalkTTSValidation.validatedSeed(directive?.seed),
                        normalize: ElevenLabsTTSClient.validatedNormalize(directive?.normalize),
                        language: language,
                        latencyTier: TalkTTSValidation.validatedLatencyTier(directive?.latencyTier))
                }

                let request = makeRequest(outputFormat: outputFormat)

                let client = ElevenLabsTTSClient(apiKey: apiKey)
                let stream = client.streamSynthesize(voiceId: voiceId, request: request)

                if self.interruptOnSpeech {
                    do {
                        try self.startRecognition()
                    } catch {
                        self.logger.warning(
                            "startRecognition during speak failed: \(error.localizedDescription, privacy: .public)")
                    }
                }

                self.statusText = "Speaking…"
                let sampleRate = TalkTTSValidation.pcmSampleRate(from: outputFormat)
                let result: StreamingPlaybackResult
                if let sampleRate {
                    self.lastPlaybackWasPCM = true
                    var playback = await self.pcmPlayer.play(stream: stream, sampleRate: sampleRate)
                    if !playback.finished, playback.interruptedAt == nil {
                        let mp3Format = ElevenLabsTTSClient.validatedOutputFormat("mp3_44100")
                        self.logger.warning("pcm playback failed; retrying mp3")
                        self.lastPlaybackWasPCM = false
                        let mp3Stream = client.streamSynthesize(
                            voiceId: voiceId,
                            request: makeRequest(outputFormat: mp3Format))
                        playback = await self.mp3Player.play(stream: mp3Stream)
                    }
                    result = playback
                } else {
                    self.lastPlaybackWasPCM = false
                    result = await self.mp3Player.play(stream: stream)
                }
                let duration = Date().timeIntervalSince(started)
                self.logger.info("elevenlabs stream finished=\(result.finished, privacy: .public) dur=\(duration, privacy: .public)s")
                if !result.finished, let interruptedAt = result.interruptedAt {
                    self.lastInterruptedAtSeconds = interruptedAt
                }
            } else {
                self.logger.warning("tts unavailable; falling back to system voice (missing key or voiceId)")
                if self.interruptOnSpeech {
                    do {
                        try self.startRecognition()
                    } catch {
                        self.logger.warning(
                            "startRecognition during speak failed: \(error.localizedDescription, privacy: .public)")
                    }
                }
                self.statusText = "Speaking (System)…"
                try await TalkSystemSpeechSynthesizer.shared.speak(text: cleaned, language: language)
            }
        } catch {
            self.logger.error(
                "tts failed: \(error.localizedDescription, privacy: .public); falling back to system voice")
            do {
                if self.interruptOnSpeech {
                    do {
                        try self.startRecognition()
                    } catch {
                        self.logger.warning(
                            "startRecognition during speak failed: \(error.localizedDescription, privacy: .public)")
                    }
                }
                self.statusText = "Speaking (System)…"
                let language = ElevenLabsTTSClient.validatedLanguage(directive?.language)
                try await TalkSystemSpeechSynthesizer.shared.speak(text: cleaned, language: language)
            } catch {
                self.statusText = "Speak failed: \(error.localizedDescription)"
                self.logger.error("system voice failed: \(error.localizedDescription, privacy: .public)")
            }
        }

        self.stopRecognition()
        self.isSpeaking = false
    }
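
    // Stops whichever player is active first (capturing the interruption
    // offset), then defensively stops the other player and the system voice.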
    private func stopSpeaking(storeInterruption: Bool = true) {
        guard self.isSpeaking else { return }
        let interruptedAt = self.lastPlaybackWasPCM
            ? self.pcmPlayer.stop()
            : self.mp3Player.stop()
        if storeInterruption {
            self.lastInterruptedAtSeconds = interruptedAt
        }
        _ = self.lastPlaybackWasPCM
            ? self.mp3Player.stop()
            : self.pcmPlayer.stop()
        TalkSystemSpeechSynthesizer.shared.stop()
        self.isSpeaking = false
    }

    private func shouldInterrupt(with transcript: String) -> Bool {
        let trimmed = transcript.trimmingCharacters(in: .whitespacesAndNewlines)
        guard trimmed.count >= 3 else { return false }
        if let spoken = self.lastSpokenText?.lowercased(), spoken.contains(trimmed.lowercased()) {
            return false
        }
        return true
    }
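
    // Alias resolution order: the configured alias map, then an ID already
    // present among the alias values, then anything that merely looks like a
    // raw ElevenLabs voice ID.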
    private func resolveVoiceAlias(_ value: String?) -> String? {
        let trimmed = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines)
        guard !trimmed.isEmpty else { return nil }
        let normalized = trimmed.lowercased()
        if let mapped = self.voiceAliases[normalized] { return mapped }
        if self.voiceAliases.values.contains(where: { $0.caseInsensitiveCompare(trimmed) == .orderedSame }) {
            return trimmed
        }
        return Self.isLikelyVoiceId(trimmed) ? trimmed : nil
    }

    private func resolveVoiceId(preferred: String?, apiKey: String) async -> String? {
        let trimmed = preferred?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
        if !trimmed.isEmpty {
            if let resolved = self.resolveVoiceAlias(trimmed) { return resolved }
            self.logger.warning("unknown voice alias \(trimmed, privacy: .public)")
        }
        if let fallbackVoiceId { return fallbackVoiceId }

        do {
            let voices = try await ElevenLabsTTSClient(apiKey: apiKey).listVoices()
            guard let first = voices.first else {
                self.logger.warning("elevenlabs voices list empty")
                return nil
            }
            self.fallbackVoiceId = first.voiceId
            if self.defaultVoiceId == nil {
                self.defaultVoiceId = first.voiceId
            }
            if !self.voiceOverrideActive {
                self.currentVoiceId = first.voiceId
            }
            let name = first.name ?? "unknown"
            self.logger
                .info("default voice selected \(name, privacy: .public) (\(first.voiceId, privacy: .public))")
            return first.voiceId
        } catch {
            self.logger.error("elevenlabs list voices failed: \(error.localizedDescription, privacy: .public)")
            return nil
        }
    }

    private static func isLikelyVoiceId(_ value: String) -> Bool {
        guard value.count >= 10 else { return false }
        return value.allSatisfy { $0.isLetter || $0.isNumber || $0 == "-" || $0 == "_" }
    }

    private func reloadConfig() async {
        guard let gateway else { return }
        do {
            let res = try await gateway.request(method: "config.get", paramsJSON: "{}", timeoutSeconds: 8)
            guard let json = try JSONSerialization.jsonObject(with: res) as? [String: Any] else { return }
            guard let config = json["config"] as? [String: Any] else { return }
            let talk = config["talk"] as? [String: Any]
            let session = config["session"] as? [String: Any]
            let mainKey = SessionKey.normalizeMainKey(session?["mainKey"] as? String)
            if !SessionKey.isCanonicalMainSessionKey(self.mainSessionKey) {
                self.mainSessionKey = mainKey
            }
            self.defaultVoiceId = (talk?["voiceId"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines)
            if let aliases = talk?["voiceAliases"] as? [String: Any] {
                var resolved: [String: String] = [:]
                for (key, value) in aliases {
                    guard let id = value as? String else { continue }
                    let normalizedKey = key.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
                    let trimmedId = id.trimmingCharacters(in: .whitespacesAndNewlines)
                    guard !normalizedKey.isEmpty, !trimmedId.isEmpty else { continue }
                    resolved[normalizedKey] = trimmedId
                }
                self.voiceAliases = resolved
            } else {
                self.voiceAliases = [:]
            }
            if !self.voiceOverrideActive {
                self.currentVoiceId = self.defaultVoiceId
            }
            let model = (talk?["modelId"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines)
            self.defaultModelId = (model?.isEmpty == false) ? model : Self.defaultModelIdFallback
            if !self.modelOverrideActive {
                self.currentModelId = self.defaultModelId
            }
            self.defaultOutputFormat = (talk?["outputFormat"] as? String)?
                .trimmingCharacters(in: .whitespacesAndNewlines)
            self.apiKey = (talk?["apiKey"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines)
            if let interrupt = talk?["interruptOnSpeech"] as? Bool {
                self.interruptOnSpeech = interrupt
            }
        } catch {
            self.defaultModelId = Self.defaultModelIdFallback
            if !self.modelOverrideActive {
                self.currentModelId = self.defaultModelId
            }
        }
    }

    private static func configureAudioSession() throws {
        let session = AVAudioSession.sharedInstance()
        try session.setCategory(.playAndRecord, mode: .voiceChat, options: [
            .duckOthers,
            .mixWithOthers,
            .allowBluetoothHFP,
            .defaultToSpeaker,
        ])
        try session.setActive(true, options: [])
    }

    private nonisolated static func requestMicrophonePermission() async -> Bool {
        await withCheckedContinuation(isolation: nil) { cont in
            AVAudioApplication.requestRecordPermission { ok in
                cont.resume(returning: ok)
            }
        }
    }

    private nonisolated static func requestSpeechPermission() async -> Bool {
        await withCheckedContinuation(isolation: nil) { cont in
            SFSpeechRecognizer.requestAuthorization { status in
                cont.resume(returning: status == .authorized)
            }
        }
    }
}
@@ -0,0 +1,70 @@
import SwiftUI

struct TalkOrbOverlay: View {
    @Environment(NodeAppModel.self) private var appModel
    @State private var pulse: Bool = false

    var body: some View {
        let seam = self.appModel.seamColor
        let status = self.appModel.talkMode.statusText.trimmingCharacters(in: .whitespacesAndNewlines)

        VStack(spacing: 14) {
            ZStack {
                Circle()
                    .stroke(seam.opacity(0.26), lineWidth: 2)
                    .frame(width: 320, height: 320)
                    .scaleEffect(self.pulse ? 1.15 : 0.96)
                    .opacity(self.pulse ? 0.0 : 1.0)
                    .animation(.easeOut(duration: 1.3).repeatForever(autoreverses: false), value: self.pulse)

                Circle()
                    .stroke(seam.opacity(0.18), lineWidth: 2)
                    .frame(width: 320, height: 320)
                    .scaleEffect(self.pulse ? 1.45 : 1.02)
                    .opacity(self.pulse ? 0.0 : 0.9)
                    .animation(.easeOut(duration: 1.9).repeatForever(autoreverses: false).delay(0.2), value: self.pulse)

                Circle()
                    .fill(
                        RadialGradient(
                            colors: [
                                seam.opacity(0.95),
                                seam.opacity(0.40),
                                Color.black.opacity(0.55),
                            ],
                            center: .center,
                            startRadius: 1,
                            endRadius: 112))
                    .frame(width: 190, height: 190)
                    .overlay(
                        Circle()
                            .stroke(seam.opacity(0.35), lineWidth: 1))
                    .shadow(color: seam.opacity(0.32), radius: 26, x: 0, y: 0)
                    .shadow(color: Color.black.opacity(0.50), radius: 22, x: 0, y: 10)
            }
            .contentShape(Circle())
            .onTapGesture {
                self.appModel.talkMode.userTappedOrb()
            }

            if !status.isEmpty, status != "Off" {
                Text(status)
                    .font(.system(.footnote, design: .rounded).weight(.semibold))
                    .foregroundStyle(Color.white.opacity(0.92))
                    .padding(.horizontal, 12)
                    .padding(.vertical, 8)
                    .background(
                        Capsule()
                            .fill(Color.black.opacity(0.40))
                            .overlay(
                                Capsule().stroke(seam.opacity(0.22), lineWidth: 1)))
            }
        }
        .padding(28)
        .onAppear {
            self.pulse = true
        }
        .accessibilityElement(children: .combine)
        .accessibilityLabel("Talk Mode \(status)")
    }
}
@@ -0,0 +1,46 @@
import SwiftUI

struct VoiceTab: View {
    @Environment(NodeAppModel.self) private var appModel
    @Environment(VoiceWakeManager.self) private var voiceWake
    @AppStorage("voiceWake.enabled") private var voiceWakeEnabled: Bool = false
    @AppStorage("talk.enabled") private var talkEnabled: Bool = false

    var body: some View {
        NavigationStack {
            List {
                Section("Status") {
                    LabeledContent("Voice Wake", value: self.voiceWakeEnabled ? "Enabled" : "Disabled")
                    LabeledContent("Listener", value: self.voiceWake.isListening ? "Listening" : "Idle")
                    Text(self.voiceWake.statusText)
                        .font(.footnote)
                        .foregroundStyle(.secondary)
                    LabeledContent("Talk Mode", value: self.talkEnabled ? "Enabled" : "Disabled")
                }

                Section("Notes") {
                    let triggers = self.voiceWake.activeTriggerWords
                    Group {
                        if triggers.isEmpty {
                            Text("Add wake words in Settings.")
                        } else if triggers.count == 1 {
                            Text("Say “\(triggers[0]) …” to trigger.")
                        } else if triggers.count == 2 {
                            Text("Say “\(triggers[0]) …” or “\(triggers[1]) …” to trigger.")
                        } else {
                            Text("Say “\(triggers.joined(separator: " …”, “")) …” to trigger.")
                        }
                    }
                    .foregroundStyle(.secondary)
                }
            }
            .navigationTitle("Voice")
            .onChange(of: self.voiceWakeEnabled) { _, newValue in
                self.appModel.setVoiceWakeEnabled(newValue)
            }
            .onChange(of: self.talkEnabled) { _, newValue in
                self.appModel.setTalkEnabled(newValue)
            }
        }
    }
}
@@ -0,0 +1,389 @@
import AVFAudio
import Foundation
import Observation
import Speech
import SwabbleKit

private func makeAudioTapEnqueueCallback(queue: AudioBufferQueue) -> @Sendable (AVAudioPCMBuffer, AVAudioTime) -> Void {
    { buffer, _ in
        // This callback is invoked on a realtime audio thread/queue. Keep it tiny and nonisolated.
        queue.enqueueCopy(of: buffer)
    }
}
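
// Lock-protected staging area between the realtime tap and the main actor: the
// tap enqueues buffer copies, and a periodic drain task feeds them to the recognizer.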
private final class AudioBufferQueue: @unchecked Sendable {
    private let lock = NSLock()
    private var buffers: [AVAudioPCMBuffer] = []

    func enqueueCopy(of buffer: AVAudioPCMBuffer) {
        guard let copy = buffer.deepCopy() else { return }
        self.lock.lock()
        self.buffers.append(copy)
        self.lock.unlock()
    }

    func drain() -> [AVAudioPCMBuffer] {
        self.lock.lock()
        let drained = self.buffers
        self.buffers.removeAll(keepingCapacity: true)
        self.lock.unlock()
        return drained
    }

    func clear() {
        self.lock.lock()
        self.buffers.removeAll(keepingCapacity: false)
        self.lock.unlock()
    }
}
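
// Deep-copies whichever channel layout the format uses (float32, int16, or
// int32) so the samples outlive the realtime tap callback that produced them.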
extension AVAudioPCMBuffer {
    fileprivate func deepCopy() -> AVAudioPCMBuffer? {
        let format = self.format
        let frameLength = self.frameLength
        guard let copy = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: frameLength) else {
            return nil
        }
        copy.frameLength = frameLength

        if let src = self.floatChannelData, let dst = copy.floatChannelData {
            let channels = Int(format.channelCount)
            let frames = Int(frameLength)
            for ch in 0..<channels {
                dst[ch].update(from: src[ch], count: frames)
            }
            return copy
        }

        if let src = self.int16ChannelData, let dst = copy.int16ChannelData {
            let channels = Int(format.channelCount)
            let frames = Int(frameLength)
            for ch in 0..<channels {
                dst[ch].update(from: src[ch], count: frames)
            }
            return copy
        }

        if let src = self.int32ChannelData, let dst = copy.int32ChannelData {
            let channels = Int(format.channelCount)
            let frames = Int(frameLength)
            for ch in 0..<channels {
                dst[ch].update(from: src[ch], count: frames)
            }
            return copy
        }

        return nil
    }
}
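
/// Listens continuously for the configured wake words and dispatches the text
/// that follows a trigger to `onCommand`, restarting recognition after each
/// dispatch or recognizer error.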
@MainActor
@Observable
final class VoiceWakeManager: NSObject {
    var isEnabled: Bool = false
    var isListening: Bool = false
    var statusText: String = "Off"
    var triggerWords: [String] = VoiceWakePreferences.loadTriggerWords()
    var lastTriggeredCommand: String?

    private let audioEngine = AVAudioEngine()
    private var speechRecognizer: SFSpeechRecognizer?
    private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
    private var recognitionTask: SFSpeechRecognitionTask?
    private var tapQueue: AudioBufferQueue?
    private var tapDrainTask: Task<Void, Never>?

    private var lastDispatched: String?
    private var onCommand: (@Sendable (String) async -> Void)?
    private var userDefaultsObserver: NSObjectProtocol?

    override init() {
        super.init()
        self.triggerWords = VoiceWakePreferences.loadTriggerWords()
        self.userDefaultsObserver = NotificationCenter.default.addObserver(
            forName: UserDefaults.didChangeNotification,
            object: UserDefaults.standard,
            queue: .main,
            using: { [weak self] _ in
                Task { @MainActor in
                    self?.handleUserDefaultsDidChange()
                }
            })
    }

    @MainActor deinit {
        if let userDefaultsObserver = self.userDefaultsObserver {
            NotificationCenter.default.removeObserver(userDefaultsObserver)
        }
    }

    var activeTriggerWords: [String] {
        VoiceWakePreferences.sanitizeTriggerWords(self.triggerWords)
    }

    private func handleUserDefaultsDidChange() {
        let updated = VoiceWakePreferences.loadTriggerWords()
        if updated != self.triggerWords {
            self.triggerWords = updated
        }
    }

    func configure(onCommand: @escaping @Sendable (String) async -> Void) {
        self.onCommand = onCommand
    }

    func setEnabled(_ enabled: Bool) {
        self.isEnabled = enabled
        if enabled {
            Task { await self.start() }
        } else {
            self.stop()
        }
    }

    func start() async {
        guard self.isEnabled else { return }
        if self.isListening { return }

        if ProcessInfo.processInfo.environment["SIMULATOR_DEVICE_NAME"] != nil ||
            ProcessInfo.processInfo.environment["SIMULATOR_UDID"] != nil
        {
            // The iOS Simulator’s audio stack is unreliable for long-running microphone capture.
            // (We’ve observed CoreAudio deadlocks after TCC permission prompts.)
            self.isListening = false
            self.statusText = "Voice Wake isn’t supported on Simulator"
            return
        }

        self.statusText = "Requesting permissions…"

        let micOk = await Self.requestMicrophonePermission()
        guard micOk else {
            self.statusText = "Microphone permission denied"
            self.isListening = false
            return
        }

        let speechOk = await Self.requestSpeechPermission()
        guard speechOk else {
            self.statusText = "Speech recognition permission denied"
            self.isListening = false
            return
        }

        self.speechRecognizer = SFSpeechRecognizer()
        guard self.speechRecognizer != nil else {
            self.statusText = "Speech recognizer unavailable"
            self.isListening = false
            return
        }

        do {
            try Self.configureAudioSession()
            try self.startRecognition()
            self.isListening = true
            self.statusText = "Listening"
        } catch {
            self.isListening = false
            self.statusText = "Start failed: \(error.localizedDescription)"
        }
    }

    func stop() {
        self.isEnabled = false
        self.isListening = false
        self.statusText = "Off"

        self.tapDrainTask?.cancel()
        self.tapDrainTask = nil
        self.tapQueue?.clear()
        self.tapQueue = nil

        self.recognitionTask?.cancel()
        self.recognitionTask = nil
        self.recognitionRequest = nil

        if self.audioEngine.isRunning {
            self.audioEngine.stop()
            self.audioEngine.inputNode.removeTap(onBus: 0)
        }

        try? AVAudioSession.sharedInstance().setActive(false, options: .notifyOthersOnDeactivation)
    }

    /// Temporarily releases the microphone so other subsystems (e.g. camera video capture) can record audio.
    /// Returns `true` when listening was active and was suspended.
    func suspendForExternalAudioCapture() -> Bool {
        guard self.isEnabled, self.isListening else { return false }

        self.isListening = false
        self.statusText = "Paused"

        self.tapDrainTask?.cancel()
        self.tapDrainTask = nil
        self.tapQueue?.clear()
        self.tapQueue = nil

        self.recognitionTask?.cancel()
        self.recognitionTask = nil
        self.recognitionRequest = nil

        if self.audioEngine.isRunning {
            self.audioEngine.stop()
            self.audioEngine.inputNode.removeTap(onBus: 0)
        }

        try? AVAudioSession.sharedInstance().setActive(false, options: .notifyOthersOnDeactivation)
        return true
    }

    func resumeAfterExternalAudioCapture(wasSuspended: Bool) {
        guard wasSuspended else { return }
        Task { await self.start() }
    }

    private func startRecognition() throws {
        self.recognitionTask?.cancel()
        self.recognitionTask = nil
        self.tapDrainTask?.cancel()
        self.tapDrainTask = nil
        self.tapQueue?.clear()
        self.tapQueue = nil

        let request = SFSpeechAudioBufferRecognitionRequest()
        request.shouldReportPartialResults = true
        self.recognitionRequest = request

        let inputNode = self.audioEngine.inputNode
        inputNode.removeTap(onBus: 0)

        let recordingFormat = inputNode.outputFormat(forBus: 0)

        let queue = AudioBufferQueue()
        self.tapQueue = queue
        let tapBlock: @Sendable (AVAudioPCMBuffer, AVAudioTime) -> Void = makeAudioTapEnqueueCallback(queue: queue)
        inputNode.installTap(
            onBus: 0,
            bufferSize: 1024,
            format: recordingFormat,
            block: tapBlock)

        self.audioEngine.prepare()
        try self.audioEngine.start()

        let handler = self.makeRecognitionResultHandler()
        self.recognitionTask = self.speechRecognizer?.recognitionTask(with: request, resultHandler: handler)

        self.tapDrainTask = Task { [weak self] in
            guard let self, let queue = self.tapQueue else { return }
            while !Task.isCancelled {
                try? await Task.sleep(nanoseconds: 40_000_000)
                let drained = queue.drain()
                if drained.isEmpty { continue }
                for buf in drained {
                    request.append(buf)
                }
            }
        }
    }

    private nonisolated func makeRecognitionResultHandler() -> @Sendable (SFSpeechRecognitionResult?, Error?) -> Void {
        { [weak self] result, error in
            let transcript = result?.bestTranscription.formattedString
            let segments = result.flatMap { result in
                transcript.map { WakeWordSpeechSegments.from(transcription: result.bestTranscription, transcript: $0) }
            } ?? []
            let errorText = error?.localizedDescription

            Task { @MainActor in
                self?.handleRecognitionCallback(transcript: transcript, segments: segments, errorText: errorText)
            }
        }
    }

    private func handleRecognitionCallback(transcript: String?, segments: [WakeWordSegment], errorText: String?) {
        if let errorText {
            self.statusText = "Recognizer error: \(errorText)"
            self.isListening = false

            let shouldRestart = self.isEnabled
            if shouldRestart {
                Task {
                    try? await Task.sleep(nanoseconds: 700_000_000)
                    await self.start()
                }
            }
            return
        }

        guard let transcript else { return }
        guard let cmd = self.extractCommand(from: transcript, segments: segments) else { return }

        if cmd == self.lastDispatched { return }
        self.lastDispatched = cmd
        self.lastTriggeredCommand = cmd
        self.statusText = "Triggered"

        Task { [weak self] in
            guard let self else { return }
            await self.onCommand?(cmd)
            await self.startIfEnabled()
        }
    }

    private func startIfEnabled() async {
        let shouldRestart = self.isEnabled
        if shouldRestart {
            await self.start()
        }
    }

    private func extractCommand(from transcript: String, segments: [WakeWordSegment]) -> String? {
        Self.extractCommand(from: transcript, segments: segments, triggers: self.activeTriggerWords)
    }

    nonisolated static func extractCommand(
        from transcript: String,
        segments: [WakeWordSegment],
        triggers: [String],
        minPostTriggerGap: TimeInterval = 0.45) -> String?
    {
        let config = WakeWordGateConfig(triggers: triggers, minPostTriggerGap: minPostTriggerGap)
        return WakeWordGate.match(transcript: transcript, segments: segments, config: config)?.command
    }

    private static func configureAudioSession() throws {
        let session = AVAudioSession.sharedInstance()
        try session.setCategory(.playAndRecord, mode: .measurement, options: [
            .duckOthers,
            .mixWithOthers,
            .allowBluetoothHFP,
            .defaultToSpeaker,
        ])
        try session.setActive(true, options: [])
    }

    private nonisolated static func requestMicrophonePermission() async -> Bool {
        await withCheckedContinuation(isolation: nil) { cont in
            AVAudioApplication.requestRecordPermission { ok in
                cont.resume(returning: ok)
            }
        }
    }

    private nonisolated static func requestSpeechPermission() async -> Bool {
        await withCheckedContinuation(isolation: nil) { cont in
            SFSpeechRecognizer.requestAuthorization { status in
                cont.resume(returning: status == .authorized)
            }
        }
    }
}

#if DEBUG
extension VoiceWakeManager {
    func _test_handleRecognitionCallback(transcript: String?, segments: [WakeWordSegment], errorText: String?) {
        self.handleRecognitionCallback(transcript: transcript, segments: segments, errorText: errorText)
    }
}
#endif
@@ -0,0 +1,44 @@
import Foundation

enum VoiceWakePreferences {
    static let enabledKey = "voiceWake.enabled"
    static let triggerWordsKey = "voiceWake.triggerWords"

    // Keep defaults aligned with the mac app.
    static let defaultTriggerWords: [String] = ["clawd", "claude"]
    static let maxWords = 32
    static let maxWordLength = 64

    static func decodeGatewayTriggers(from payloadJSON: String) -> [String]? {
        guard let data = payloadJSON.data(using: .utf8) else { return nil }
        return self.decodeGatewayTriggers(from: data)
    }

    static func decodeGatewayTriggers(from data: Data) -> [String]? {
        struct Payload: Decodable { var triggers: [String] }
        guard let decoded = try? JSONDecoder().decode(Payload.self, from: data) else { return nil }
        return self.sanitizeTriggerWords(decoded.triggers)
    }

    static func loadTriggerWords(defaults: UserDefaults = .standard) -> [String] {
        defaults.stringArray(forKey: self.triggerWordsKey) ?? self.defaultTriggerWords
    }

    static func saveTriggerWords(_ words: [String], defaults: UserDefaults = .standard) {
        defaults.set(words, forKey: self.triggerWordsKey)
    }
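
    // Trims whitespace, drops empties, caps the list at `maxWords` entries of
    // at most `maxWordLength` characters each, and falls back to the defaults
    // when nothing survives, e.g. sanitizeTriggerWords(["  clawd  ", ""]) == ["clawd"].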
    static func sanitizeTriggerWords(_ words: [String]) -> [String] {
        let cleaned = words
            .map { $0.trimmingCharacters(in: .whitespacesAndNewlines) }
            .filter { !$0.isEmpty }
            .prefix(Self.maxWords)
            .map { String($0.prefix(Self.maxWordLength)) }
        return cleaned.isEmpty ? Self.defaultTriggerWords : cleaned
    }

    static func displayString(for words: [String]) -> String {
        let sanitized = self.sanitizeTriggerWords(words)
        return sanitized.joined(separator: ", ")
    }
}