iOS: add push-to-talk node commands

This commit is contained in:
Mariano Belinky
2026-02-01 00:25:44 +01:00
committed by Mariano Belinky
parent a884955cd6
commit 9f101d3a9a
8 changed files with 318 additions and 13 deletions

View File

@@ -353,6 +353,8 @@ final class GatewayConnectionController {
OpenClawCanvasA2UICommand.reset.rawValue,
OpenClawScreenCommand.record.rawValue,
OpenClawSystemCommand.notify.rawValue,
OpenClawTalkCommand.pttStart.rawValue,
OpenClawTalkCommand.pttStop.rawValue,
]
let caps = Set(self.currentCaps())

View File

@@ -63,7 +63,7 @@ final class NodeAppModel {
@ObservationIgnored private var cameraHUDDismissTask: Task<Void, Never>?
private let notificationCenter: NotificationCentering
let voiceWake = VoiceWakeManager()
let talkMode = TalkModeManager()
let talkMode: TalkModeManager
private let locationService: any LocationServicing
private let deviceStatusService: any DeviceStatusServicing
private let photosService: any PhotosServicing
@@ -92,7 +92,8 @@ final class NodeAppModel {
contactsService: any ContactsServicing = ContactsService(),
calendarService: any CalendarServicing = CalendarService(),
remindersService: any RemindersServicing = RemindersService(),
motionService: any MotionServicing = MotionService())
motionService: any MotionServicing = MotionService(),
talkMode: TalkModeManager = TalkModeManager())
{
self.screen = screen
self.camera = camera
@@ -105,6 +106,7 @@ final class NodeAppModel {
self.calendarService = calendarService
self.remindersService = remindersService
self.motionService = motionService
self.talkMode = talkMode
self.voiceWake.configure { [weak self] cmd in
guard let self else { return }
@@ -313,6 +315,7 @@ final class NodeAppModel {
self.gatewayStatusText = "Connected"
self.gatewayServerName = url.host ?? "gateway"
self.gatewayConnected = true
self.talkMode.updateGatewayConnected(true)
}
if let addr = await self.gateway.currentRemoteAddress() {
await MainActor.run {
@@ -329,6 +332,7 @@ final class NodeAppModel {
self.gatewayStatusText = "Disconnected"
self.gatewayRemoteAddress = nil
self.gatewayConnected = false
self.talkMode.updateGatewayConnected(false)
self.showLocalCanvasOnDisconnect()
self.gatewayStatusText = "Disconnected: \(reason)"
}
@@ -356,6 +360,7 @@ final class NodeAppModel {
self.gatewayServerName = nil
self.gatewayRemoteAddress = nil
self.gatewayConnected = false
self.talkMode.updateGatewayConnected(false)
self.showLocalCanvasOnDisconnect()
}
let sleepSeconds = min(8.0, 0.5 * pow(1.7, Double(attempt)))
@@ -369,6 +374,7 @@ final class NodeAppModel {
self.gatewayRemoteAddress = nil
self.connectedGatewayID = nil
self.gatewayConnected = false
self.talkMode.updateGatewayConnected(false)
self.seamColorHex = nil
if !SessionKey.isCanonicalMainSessionKey(self.mainSessionKey) {
self.mainSessionKey = "main"
@@ -390,6 +396,7 @@ final class NodeAppModel {
self.gatewayRemoteAddress = nil
self.connectedGatewayID = nil
self.gatewayConnected = false
self.talkMode.updateGatewayConnected(false)
self.seamColorHex = nil
if !SessionKey.isCanonicalMainSessionKey(self.mainSessionKey) {
self.mainSessionKey = "main"
@@ -627,6 +634,9 @@ final class NodeAppModel {
case OpenClawMotionCommand.activity.rawValue,
OpenClawMotionCommand.pedometer.rawValue:
return try await self.handleMotionInvoke(req)
case OpenClawTalkCommand.pttStart.rawValue,
OpenClawTalkCommand.pttStop.rawValue:
return try await self.handleTalkInvoke(req)
default:
return BridgeInvokeResponse(
id: req.id,
@@ -646,7 +656,8 @@ final class NodeAppModel {
}
private func isBackgroundRestricted(_ command: String) -> Bool {
command.hasPrefix("canvas.") || command.hasPrefix("camera.") || command.hasPrefix("screen.")
command.hasPrefix("canvas.") || command.hasPrefix("camera.") || command.hasPrefix("screen.") ||
command.hasPrefix("talk.")
}
private func handleLocationInvoke(_ req: BridgeInvokeRequest) async throws -> BridgeInvokeResponse {
@@ -1150,6 +1161,24 @@ final class NodeAppModel {
}
}
/// Dispatches a `talk.*` bridge invoke to the talk-mode manager.
///
/// - Parameter req: The incoming invoke; `req.command` selects the push-to-talk action.
/// - Returns: A successful response carrying the JSON-encoded start/stop payload, or an
///   `invalidRequest` error response when the command is not a known talk command.
/// - Throws: Whatever `beginPushToTalk()` or `Self.encodePayload(_:)` throws.
private func handleTalkInvoke(_ req: BridgeInvokeRequest) async throws -> BridgeInvokeResponse {
    // Parse the raw command once so the switch below is exhaustive over the enum.
    guard let talkCommand = OpenClawTalkCommand(rawValue: req.command) else {
        return BridgeInvokeResponse(
            id: req.id,
            ok: false,
            error: OpenClawNodeError(code: .invalidRequest, message: "INVALID_REQUEST: unknown command"))
    }

    switch talkCommand {
    case .pttStart:
        let started = try await self.talkMode.beginPushToTalk()
        return try BridgeInvokeResponse(
            id: req.id,
            ok: true,
            payloadJSON: Self.encodePayload(started))
    case .pttStop:
        let stopped = await self.talkMode.endPushToTalk()
        return try BridgeInvokeResponse(
            id: req.id,
            ok: true,
            payloadJSON: Self.encodePayload(stopped))
    }
}
}
private extension NodeAppModel {

View File

@@ -14,8 +14,21 @@ final class TalkModeManager: NSObject {
var isEnabled: Bool = false
var isListening: Bool = false
var isSpeaking: Bool = false
var isPushToTalkActive: Bool = false
var statusText: String = "Off"
/// Which capture flow currently owns speech recognition.
private enum CaptureMode {
/// No capture is running (set by `stop()`, `endPushToTalk()` and `processTranscript`).
case idle
/// Always-listening talk mode; set by `start()` once recognition is running.
case continuous
/// A push-to-talk capture started by `beginPushToTalk()`.
case pushToTalk
}
private var captureMode: CaptureMode = .idle
private var resumeContinuousAfterPTT: Bool = false
private var activePTTCaptureId: String?
private let allowSimulatorCapture: Bool
private let audioEngine = AVAudioEngine()
private var inputTapInstalled = false
private var speechRecognizer: SFSpeechRecognizer?
@@ -45,16 +58,26 @@ final class TalkModeManager: NSObject {
var mp3Player: StreamingAudioPlaying = StreamingAudioPlayer.shared
private var gateway: GatewayNodeSession?
private var gatewayConnected = false
private let silenceWindow: TimeInterval = 0.7
private var chatSubscribedSessionKeys = Set<String>()
private let logger = Logger(subsystem: "bot.molt", category: "TalkMode")
/// Creates a talk-mode manager.
///
/// - Parameter allowSimulatorCapture: When `true`, `beginPushToTalk()` skips the microphone and
///   speech permission prompts, and on the simulator `startRecognition()` sets up a recognition
///   request instead of throwing. Defaults to `false`; the tests in this change pass `true`.
init(allowSimulatorCapture: Bool = false) {
self.allowSimulatorCapture = allowSimulatorCapture
super.init()
}
/// Stores the gateway session that `processTranscript` later uses to send prompts.
func attachGateway(_ gateway: GatewayNodeSession) {
self.gateway = gateway
}
/// Records whether the gateway is currently connected.
///
/// `endPushToTalk()` reports status "offline" and `processTranscript` bails out with
/// "Gateway not connected" while this flag is `false`.
func updateGatewayConnected(_ connected: Bool) {
self.gatewayConnected = connected
}
func updateMainSessionKey(_ sessionKey: String?) {
let trimmed = (sessionKey ?? "").trimmingCharacters(in: .whitespacesAndNewlines)
guard !trimmed.isEmpty else { return }
@@ -75,6 +98,7 @@ final class TalkModeManager: NSObject {
func start() async {
guard self.isEnabled else { return }
guard self.captureMode != .pushToTalk else { return }
if self.isListening { return }
self.logger.info("start")
@@ -97,6 +121,7 @@ final class TalkModeManager: NSObject {
try Self.configureAudioSession()
try self.startRecognition()
self.isListening = true
self.captureMode = .continuous
self.statusText = "Listening"
self.startSilenceMonitor()
await self.subscribeChatIfNeeded(sessionKey: self.mainSessionKey)
@@ -111,6 +136,8 @@ final class TalkModeManager: NSObject {
func stop() {
self.isEnabled = false
self.isListening = false
self.isPushToTalkActive = false
self.captureMode = .idle
self.statusText = "Off"
self.lastTranscript = ""
self.lastHeard = nil
@@ -119,6 +146,8 @@ final class TalkModeManager: NSObject {
self.stopRecognition()
self.stopSpeaking()
self.lastInterruptedAtSeconds = nil
self.resumeContinuousAfterPTT = false
self.activePTTCaptureId = nil
TalkSystemSpeechSynthesizer.shared.stop()
do {
try AVAudioSession.sharedInstance().setActive(false, options: [.notifyOthersOnDeactivation])
@@ -132,11 +161,127 @@ final class TalkModeManager: NSObject {
self.stopSpeaking()
}
/// Starts a push-to-talk capture and returns its identifier.
///
/// Calling this while a capture is already active is idempotent: the existing capture id is
/// returned and nothing is restarted. Otherwise any ongoing speech playback and continuous
/// recognition are stopped, permissions are requested (unless `allowSimulatorCapture` is set),
/// and recognition is started in push-to-talk mode.
///
/// - Returns: A payload carrying the id of the active capture.
/// - Throws: `NSError` (domain "TalkMode", code 4) when microphone permission is denied,
///   code 5 when speech-recognition permission is denied, or any error raised while configuring
///   the audio session or starting recognition. On every failure the manager is left idle with
///   no pending capture id.
func beginPushToTalk() async throws -> OpenClawTalkPTTStartPayload {
    if self.isPushToTalkActive, let captureId = self.activePTTCaptureId {
        return OpenClawTalkPTTStartPayload(captureId: captureId)
    }

    self.stopSpeaking(storeInterruption: false)
    // Remember whether continuous listening must be restored once this capture ends.
    self.resumeContinuousAfterPTT = self.isEnabled && self.captureMode == .continuous
    self.silenceTask?.cancel()
    self.silenceTask = nil
    self.stopRecognition()
    self.isListening = false

    let captureId = UUID().uuidString
    self.activePTTCaptureId = captureId
    self.lastTranscript = ""
    self.lastHeard = nil
    self.statusText = "Requesting permissions…"

    // Every failure exit below must leave the same clean state. Previously the
    // permission-denied exits kept a stale capture mode and all failure exits kept the
    // capture id / resume flag of a capture that never started.
    var didStart = false
    defer {
        if !didStart {
            self.isListening = false
            self.isPushToTalkActive = false
            self.captureMode = .idle
            self.resumeContinuousAfterPTT = false
            self.activePTTCaptureId = nil
        }
    }

    if !self.allowSimulatorCapture {
        guard await Self.requestMicrophonePermission() else {
            self.statusText = "Microphone permission denied"
            throw NSError(domain: "TalkMode", code: 4, userInfo: [
                NSLocalizedDescriptionKey: "Microphone permission denied",
            ])
        }
        guard await Self.requestSpeechPermission() else {
            self.statusText = "Speech recognition permission denied"
            throw NSError(domain: "TalkMode", code: 5, userInfo: [
                NSLocalizedDescriptionKey: "Speech recognition permission denied",
            ])
        }
    }

    do {
        try Self.configureAudioSession()
        // The mode is switched before recognition starts, as in the original flow.
        self.captureMode = .pushToTalk
        try self.startRecognition()
    } catch {
        self.statusText = "Start failed: \(error.localizedDescription)"
        throw error
    }

    didStart = true
    self.isListening = true
    self.isPushToTalkActive = true
    self.statusText = "Listening (PTT)"
    return OpenClawTalkPTTStartPayload(captureId: captureId)
}
/// Ends the active push-to-talk capture and reports what happened to its transcript.
///
/// The returned `status` is one of:
/// - `"idle"`: no capture was active; nothing was changed.
/// - `"empty"`: the capture produced no text (`transcript` is `nil`).
/// - `"offline"`: text was captured but the gateway is not connected; it is returned, not sent.
/// - `"queued"`: text was captured and handed to `processTranscript` in a background task.
///
/// If continuous listening was running when the capture began, it is resumed afterwards:
/// directly via `start()` for "empty"/"offline", or through `processTranscript`'s
/// `restartAfter` flag for "queued".
///
/// - Returns: A payload with the capture id, the trimmed transcript (if any) and the status.
func endPushToTalk() async -> OpenClawTalkPTTStopPayload {
    let captureId = self.activePTTCaptureId ?? UUID().uuidString
    guard self.isPushToTalkActive else {
        return OpenClawTalkPTTStopPayload(
            captureId: captureId,
            transcript: nil,
            status: "idle")
    }

    self.isPushToTalkActive = false
    self.isListening = false
    self.captureMode = .idle
    self.stopRecognition()

    let transcript = self.lastTranscript.trimmingCharacters(in: .whitespacesAndNewlines)
    self.lastTranscript = ""
    self.lastHeard = nil

    // Snapshot the resume flag and clear per-capture state *before* any suspension point or
    // task hand-off. The background task below used to read the flag from `self` only after
    // it had already been reset, so continuous listening was never resumed for "queued"
    // captures; clearing early also keeps a capture begun during `await start()` intact.
    let shouldResumeContinuous = self.resumeContinuousAfterPTT
    self.resumeContinuousAfterPTT = false
    self.activePTTCaptureId = nil

    guard !transcript.isEmpty else {
        self.statusText = "Ready"
        if shouldResumeContinuous {
            await self.start()
        }
        return OpenClawTalkPTTStopPayload(
            captureId: captureId,
            transcript: nil,
            status: "empty")
    }

    guard self.gatewayConnected else {
        self.statusText = "Gateway not connected"
        if shouldResumeContinuous {
            await self.start()
        }
        return OpenClawTalkPTTStopPayload(
            captureId: captureId,
            transcript: transcript,
            status: "offline")
    }

    self.statusText = "Thinking…"
    // Fire-and-forget so the caller gets its "queued" reply without waiting for the gateway.
    Task { @MainActor in
        await self.processTranscript(transcript, restartAfter: shouldResumeContinuous)
    }
    return OpenClawTalkPTTStopPayload(
        captureId: captureId,
        transcript: transcript,
        status: "queued")
}
private func startRecognition() throws {
#if targetEnvironment(simulator)
throw NSError(domain: "TalkMode", code: 2, userInfo: [
NSLocalizedDescriptionKey: "Talk mode is not supported on the iOS simulator",
])
if !self.allowSimulatorCapture {
throw NSError(domain: "TalkMode", code: 2, userInfo: [
NSLocalizedDescriptionKey: "Talk mode is not supported on the iOS simulator",
])
} else {
self.recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
self.recognitionRequest?.shouldReportPartialResults = true
return
}
#endif
self.stopRecognition()
@@ -232,16 +377,18 @@ final class TalkModeManager: NSObject {
}
private func checkSilence() async {
guard self.captureMode == .continuous else { return }
guard self.isListening, !self.isSpeaking else { return }
let transcript = self.lastTranscript.trimmingCharacters(in: .whitespacesAndNewlines)
guard !transcript.isEmpty else { return }
guard let lastHeard else { return }
if Date().timeIntervalSince(lastHeard) < self.silenceWindow { return }
await self.finalizeTranscript(transcript)
await self.processTranscript(transcript, restartAfter: true)
}
private func finalizeTranscript(_ transcript: String) async {
private func processTranscript(_ transcript: String, restartAfter: Bool) async {
self.isListening = false
self.captureMode = .idle
self.statusText = "Thinking…"
self.lastTranscript = ""
self.lastHeard = nil
@@ -249,10 +396,12 @@ final class TalkModeManager: NSObject {
await self.reloadConfig()
let prompt = self.buildPrompt(transcript: transcript)
guard let gateway else {
guard self.gatewayConnected, let gateway else {
self.statusText = "Gateway not connected"
self.logger.warning("finalize: gateway not connected")
await self.start()
if restartAfter {
await self.start()
}
return
}
@@ -297,7 +446,9 @@ final class TalkModeManager: NSObject {
self.logger.error("finalize failed: \(error.localizedDescription, privacy: .public)")
}
await self.start()
if restartAfter {
await self.start()
}
}
private func subscribeChatIfNeeded(sessionKey: String) async {
@@ -732,3 +883,12 @@ final class TalkModeManager: NSObject {
}
}
}
#if DEBUG
/// Test-only hooks for `TalkModeManager`.
extension TalkModeManager {
/// Injects a transcript as if it had just been recognized: sets `lastTranscript` and stamps
/// `lastHeard` with the current time, so tests can drive `endPushToTalk()` without audio.
func _test_seedTranscript(_ transcript: String) {
self.lastTranscript = transcript
self.lastHeard = Date()
}
}
#endif

View File

@@ -70,6 +70,7 @@ Sources/Voice/VoiceWakePreferences.swift
../shared/OpenClawKit/Sources/OpenClawKit/ScreenCommands.swift
../shared/OpenClawKit/Sources/OpenClawKit/StoragePaths.swift
../shared/OpenClawKit/Sources/OpenClawKit/SystemCommands.swift
../shared/OpenClawKit/Sources/OpenClawKit/TalkCommands.swift
../shared/OpenClawKit/Sources/OpenClawKit/TalkDirective.swift
../../Swabble/Sources/SwabbleKit/WakeWordGate.swift
Sources/Voice/TalkModeManager.swift

View File

@@ -102,6 +102,8 @@ private func withUserDefaults<T>(_ updates: [String: Any?], _ body: () throws ->
#expect(commands.contains(OpenClawContactsCommand.add.rawValue))
#expect(commands.contains(OpenClawCalendarCommand.add.rawValue))
#expect(commands.contains(OpenClawRemindersCommand.add.rawValue))
#expect(commands.contains(OpenClawTalkCommand.pttStart.rawValue))
#expect(commands.contains(OpenClawTalkCommand.pttStop.rawValue))
}
}

View File

@@ -149,7 +149,8 @@ private func makeTestAppModel(
contactsService: ContactsServicing,
calendarService: CalendarServicing,
remindersService: RemindersServicing,
motionService: MotionServicing) -> NodeAppModel
motionService: MotionServicing,
talkMode: TalkModeManager = TalkModeManager(allowSimulatorCapture: true)) -> NodeAppModel
{
NodeAppModel(
screen: ScreenController(),
@@ -162,7 +163,8 @@ private func makeTestAppModel(
contactsService: contactsService,
calendarService: calendarService,
remindersService: remindersService,
motionService: motionService)
motionService: motionService,
talkMode: talkMode)
}
private func decodePayload<T: Decodable>(_ json: String?, as type: T.Type) throws -> T {
@@ -594,6 +596,86 @@ private func decodePayload<T: Decodable>(_ json: String?, as type: T.Type) throw
#expect(decodedPedometer == pedometerPayload)
}
// Verifies the push-to-talk invoke round trip: `talk.ptt.start` returns a capture id, and
// `talk.ptt.stop` echoes that id together with the seeded transcript. The gateway is marked
// disconnected, so the expected stop status is "offline".
@Test @MainActor func handleInvokePushToTalkReturnsTranscriptStatus() async throws {
// Simulator-capable manager so the start command does not hit permission prompts.
let talkMode = TalkModeManager(allowSimulatorCapture: true)
talkMode.updateGatewayConnected(false)
// The service stubs below only satisfy the app model's initializer; the talk commands
// exercised here go through `talkMode`.
let appModel = makeTestAppModel(
deviceStatusService: TestDeviceStatusService(
statusPayload: OpenClawDeviceStatusPayload(
battery: OpenClawBatteryStatusPayload(level: 0.5, state: .unplugged, lowPowerModeEnabled: false),
thermal: OpenClawThermalStatusPayload(state: .nominal),
storage: OpenClawStorageStatusPayload(totalBytes: 10, freeBytes: 5, usedBytes: 5),
network: OpenClawNetworkStatusPayload(
status: .satisfied,
isExpensive: false,
isConstrained: false,
interfaces: [.wifi]),
uptimeSeconds: 1),
infoPayload: OpenClawDeviceInfoPayload(
deviceName: "Test",
modelIdentifier: "Test1,1",
systemName: "iOS",
systemVersion: "1.0",
appVersion: "dev",
appBuild: "0",
locale: "en-US")),
photosService: TestPhotosService(payload: OpenClawPhotosLatestPayload(photos: [])),
contactsService: TestContactsService(
searchPayload: OpenClawContactsSearchPayload(contacts: []),
addPayload: OpenClawContactsAddPayload(contact: OpenClawContactPayload(
identifier: "c0",
displayName: "",
givenName: "",
familyName: "",
organizationName: "",
phoneNumbers: [],
emails: []))),
calendarService: TestCalendarService(
eventsPayload: OpenClawCalendarEventsPayload(events: []),
addPayload: OpenClawCalendarAddPayload(event: OpenClawCalendarEventPayload(
identifier: "e0",
title: "Test",
startISO: "2024-01-01T00:00:00Z",
endISO: "2024-01-01T00:10:00Z",
isAllDay: false,
location: nil,
calendarTitle: nil))),
remindersService: TestRemindersService(
listPayload: OpenClawRemindersListPayload(reminders: []),
addPayload: OpenClawRemindersAddPayload(reminder: OpenClawReminderPayload(
identifier: "r0",
title: "Test",
dueISO: nil,
completed: false,
listName: nil))),
motionService: TestMotionService(
activityPayload: OpenClawMotionActivityPayload(activities: []),
pedometerPayload: OpenClawPedometerPayload(
startISO: "2024-01-01T00:00:00Z",
endISO: "2024-01-01T01:00:00Z",
steps: nil,
distanceMeters: nil,
floorsAscended: nil,
floorsDescended: nil)),
talkMode: talkMode)
// Start: expect success and a non-empty capture id.
let startReq = BridgeInvokeRequest(id: "ptt-start", command: OpenClawTalkCommand.pttStart.rawValue)
let startRes = await appModel._test_handleInvoke(startReq)
#expect(startRes.ok == true)
let startPayload = try decodePayload(startRes.payloadJSON, as: OpenClawTalkPTTStartPayload.self)
#expect(!startPayload.captureId.isEmpty)
// Stand in for recognized speech, since no audio is captured in tests.
talkMode._test_seedTranscript("Hello from PTT")
// Stop: expect the same capture id, the seeded text, and the disconnected-gateway status.
let stopReq = BridgeInvokeRequest(id: "ptt-stop", command: OpenClawTalkCommand.pttStop.rawValue)
let stopRes = await appModel._test_handleInvoke(stopReq)
#expect(stopRes.ok == true)
let stopPayload = try decodePayload(stopRes.payloadJSON, as: OpenClawTalkPTTStopPayload.self)
#expect(stopPayload.captureId == startPayload.captureId)
#expect(stopPayload.transcript == "Hello from PTT")
#expect(stopPayload.status == "offline")
}
@Test @MainActor func handleDeepLinkSetsErrorWhenNotConnected() async {
let appModel = NodeAppModel()
let url = URL(string: "openclaw://agent?message=hello")!

View File

@@ -0,0 +1,26 @@
import Foundation
/// Node commands for push-to-talk; the raw value is the command string sent over the bridge.
public enum OpenClawTalkCommand: String, Codable, Sendable {
/// Begin a push-to-talk capture. The reply payload is `OpenClawTalkPTTStartPayload`.
case pttStart = "talk.ptt.start"
/// End the current push-to-talk capture. The reply payload is `OpenClawTalkPTTStopPayload`.
case pttStop = "talk.ptt.stop"
}
/// Reply payload for `talk.ptt.start`.
public struct OpenClawTalkPTTStartPayload: Codable, Sendable, Equatable {
/// Identifier of the capture that is now active; the matching stop payload carries the same id.
public var captureId: String
/// Creates a start payload for the given capture id.
public init(captureId: String) {
self.captureId = captureId
}
}
/// Reply payload for `talk.ptt.stop`.
public struct OpenClawTalkPTTStopPayload: Codable, Sendable, Equatable {
/// Identifier of the capture that was stopped.
public var captureId: String
/// Trimmed text recognized during the capture, or `nil` when nothing was captured.
public var transcript: String?
/// Outcome of the stop request. The iOS talk-mode manager emits "idle", "empty",
/// "offline" or "queued".
public var status: String
/// Creates a stop payload.
public init(captureId: String, transcript: String?, status: String) {
self.captureId = captureId
self.transcript = transcript
self.status = status
}
}