iOS: add PTT once/cancel

This commit is contained in:
Mariano Belinky
2026-02-01 10:04:53 +01:00
committed by Mariano Belinky
parent 17b18971f1
commit 1a48bce294
7 changed files with 273 additions and 69 deletions

View File

@@ -355,6 +355,8 @@ final class GatewayConnectionController {
OpenClawSystemCommand.notify.rawValue,
OpenClawTalkCommand.pttStart.rawValue,
OpenClawTalkCommand.pttStop.rawValue,
OpenClawTalkCommand.pttCancel.rawValue,
OpenClawTalkCommand.pttOnce.rawValue,
]
let caps = Set(self.currentCaps())

View File

@@ -636,7 +636,9 @@ final class NodeAppModel {
OpenClawMotionCommand.pedometer.rawValue:
return try await self.handleMotionInvoke(req)
case OpenClawTalkCommand.pttStart.rawValue,
OpenClawTalkCommand.pttStop.rawValue:
OpenClawTalkCommand.pttStop.rawValue,
OpenClawTalkCommand.pttCancel.rawValue,
OpenClawTalkCommand.pttOnce.rawValue:
return try await self.handleTalkInvoke(req)
default:
return BridgeInvokeResponse(
@@ -1175,6 +1177,21 @@ final class NodeAppModel {
self.pttVoiceWakeSuspended = false
let json = try Self.encodePayload(payload)
return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: json)
case OpenClawTalkCommand.pttCancel.rawValue:
let payload = await self.talkMode.cancelPushToTalk()
self.voiceWake.resumeAfterExternalAudioCapture(wasSuspended: self.pttVoiceWakeSuspended)
self.pttVoiceWakeSuspended = false
let json = try Self.encodePayload(payload)
return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: json)
case OpenClawTalkCommand.pttOnce.rawValue:
self.pttVoiceWakeSuspended = self.voiceWake.suspendForExternalAudioCapture()
defer {
self.voiceWake.resumeAfterExternalAudioCapture(wasSuspended: self.pttVoiceWakeSuspended)
self.pttVoiceWakeSuspended = false
}
let payload = try await self.talkMode.runPushToTalkOnce()
let json = try Self.encodePayload(payload)
return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: json)
default:
return BridgeInvokeResponse(
id: req.id,

View File

@@ -26,6 +26,9 @@ final class TalkModeManager: NSObject {
private var captureMode: CaptureMode = .idle
private var resumeContinuousAfterPTT: Bool = false
private var activePTTCaptureId: String?
private var pttAutoStopEnabled: Bool = false
private var pttCompletion: CheckedContinuation<OpenClawTalkPTTStopPayload, Never>?
private var pttTimeoutTask: Task<Void, Never>?
private let allowSimulatorCapture: Bool
@@ -146,6 +149,18 @@ final class TalkModeManager: NSObject {
self.stopRecognition()
self.stopSpeaking()
self.lastInterruptedAtSeconds = nil
let pendingPTT = self.pttCompletion != nil
let pendingCaptureId = self.activePTTCaptureId ?? UUID().uuidString
self.pttTimeoutTask?.cancel()
self.pttTimeoutTask = nil
self.pttAutoStopEnabled = false
if pendingPTT {
let payload = OpenClawTalkPTTStopPayload(
captureId: pendingCaptureId,
transcript: nil,
status: "cancelled")
self.finishPTTOnce(payload)
}
self.resumeContinuousAfterPTT = false
self.activePTTCaptureId = nil
TalkSystemSpeechSynthesizer.shared.stop()
@@ -167,6 +182,9 @@ final class TalkModeManager: NSObject {
}
self.stopSpeaking(storeInterruption: false)
self.pttTimeoutTask?.cancel()
self.pttTimeoutTask = nil
self.pttAutoStopEnabled = false
self.resumeContinuousAfterPTT = self.isEnabled && self.captureMode == .continuous
self.silenceTask?.cancel()
@@ -218,16 +236,21 @@ final class TalkModeManager: NSObject {
func endPushToTalk() async -> OpenClawTalkPTTStopPayload {
let captureId = self.activePTTCaptureId ?? UUID().uuidString
guard self.isPushToTalkActive else {
return OpenClawTalkPTTStopPayload(
let payload = OpenClawTalkPTTStopPayload(
captureId: captureId,
transcript: nil,
status: "idle")
self.finishPTTOnce(payload)
return payload
}
self.isPushToTalkActive = false
self.isListening = false
self.captureMode = .idle
self.stopRecognition()
self.pttTimeoutTask?.cancel()
self.pttTimeoutTask = nil
self.pttAutoStopEnabled = false
let transcript = self.lastTranscript.trimmingCharacters(in: .whitespacesAndNewlines)
self.lastTranscript = ""
@@ -240,10 +263,12 @@ final class TalkModeManager: NSObject {
}
self.resumeContinuousAfterPTT = false
self.activePTTCaptureId = nil
return OpenClawTalkPTTStopPayload(
let payload = OpenClawTalkPTTStopPayload(
captureId: captureId,
transcript: nil,
status: "empty")
self.finishPTTOnce(payload)
return payload
}
guard self.gatewayConnected else {
@@ -253,10 +278,12 @@ final class TalkModeManager: NSObject {
}
self.resumeContinuousAfterPTT = false
self.activePTTCaptureId = nil
return OpenClawTalkPTTStopPayload(
let payload = OpenClawTalkPTTStopPayload(
captureId: captureId,
transcript: transcript,
status: "offline")
self.finishPTTOnce(payload)
return payload
}
self.statusText = "Thinking…"
@@ -265,10 +292,77 @@ final class TalkModeManager: NSObject {
}
self.resumeContinuousAfterPTT = false
self.activePTTCaptureId = nil
return OpenClawTalkPTTStopPayload(
let payload = OpenClawTalkPTTStopPayload(
captureId: captureId,
transcript: transcript,
status: "queued")
self.finishPTTOnce(payload)
return payload
}
/// Runs one hands-free push-to-talk capture and suspends until it finishes.
///
/// If an earlier one-shot capture is still waiting on its continuation it is cancelled
/// first. If a push-to-talk session is still active after that, a `"busy"` payload is
/// returned and no new capture is started. Otherwise capture begins, auto-stop is armed
/// (silence monitor plus timeout), and the call waits until `finishPTTOnce(_:)` resumes
/// the stored continuation.
///
/// - Parameter maxDurationSeconds: Upper bound handed to `schedulePTTTimeout(seconds:)`.
/// - Returns: The stop payload delivered through `finishPTTOnce(_:)`, or a `"busy"` payload.
/// - Throws: Whatever `beginPushToTalk()` throws.
func runPushToTalkOnce(maxDurationSeconds: TimeInterval = 12) async throws -> OpenClawTalkPTTStopPayload {
    let hasPendingOnce = self.pttCompletion != nil
    if hasPendingOnce {
        _ = await self.cancelPushToTalk()
    }

    guard !self.isPushToTalkActive else {
        // A capture that is not waiting on a one-shot continuation is still running.
        let busyCaptureId = self.activePTTCaptureId ?? UUID().uuidString
        let busyPayload = OpenClawTalkPTTStopPayload(
            captureId: busyCaptureId,
            transcript: nil,
            status: "busy")
        return busyPayload
    }

    _ = try await self.beginPushToTalk()

    let outcome: OpenClawTalkPTTStopPayload = await withCheckedContinuation { continuation in
        // Store the continuation before arming auto-stop so any finish path can resume it.
        self.pttCompletion = continuation
        self.pttAutoStopEnabled = true
        self.startSilenceMonitor()
        self.schedulePTTTimeout(seconds: maxDurationSeconds)
    }
    return outcome
}
/// Aborts the current push-to-talk capture and discards whatever was heard.
///
/// When no capture is active, the auto-stop bookkeeping is cleared and an `"idle"`
/// payload is returned. Otherwise recognition is stopped, the transcript is dropped,
/// a `"cancelled"` payload is produced, and `start()` is called again if
/// `resumeContinuousAfterPTT` was set when the cancel arrived.
/// In both cases the payload is also passed to `finishPTTOnce(_:)`, which resumes a
/// pending one-shot continuation if there is one.
///
/// - Returns: A stop payload with a `nil` transcript and status `"idle"` or `"cancelled"`.
func cancelPushToTalk() async -> OpenClawTalkPTTStopPayload {
    let sessionId = self.activePTTCaptureId ?? UUID().uuidString

    if !self.isPushToTalkActive {
        let idlePayload = OpenClawTalkPTTStopPayload(
            captureId: sessionId,
            transcript: nil,
            status: "idle")
        self.finishPTTOnce(idlePayload)
        self.pttAutoStopEnabled = false
        self.pttTimeoutTask?.cancel()
        self.pttTimeoutTask = nil
        self.resumeContinuousAfterPTT = false
        self.activePTTCaptureId = nil
        return idlePayload
    }

    // Read the flag before it is reset below.
    let restartContinuous = self.resumeContinuousAfterPTT

    // Tear down the capture.
    self.isPushToTalkActive = false
    self.isListening = false
    self.captureMode = .idle
    self.stopRecognition()

    // Drop anything that was recognised so far.
    self.lastTranscript = ""
    self.lastHeard = nil

    // Disarm the one-shot auto-stop machinery.
    self.pttAutoStopEnabled = false
    self.pttTimeoutTask?.cancel()
    self.pttTimeoutTask = nil
    self.resumeContinuousAfterPTT = false
    self.activePTTCaptureId = nil
    self.statusText = "Ready"

    let cancelledPayload = OpenClawTalkPTTStopPayload(
        captureId: sessionId,
        transcript: nil,
        status: "cancelled")
    self.finishPTTOnce(cancelledPayload)

    if restartContinuous {
        await self.start()
    }
    return cancelledPayload
}
private func startRecognition() throws {
@@ -369,7 +463,7 @@ final class TalkModeManager: NSObject {
self.silenceTask?.cancel()
self.silenceTask = Task { [weak self] in
guard let self else { return }
while self.isEnabled {
while self.isEnabled || (self.isPushToTalkActive && self.pttAutoStopEnabled) {
try? await Task.sleep(nanoseconds: 200_000_000)
await self.checkSilence()
}
@@ -377,13 +471,45 @@ final class TalkModeManager: NSObject {
}
private func checkSilence() async {
guard self.captureMode == .continuous else { return }
guard self.isListening, !self.isSpeaking else { return }
if self.captureMode == .continuous {
guard self.isListening, !self.isSpeaking else { return }
let transcript = self.lastTranscript.trimmingCharacters(in: .whitespacesAndNewlines)
guard !transcript.isEmpty else { return }
guard let lastHeard else { return }
if Date().timeIntervalSince(lastHeard) < self.silenceWindow { return }
await self.processTranscript(transcript, restartAfter: true)
return
}
guard self.captureMode == .pushToTalk, self.pttAutoStopEnabled else { return }
guard self.isListening, !self.isSpeaking, self.isPushToTalkActive else { return }
let transcript = self.lastTranscript.trimmingCharacters(in: .whitespacesAndNewlines)
guard !transcript.isEmpty else { return }
guard let lastHeard else { return }
if Date().timeIntervalSince(lastHeard) < self.silenceWindow { return }
await self.processTranscript(transcript, restartAfter: true)
_ = await self.endPushToTalk()
}
/// Guardrail for PTT once so we don't stay open indefinitely.
///
/// Replaces any previously scheduled timeout with a new one. When the delay elapses
/// `handlePTTTimeout()` is invoked. A non-positive (or NaN) duration schedules nothing.
///
/// - Parameter seconds: Maximum time the capture may stay open before the timeout fires.
private func schedulePTTTimeout(seconds: TimeInterval) {
    guard seconds > 0 else { return }
    // Cap the delay so the nanosecond conversion cannot exceed UInt64 (the conversion
    // traps on overflow, including for an infinite duration).
    let cappedSeconds = min(seconds, 1_000_000_000)
    let nanos = UInt64(cappedSeconds * 1_000_000_000)
    self.pttTimeoutTask?.cancel()
    self.pttTimeoutTask = Task { [weak self] in
        do {
            try await Task.sleep(nanoseconds: nanos)
        } catch {
            // Task.sleep throws as soon as the task is cancelled. A cancelled timeout must
            // not fire, otherwise a stale task could end a capture that started later.
            return
        }
        await self?.handlePTTTimeout()
    }
}
/// Timeout callback: ends the capture via `endPushToTalk()`, but only while auto-stop
/// is enabled and a push-to-talk capture is active.
private func handlePTTTimeout() async {
    let shouldAutoStop = self.pttAutoStopEnabled && self.isPushToTalkActive
    if shouldAutoStop {
        _ = await self.endPushToTalk()
    }
}
/// Resumes the pending one-shot continuation with `payload`, if one is stored.
///
/// The stored continuation is cleared before it is resumed, so a second call is a no-op.
///
/// - Parameter payload: Result handed to the caller waiting in `runPushToTalkOnce`.
private func finishPTTOnce(_ payload: OpenClawTalkPTTStopPayload) {
    if let pending = self.pttCompletion {
        self.pttCompletion = nil
        pending.resume(returning: payload)
    }
}
private func processTranscript(_ transcript: String, restartAfter: Bool) async {
@@ -890,5 +1016,13 @@ extension TalkModeManager {
self.lastTranscript = transcript
self.lastHeard = Date()
}
/// Test hook: sets `lastHeard` to `seconds` before the current time.
func _test_backdateLastHeard(seconds: TimeInterval) {
    let backdated = Date(timeIntervalSinceNow: -seconds)
    self.lastHeard = backdated
}
/// Test hook: runs a single `checkSilence()` pass directly.
func _test_runSilenceCheck() async {
await self.checkSilence()
}
}
#endif

View File

@@ -104,6 +104,8 @@ private func withUserDefaults<T>(_ updates: [String: Any?], _ body: () throws ->
#expect(commands.contains(OpenClawRemindersCommand.add.rawValue))
#expect(commands.contains(OpenClawTalkCommand.pttStart.rawValue))
#expect(commands.contains(OpenClawTalkCommand.pttStop.rawValue))
#expect(commands.contains(OpenClawTalkCommand.pttCancel.rawValue))
#expect(commands.contains(OpenClawTalkCommand.pttOnce.rawValue))
}
}

View File

@@ -167,6 +167,69 @@ private func makeTestAppModel(
talkMode: talkMode)
}
/// Builds a `NodeAppModel` for talk/PTT tests: every service is a stub with fixed
/// fixture payloads, and only `talkMode` is supplied by the caller.
@MainActor
private func makeTalkTestAppModel(talkMode: TalkModeManager) -> NodeAppModel {
    // Device status and info fixtures.
    let batteryStatus = OpenClawBatteryStatusPayload(level: 0.5, state: .unplugged, lowPowerModeEnabled: false)
    let thermalStatus = OpenClawThermalStatusPayload(state: .nominal)
    let storageStatus = OpenClawStorageStatusPayload(totalBytes: 10, freeBytes: 5, usedBytes: 5)
    let networkStatus = OpenClawNetworkStatusPayload(
        status: .satisfied,
        isExpensive: false,
        isConstrained: false,
        interfaces: [.wifi])
    let deviceStatus = OpenClawDeviceStatusPayload(
        battery: batteryStatus,
        thermal: thermalStatus,
        storage: storageStatus,
        network: networkStatus,
        uptimeSeconds: 1)
    let deviceInfo = OpenClawDeviceInfoPayload(
        deviceName: "Test",
        modelIdentifier: "Test1,1",
        systemName: "iOS",
        systemVersion: "1.0",
        appVersion: "dev",
        appBuild: "0",
        locale: "en-US")

    // Contacts fixture.
    let stubContact = OpenClawContactPayload(
        identifier: "c0",
        displayName: "",
        givenName: "",
        familyName: "",
        organizationName: "",
        phoneNumbers: [],
        emails: [])

    // Calendar fixture.
    let stubEvent = OpenClawCalendarEventPayload(
        identifier: "e0",
        title: "Test",
        startISO: "2024-01-01T00:00:00Z",
        endISO: "2024-01-01T00:10:00Z",
        isAllDay: false,
        location: nil,
        calendarTitle: nil)

    // Reminders fixture.
    let stubReminder = OpenClawReminderPayload(
        identifier: "r0",
        title: "Test",
        dueISO: nil,
        completed: false,
        listName: nil)

    // Motion fixture.
    let stubPedometer = OpenClawPedometerPayload(
        startISO: "2024-01-01T00:00:00Z",
        endISO: "2024-01-01T01:00:00Z",
        steps: nil,
        distanceMeters: nil,
        floorsAscended: nil,
        floorsDescended: nil)

    return makeTestAppModel(
        deviceStatusService: TestDeviceStatusService(
            statusPayload: deviceStatus,
            infoPayload: deviceInfo),
        photosService: TestPhotosService(payload: OpenClawPhotosLatestPayload(photos: [])),
        contactsService: TestContactsService(
            searchPayload: OpenClawContactsSearchPayload(contacts: []),
            addPayload: OpenClawContactsAddPayload(contact: stubContact)),
        calendarService: TestCalendarService(
            eventsPayload: OpenClawCalendarEventsPayload(events: []),
            addPayload: OpenClawCalendarAddPayload(event: stubEvent)),
        remindersService: TestRemindersService(
            listPayload: OpenClawRemindersListPayload(reminders: []),
            addPayload: OpenClawRemindersAddPayload(reminder: stubReminder)),
        motionService: TestMotionService(
            activityPayload: OpenClawMotionActivityPayload(activities: []),
            pedometerPayload: stubPedometer),
        talkMode: talkMode)
}
private func decodePayload<T: Decodable>(_ json: String?, as type: T.Type) throws -> T {
let data = try #require(json?.data(using: .utf8))
return try JSONDecoder().decode(type, from: data)
@@ -599,65 +662,7 @@ private func decodePayload<T: Decodable>(_ json: String?, as type: T.Type) throw
@Test @MainActor func handleInvokePushToTalkReturnsTranscriptStatus() async throws {
let talkMode = TalkModeManager(allowSimulatorCapture: true)
talkMode.updateGatewayConnected(false)
let appModel = makeTestAppModel(
deviceStatusService: TestDeviceStatusService(
statusPayload: OpenClawDeviceStatusPayload(
battery: OpenClawBatteryStatusPayload(level: 0.5, state: .unplugged, lowPowerModeEnabled: false),
thermal: OpenClawThermalStatusPayload(state: .nominal),
storage: OpenClawStorageStatusPayload(totalBytes: 10, freeBytes: 5, usedBytes: 5),
network: OpenClawNetworkStatusPayload(
status: .satisfied,
isExpensive: false,
isConstrained: false,
interfaces: [.wifi]),
uptimeSeconds: 1),
infoPayload: OpenClawDeviceInfoPayload(
deviceName: "Test",
modelIdentifier: "Test1,1",
systemName: "iOS",
systemVersion: "1.0",
appVersion: "dev",
appBuild: "0",
locale: "en-US")),
photosService: TestPhotosService(payload: OpenClawPhotosLatestPayload(photos: [])),
contactsService: TestContactsService(
searchPayload: OpenClawContactsSearchPayload(contacts: []),
addPayload: OpenClawContactsAddPayload(contact: OpenClawContactPayload(
identifier: "c0",
displayName: "",
givenName: "",
familyName: "",
organizationName: "",
phoneNumbers: [],
emails: []))),
calendarService: TestCalendarService(
eventsPayload: OpenClawCalendarEventsPayload(events: []),
addPayload: OpenClawCalendarAddPayload(event: OpenClawCalendarEventPayload(
identifier: "e0",
title: "Test",
startISO: "2024-01-01T00:00:00Z",
endISO: "2024-01-01T00:10:00Z",
isAllDay: false,
location: nil,
calendarTitle: nil))),
remindersService: TestRemindersService(
listPayload: OpenClawRemindersListPayload(reminders: []),
addPayload: OpenClawRemindersAddPayload(reminder: OpenClawReminderPayload(
identifier: "r0",
title: "Test",
dueISO: nil,
completed: false,
listName: nil))),
motionService: TestMotionService(
activityPayload: OpenClawMotionActivityPayload(activities: []),
pedometerPayload: OpenClawPedometerPayload(
startISO: "2024-01-01T00:00:00Z",
endISO: "2024-01-01T01:00:00Z",
steps: nil,
distanceMeters: nil,
floorsAscended: nil,
floorsDescended: nil)),
talkMode: talkMode)
let appModel = makeTalkTestAppModel(talkMode: talkMode)
let startReq = BridgeInvokeRequest(id: "ptt-start", command: OpenClawTalkCommand.pttStart.rawValue)
let startRes = await appModel._test_handleInvoke(startReq)
@@ -676,6 +681,48 @@ private func decodePayload<T: Decodable>(_ json: String?, as type: T.Type) throw
#expect(stopPayload.status == "offline")
}
/// Starting PTT and then invoking `talk.ptt.cancel` must report the same capture id
/// with status "cancelled".
@Test @MainActor func handleInvokePushToTalkCancelStopsSession() async throws {
    let manager = TalkModeManager(allowSimulatorCapture: true)
    manager.updateGatewayConnected(false)
    let model = makeTalkTestAppModel(talkMode: manager)

    // Open a capture session.
    let startResponse = await model._test_handleInvoke(
        BridgeInvokeRequest(id: "ptt-start", command: OpenClawTalkCommand.pttStart.rawValue))
    #expect(startResponse.ok == true)
    let started = try decodePayload(startResponse.payloadJSON, as: OpenClawTalkPTTStartPayload.self)

    // Cancel it and inspect the reply.
    let cancelResponse = await model._test_handleInvoke(
        BridgeInvokeRequest(id: "ptt-cancel", command: OpenClawTalkCommand.pttCancel.rawValue))
    #expect(cancelResponse.ok == true)
    let cancelled = try decodePayload(cancelResponse.payloadJSON, as: OpenClawTalkPTTStopPayload.self)

    #expect(cancelled.captureId == started.captureId)
    #expect(cancelled.status == "cancelled")
}
/// `talk.ptt.once` must finish on its own once a seeded transcript has been followed by
/// silence, returning that transcript with status "offline" (the gateway is marked
/// disconnected below).
@Test @MainActor func handleInvokePushToTalkOnceAutoStopsAfterSilence() async throws {
let talkMode = TalkModeManager(allowSimulatorCapture: true)
talkMode.updateGatewayConnected(false)
let appModel = makeTalkTestAppModel(talkMode: talkMode)
let onceReq = BridgeInvokeRequest(id: "ptt-once", command: OpenClawTalkCommand.pttOnce.rawValue)
// The invoke suspends until the capture ends, so run it in a separate task.
let onceTask = Task { await appModel._test_handleInvoke(onceReq) }
// Yield up to five times to let the invoke task reach the point where capture is active.
for _ in 0..<5 where !talkMode.isPushToTalkActive {
await Task.yield()
}
#expect(talkMode.isPushToTalkActive == true)
// Simulate speech heard one second ago, then run one silence check by hand.
talkMode._test_seedTranscript("Hello from PTT once")
talkMode._test_backdateLastHeard(seconds: 1.0)
await talkMode._test_runSilenceCheck()
let onceRes = await onceTask.value
#expect(onceRes.ok == true)
let oncePayload = try decodePayload(onceRes.payloadJSON, as: OpenClawTalkPTTStopPayload.self)
#expect(oncePayload.transcript == "Hello from PTT once")
#expect(oncePayload.status == "offline")
}
@Test @MainActor func handleDeepLinkSetsErrorWhenNotConnected() async {
let appModel = NodeAppModel()
let url = URL(string: "openclaw://agent?message=hello")!

View File

@@ -3,6 +3,8 @@ import Foundation
/// Push-to-talk (PTT) command identifiers; the raw value is the command string
/// carried by an invoke request.
public enum OpenClawTalkCommand: String, Codable, Sendable {
/// Begins a push-to-talk capture; the reply payload carries the capture id.
case pttStart = "talk.ptt.start"
/// Ends the active capture; the reply payload carries a transcript and a status.
case pttStop = "talk.ptt.stop"
/// Aborts the active capture; the reply has no transcript and status "cancelled"
/// (or "idle" when nothing was capturing).
case pttCancel = "talk.ptt.cancel"
/// Runs one capture that stops by itself (after silence or a timeout) and replies
/// with the resulting stop payload.
case pttOnce = "talk.ptt.once"
}
public struct OpenClawTalkPTTStartPayload: Codable, Sendable, Equatable {

View File

@@ -34,7 +34,7 @@ const MOTION_COMMANDS = ["motion.activity", "motion.pedometer"];
const SYSTEM_NOTIFY_COMMANDS = ["system.notify"];
const TALK_COMMANDS = ["talk.ptt.start", "talk.ptt.stop"];
const TALK_COMMANDS = ["talk.ptt.start", "talk.ptt.stop", "talk.ptt.cancel", "talk.ptt.once"];
const SYSTEM_COMMANDS = [
"system.run",