From 6cd3bc3a46f3b1e7ca38673db236ab11bdd6ff00 Mon Sep 17 00:00:00 2001 From: Mariano Belinky Date: Mon, 2 Feb 2026 16:41:49 +0000 Subject: [PATCH] iOS: improve gateway auto-connect and voice permissions --- IOS-PRIORITIES.md | 20 ++- .../Gateway/GatewayConnectionController.swift | 51 +++---- apps/ios/Sources/Model/NodeAppModel.swift | 4 + apps/ios/Sources/Status/StatusPill.swift | 8 +- apps/ios/Sources/Voice/TalkModeManager.swift | 130 +++++++++++++++--- apps/ios/Sources/Voice/VoiceWakeManager.swift | 102 ++++++++++++-- src/auto-reply/reply/commands-core.ts | 2 +- src/auto-reply/reply/commands-ptt.test.ts | 1 - src/auto-reply/reply/commands-ptt.ts | 4 +- src/cli/nodes-cli/register.talk.ts | 2 +- 10 files changed, 257 insertions(+), 67 deletions(-) diff --git a/IOS-PRIORITIES.md b/IOS-PRIORITIES.md index 5c4c8e9e27..043b85fd2e 100644 --- a/IOS-PRIORITIES.md +++ b/IOS-PRIORITIES.md @@ -1,6 +1,7 @@ # iOS App Priorities (OpenClaw / Moltbot) This report is based on repo code + docs in `/Users/mariano/Coding/openclaw`, with focus on: + - iOS Swift sources under `apps/ios/Sources` - Shared Swift packages under `apps/shared/OpenClawKit` - Gateway protocol + node docs in `docs/` @@ -9,11 +10,13 @@ This report is based on repo code + docs in `/Users/mariano/Coding/openclaw`, wi ## Current iOS state (what works today) **Gateway connectivity + pairing** + - Uses the unified Gateway WebSocket protocol with device identity + challenge signing (via `GatewayChannel` in OpenClawKit). - Discovery via Bonjour (`NWBrowser`) for `_openclaw-gw._tcp` plus manual host/port fallback and TLS pinning support (`apps/ios/Sources/Gateway/*`). - Stores gateway token/password in Keychain (`GatewaySettingsStore.swift`). **Node command handling** (implemented in `NodeAppModel.handleInvoke`) + - Canvas: `canvas.present`, `canvas.hide`, `canvas.navigate`, `canvas.eval`, `canvas.snapshot`. - A2UI: `canvas.a2ui.reset`, `canvas.a2ui.push`, `canvas.a2ui.pushJsonl`. - Camera: `camera.list`, `camera.snap`, `camera.clip`. @@ -22,19 +25,23 @@ This report is based on repo code + docs in `/Users/mariano/Coding/openclaw`, wi - Foreground gating: returns `NODE_BACKGROUND_UNAVAILABLE` for canvas/camera/screen when backgrounded. **Voice features** + - Voice Wake: continuous speech recognition with wake-word gating and gateway sync (`VoiceWakeManager.swift`). - Talk Mode: speech-to-text + chat.send + ElevenLabs streaming TTS + system voice fallback (`TalkModeManager.swift`). **Chat UI** + - Uses shared SwiftUI chat client (`OpenClawChatUI`) and Gateway chat APIs (`IOSGatewayChatTransport.swift`). **UI surface** + - Full-screen canvas with overlay controls for chat, settings, and Talk orb (`RootCanvas.swift`). - Settings for gateway selection, voice, camera, location, screen prevent-sleep, and debug flags (`SettingsTab.swift`). ## Protocol requirements the iOS app must honor From `docs/gateway/protocol.md` + `docs/nodes/index.md` + OpenClawKit: + - WebSocket `connect` handshake with `role: "node"`, `caps`, `commands`, and `permissions` claims. - Device identity + challenge signing on connect; device token persistence. - Respond to `node.invoke.request` with `node.invoke.result`. @@ -45,6 +52,7 @@ From `docs/gateway/protocol.md` + `docs/nodes/index.md` + OpenClawKit: **1) Declared commands exceed iOS implementation** `GatewayConnectionController.currentCommands()` includes: + - `system.run`, `system.which`, `system.notify`, `system.execApprovals.get`, `system.execApprovals.set` …but `NodeAppModel.handleInvoke` does not implement any `system.*` commands and will return `INVALID_REQUEST: unknown command` for them. This is a protocol-level mismatch: the gateway will believe iOS supports system execution + notifications, but the node cannot fulfill those requests. @@ -53,6 +61,7 @@ From `docs/gateway/protocol.md` + `docs/nodes/index.md` + OpenClawKit: iOS sends `permissions: [:]` in its connect options, while macOS node reports real permission states via `PermissionManager`. This means the gateway cannot reason about iOS permission availability even though camera/mic/location/screen limitations materially affect command success. **3) Canvas parity gaps** + - `canvas.hide` is currently a no-op on iOS (returns ok but doesn’t change UI). - `canvas.present` ignores placement params (macOS supports window placement). @@ -61,6 +70,7 @@ These may be acceptable platform limitations, but they should be explicitly hand ## iOS vs. macOS node feature parity macOS node mode (`apps/macos/Sources/OpenClaw/NodeMode/*`) supports: + - `system.run`, `system.which`, `system.notify`, `system.execApprovals.get/set`. - Permission reporting in `connect.permissions`. - Canvas window placement + hide. @@ -69,22 +79,22 @@ iOS currently implements the shared node surface (canvas/camera/screen/location ## Prioritized work items (ordered by importance) -1) **Fix the command/implementation mismatch for `system.*`** +1. **Fix the command/implementation mismatch for `system.*`** - Either remove `system.*` from iOS `currentCommands()` **or** implement iOS equivalents (at minimum `system.notify` via local notifications) with clear error semantics for unsupported actions. - This is the highest risk mismatch because it misleads the gateway and any operator about what the iOS node can actually do. -2) **Report real iOS permission state in `connect.permissions`** +2. **Report real iOS permission state in `connect.permissions`** - Mirror macOS behavior by sending camera/microphone/location/screen-recording permission flags. - This enables the gateway to make better decisions and reduces “it failed because permissions” surprises. -3) **Clarify/normalize iOS canvas behaviors** +3. **Clarify/normalize iOS canvas behaviors** - Decide how `canvas.hide` should behave on iOS (e.g., return to the local scaffold) and implement it. - Document that `canvas.present` ignores placement on iOS, or add a platform-specific best effort. -4) **Explicitly document platform deltas vs. macOS node** +4. **Explicitly document platform deltas vs. macOS node** - The docs currently describe `system.*` under “Nodes” and cite macOS/headless node support. iOS should be clearly marked as not supporting system exec to avoid incorrect user expectations. -5) **Release readiness (if the goal is to move beyond internal preview)** +5. **Release readiness (if the goal is to move beyond internal preview)** - Docs state the iOS app is “internal preview” (`docs/platforms/ios.md`). - If public distribution is desired, build out TestFlight/App Store release steps (fastlane exists in `apps/ios/fastlane/`). diff --git a/apps/ios/Sources/Gateway/GatewayConnectionController.swift b/apps/ios/Sources/Gateway/GatewayConnectionController.swift index eb4541158d..973cbc0d57 100644 --- a/apps/ios/Sources/Gateway/GatewayConnectionController.swift +++ b/apps/ios/Sources/Gateway/GatewayConnectionController.swift @@ -189,9 +189,13 @@ final class GatewayConnectionController { guard !manualHost.isEmpty else { return } let manualPort = defaults.integer(forKey: "gateway.manual.port") - let resolvedPort = manualPort > 0 ? manualPort : 18789 let manualTLS = defaults.bool(forKey: "gateway.manual.tls") let resolvedUseTLS = manualTLS || self.shouldForceTLS(host: manualHost) + guard let resolvedPort = self.resolveManualPort( + host: manualHost, + port: manualPort, + useTLS: resolvedUseTLS) + else { return } let stableID = self.manualStableID(host: manualHost, port: resolvedPort) let tlsParams = self.resolveManualTLSParams( @@ -215,6 +219,28 @@ final class GatewayConnectionController { return } + if let lastKnown = GatewaySettingsStore.loadLastGatewayConnection() { + let resolvedUseTLS = lastKnown.useTLS || self.shouldForceTLS(host: lastKnown.host) + let tlsParams = self.resolveManualTLSParams( + stableID: lastKnown.stableID, + tlsEnabled: resolvedUseTLS, + allowTOFUReset: self.shouldForceTLS(host: lastKnown.host)) + guard let url = self.buildGatewayURL( + host: lastKnown.host, + port: lastKnown.port, + useTLS: tlsParams?.required == true) + else { return } + + self.didAutoConnect = true + self.startAutoConnect( + url: url, + gatewayStableID: lastKnown.stableID, + tls: tlsParams, + token: token, + password: password) + return + } + let preferredStableID = defaults.string(forKey: "gateway.preferredStableID")? .trimmingCharacters(in: .whitespacesAndNewlines) ?? "" let lastDiscoveredStableID = defaults.string(forKey: "gateway.lastDiscoveredStableID")? @@ -241,8 +267,7 @@ final class GatewayConnectionController { return } - let lastKnown = GatewaySettingsStore.loadLastGatewayConnection() - if self.gateways.count == 1, lastKnown == nil, let gateway = self.gateways.first { + if self.gateways.count == 1, let gateway = self.gateways.first { guard let host = self.resolveGatewayHost(gateway) else { return } let port = gateway.gatewayPort ?? 18789 let tlsParams = self.resolveDiscoveredTLSParams(gateway: gateway) @@ -258,26 +283,6 @@ final class GatewayConnectionController { password: password) return } - - guard let lastKnown else { return } - let resolvedUseTLS = lastKnown.useTLS || self.shouldForceTLS(host: lastKnown.host) - let tlsParams = self.resolveManualTLSParams( - stableID: lastKnown.stableID, - tlsEnabled: resolvedUseTLS, - allowTOFUReset: self.shouldForceTLS(host: lastKnown.host)) - guard let url = self.buildGatewayURL( - host: lastKnown.host, - port: lastKnown.port, - useTLS: tlsParams?.required == true) - else { return } - - self.didAutoConnect = true - self.startAutoConnect( - url: url, - gatewayStableID: lastKnown.stableID, - tls: tlsParams, - token: token, - password: password) } private func updateLastDiscoveredGateway(from gateways: [GatewayDiscoveryModel.DiscoveredGateway]) { diff --git a/apps/ios/Sources/Model/NodeAppModel.swift b/apps/ios/Sources/Model/NodeAppModel.swift index 9db963a846..43434c7bde 100644 --- a/apps/ios/Sources/Model/NodeAppModel.swift +++ b/apps/ios/Sources/Model/NodeAppModel.swift @@ -247,8 +247,12 @@ final class NodeAppModel { switch phase { case .background: self.isBackgrounded = true + self.stopGatewayHealthMonitor() case .active, .inactive: self.isBackgrounded = false + if self.gatewayConnected { + self.startGatewayHealthMonitor() + } @unknown default: self.isBackgrounded = false } diff --git a/apps/ios/Sources/Status/StatusPill.swift b/apps/ios/Sources/Status/StatusPill.swift index cd81c011bb..df69835435 100644 --- a/apps/ios/Sources/Status/StatusPill.swift +++ b/apps/ios/Sources/Status/StatusPill.swift @@ -72,12 +72,6 @@ struct StatusPill: View { .lineLimit(1) } .transition(.opacity.combined(with: .move(edge: .top))) - } else { - Image(systemName: self.voiceWakeEnabled ? "mic.fill" : "mic.slash") - .font(.system(size: 13, weight: .semibold)) - .foregroundStyle(self.voiceWakeEnabled ? .primary : .secondary) - .accessibilityLabel(self.voiceWakeEnabled ? "Voice Wake enabled" : "Voice Wake disabled") - .transition(.opacity.combined(with: .move(edge: .top))) } } .padding(.vertical, 8) @@ -110,7 +104,7 @@ struct StatusPill: View { if let activity { return "\(self.gateway.title), \(activity.title)" } - return "\(self.gateway.title), Voice Wake \(self.voiceWakeEnabled ? "enabled" : "disabled")" + return self.gateway.title } private func updatePulse(for gateway: GatewayState, scenePhase: ScenePhase) { diff --git a/apps/ios/Sources/Voice/TalkModeManager.swift b/apps/ios/Sources/Voice/TalkModeManager.swift index 06b6c4085c..6f9aa82fd7 100644 --- a/apps/ios/Sources/Voice/TalkModeManager.swift +++ b/apps/ios/Sources/Voice/TalkModeManager.swift @@ -118,13 +118,17 @@ final class TalkModeManager: NSObject { let micOk = await Self.requestMicrophonePermission() guard micOk else { self.logger.warning("start blocked: microphone permission denied") - self.statusText = "Microphone permission denied" + self.statusText = Self.permissionMessage( + kind: "Microphone", + status: AVAudioSession.sharedInstance().recordPermission) return } let speechOk = await Self.requestSpeechPermission() guard speechOk else { self.logger.warning("start blocked: speech permission denied") - self.statusText = "Speech recognition permission denied" + self.statusText = Self.permissionMessage( + kind: "Speech recognition", + status: SFSpeechRecognizer.authorizationStatus()) return } @@ -210,14 +214,18 @@ final class TalkModeManager: NSObject { if !self.allowSimulatorCapture { let micOk = await Self.requestMicrophonePermission() guard micOk else { - self.statusText = "Microphone permission denied" + self.statusText = Self.permissionMessage( + kind: "Microphone", + status: AVAudioSession.sharedInstance().recordPermission) throw NSError(domain: "TalkMode", code: 4, userInfo: [ NSLocalizedDescriptionKey: "Microphone permission denied", ]) } let speechOk = await Self.requestSpeechPermission() guard speechOk else { - self.statusText = "Speech recognition permission denied" + self.statusText = Self.permissionMessage( + kind: "Speech recognition", + status: SFSpeechRecognizer.authorizationStatus()) throw NSError(domain: "TalkMode", code: 5, userInfo: [ NSLocalizedDescriptionKey: "Speech recognition permission denied", ]) @@ -1301,21 +1309,6 @@ final class TalkModeManager: NSObject { try session.setActive(true, options: []) } - private nonisolated static func requestMicrophonePermission() async -> Bool { - await withCheckedContinuation(isolation: nil) { cont in - AVAudioApplication.requestRecordPermission { ok in - cont.resume(returning: ok) - } - } - } - - private nonisolated static func requestSpeechPermission() async -> Bool { - await withCheckedContinuation(isolation: nil) { cont in - SFSpeechRecognizer.requestAuthorization { status in - cont.resume(returning: status == .authorized) - } - } - } } private struct IncrementalSpeechBuffer { @@ -1441,6 +1434,105 @@ private struct IncrementalSpeechBuffer { } } +extension TalkModeManager { + nonisolated static func requestMicrophonePermission() async -> Bool { + let session = AVAudioSession.sharedInstance() + switch session.recordPermission { + case .granted: + return true + case .denied: + return false + case .undetermined: + break + @unknown default: + return false + } + + return await self.requestPermissionWithTimeout { completion in + AVAudioSession.sharedInstance().requestRecordPermission { ok in + completion(ok) + } + } + } + + nonisolated static func requestSpeechPermission() async -> Bool { + let status = SFSpeechRecognizer.authorizationStatus() + switch status { + case .authorized: + return true + case .denied, .restricted: + return false + case .notDetermined: + break + @unknown default: + return false + } + + return await self.requestPermissionWithTimeout { completion in + SFSpeechRecognizer.requestAuthorization { authStatus in + completion(authStatus == .authorized) + } + } + } + + private nonisolated static func requestPermissionWithTimeout( + _ operation: @escaping @Sendable (@escaping (Bool) -> Void) -> Void) async -> Bool + { + do { + return try await AsyncTimeout.withTimeout( + seconds: 8, + onTimeout: { NSError(domain: "TalkMode", code: 6, userInfo: [ + NSLocalizedDescriptionKey: "permission request timed out", + ]) }, + operation: { + await withCheckedContinuation(isolation: nil) { cont in + Task { @MainActor in + operation { ok in + cont.resume(returning: ok) + } + } + } + }) + } catch { + return false + } + } + + static func permissionMessage( + kind: String, + status: AVAudioSession.RecordPermission) -> String + { + switch status { + case .denied: + return "\(kind) permission denied" + case .undetermined: + return "\(kind) permission not granted" + case .granted: + return "\(kind) permission denied" + @unknown default: + return "\(kind) permission denied" + } + } + + static func permissionMessage( + kind: String, + status: SFSpeechRecognizerAuthorizationStatus) -> String + { + switch status { + case .denied: + return "\(kind) permission denied" + case .restricted: + return "\(kind) permission restricted" + case .notDetermined: + return "\(kind) permission not granted" + case .authorized: + return "\(kind) permission denied" + @unknown default: + return "\(kind) permission denied" + } + } +} + private struct IncrementalSpeechContext { let apiKey: String? let voiceId: String? diff --git a/apps/ios/Sources/Voice/VoiceWakeManager.swift b/apps/ios/Sources/Voice/VoiceWakeManager.swift index 771b5a77a6..d4ed467d97 100644 --- a/apps/ios/Sources/Voice/VoiceWakeManager.swift +++ b/apps/ios/Sources/Voice/VoiceWakeManager.swift @@ -1,6 +1,7 @@ import AVFAudio import Foundation import Observation +import OpenClawKit import Speech import SwabbleKit @@ -159,14 +160,18 @@ final class VoiceWakeManager: NSObject { let micOk = await Self.requestMicrophonePermission() guard micOk else { - self.statusText = "Microphone permission denied" + self.statusText = Self.permissionMessage( + kind: "Microphone", + status: AVAudioSession.sharedInstance().recordPermission) self.isListening = false return } let speechOk = await Self.requestSpeechPermission() guard speechOk else { - self.statusText = "Speech recognition permission denied" + self.statusText = Self.permissionMessage( + kind: "Speech recognition", + status: SFSpeechRecognizer.authorizationStatus()) self.isListening = false return } @@ -364,20 +369,101 @@ final class VoiceWakeManager: NSObject { } private nonisolated static func requestMicrophonePermission() async -> Bool { - await withCheckedContinuation(isolation: nil) { cont in - AVAudioApplication.requestRecordPermission { ok in - cont.resume(returning: ok) + let session = AVAudioSession.sharedInstance() + switch session.recordPermission { + case .granted: + return true + case .denied: + return false + case .undetermined: + break + @unknown default: + return false + } + + return await self.requestPermissionWithTimeout { completion in + AVAudioSession.sharedInstance().requestRecordPermission { ok in + completion(ok) } } } private nonisolated static func requestSpeechPermission() async -> Bool { - await withCheckedContinuation(isolation: nil) { cont in - SFSpeechRecognizer.requestAuthorization { status in - cont.resume(returning: status == .authorized) + let status = SFSpeechRecognizer.authorizationStatus() + switch status { + case .authorized: + return true + case .denied, .restricted: + return false + case .notDetermined: + break + @unknown default: + return false + } + + return await self.requestPermissionWithTimeout { completion in + SFSpeechRecognizer.requestAuthorization { authStatus in + completion(authStatus == .authorized) } } } + + private nonisolated static func requestPermissionWithTimeout( + _ operation: @escaping @Sendable (@escaping (Bool) -> Void) -> Void) async -> Bool + { + do { + return try await AsyncTimeout.withTimeout( + seconds: 8, + onTimeout: { NSError(domain: "VoiceWake", code: 6, userInfo: [ + NSLocalizedDescriptionKey: "permission request timed out", + ]) }, + operation: { + await withCheckedContinuation(isolation: nil) { cont in + Task { @MainActor in + operation { ok in + cont.resume(returning: ok) + } + } + } + }) + } catch { + return false + } + } + + private static func permissionMessage( + kind: String, + status: AVAudioSession.RecordPermission) -> String + { + switch status { + case .denied: + return "\(kind) permission denied" + case .undetermined: + return "\(kind) permission not granted" + case .granted: + return "\(kind) permission denied" + @unknown default: + return "\(kind) permission denied" + } + } + + private static func permissionMessage( + kind: String, + status: SFSpeechRecognizerAuthorizationStatus) -> String + { + switch status { + case .denied: + return "\(kind) permission denied" + case .restricted: + return "\(kind) permission restricted" + case .notDetermined: + return "\(kind) permission not granted" + case .authorized: + return "\(kind) permission denied" + @unknown default: + return "\(kind) permission denied" + } + } } #if DEBUG diff --git a/src/auto-reply/reply/commands-core.ts b/src/auto-reply/reply/commands-core.ts index 4a4fa0a324..183481363b 100644 --- a/src/auto-reply/reply/commands-core.ts +++ b/src/auto-reply/reply/commands-core.ts @@ -22,7 +22,6 @@ import { import { handleModelsCommand } from "./commands-models.js"; import { handlePluginCommand } from "./commands-plugin.js"; import { handlePTTCommand } from "./commands-ptt.js"; -import { handleTtsCommands } from "./commands-tts.js"; import { handleAbortTrigger, handleActivationCommand, @@ -32,6 +31,7 @@ import { handleUsageCommand, } from "./commands-session.js"; import { handleSubagentsCommand } from "./commands-subagents.js"; +import { handleTtsCommands } from "./commands-tts.js"; import { routeReply } from "./route-reply.js"; let HANDLERS: CommandHandler[] | null = null; diff --git a/src/auto-reply/reply/commands-ptt.test.ts b/src/auto-reply/reply/commands-ptt.test.ts index be9b26c57d..eef4de0d5e 100644 --- a/src/auto-reply/reply/commands-ptt.test.ts +++ b/src/auto-reply/reply/commands-ptt.test.ts @@ -1,5 +1,4 @@ import { describe, expect, it, vi } from "vitest"; - import type { OpenClawConfig } from "../../config/config.js"; import type { MsgContext } from "../templating.js"; import { buildCommandContext, handleCommands } from "./commands.js"; diff --git a/src/auto-reply/reply/commands-ptt.ts b/src/auto-reply/reply/commands-ptt.ts index 1e4847de3b..f104b3f177 100644 --- a/src/auto-reply/reply/commands-ptt.ts +++ b/src/auto-reply/reply/commands-ptt.ts @@ -1,7 +1,7 @@ -import { logVerbose } from "../../globals.js"; -import { callGateway, randomIdempotencyKey } from "../../gateway/call.js"; import type { OpenClawConfig } from "../../config/config.js"; import type { CommandHandler } from "./commands-types.js"; +import { callGateway, randomIdempotencyKey } from "../../gateway/call.js"; +import { logVerbose } from "../../globals.js"; type NodeSummary = { nodeId: string; diff --git a/src/cli/nodes-cli/register.talk.ts b/src/cli/nodes-cli/register.talk.ts index 6bc77a198a..402fb5a50b 100644 --- a/src/cli/nodes-cli/register.talk.ts +++ b/src/cli/nodes-cli/register.talk.ts @@ -1,9 +1,9 @@ import type { Command } from "commander"; +import type { NodesRpcOpts } from "./types.js"; import { randomIdempotencyKey } from "../../gateway/call.js"; import { defaultRuntime } from "../../runtime.js"; import { runNodesCommand } from "./cli-utils.js"; import { callGatewayCli, nodesCallOpts, resolveNodeId } from "./rpc.js"; -import type { NodesRpcOpts } from "./types.js"; type PTTAction = { name: string;