diff --git a/src/auto-reply/templating.ts b/src/auto-reply/templating.ts
index 605a0fa691..d71d39ce38 100644
--- a/src/auto-reply/templating.ts
+++ b/src/auto-reply/templating.ts
@@ -60,9 +60,12 @@ export type MsgContext = {
   MediaPath?: string;
   MediaUrl?: string;
   MediaType?: string;
+  MediaDir?: string;
   MediaPaths?: string[];
   MediaUrls?: string[];
   MediaTypes?: string[];
+  OutputDir?: string;
+  OutputBase?: string;
   /** Remote host for SCP when media lives on a different machine (e.g., clawdbot@192.168.64.3). */
   MediaRemoteHost?: string;
   Transcript?: string;
diff --git a/src/media-understanding/defaults.ts b/src/media-understanding/defaults.ts
index 570f738e7e..b4e443d20d 100644
--- a/src/media-understanding/defaults.ts
+++ b/src/media-understanding/defaults.ts
@@ -29,7 +29,7 @@ export const DEFAULT_PROMPT: Record<string, string> = {
 export const DEFAULT_VIDEO_MAX_BASE64_BYTES = 70 * MB;
 export const DEFAULT_AUDIO_MODELS: Record<string, string> = {
   groq: "whisper-large-v3-turbo",
-  openai: "whisper-1",
+  openai: "gpt-4o-mini-transcribe",
   deepgram: "nova-3",
 };
 export const CLI_OUTPUT_MAX_BUFFER = 5 * MB;
diff --git a/src/media-understanding/providers/google/audio.ts b/src/media-understanding/providers/google/audio.ts
new file mode 100644
index 0000000000..52a7136d2c
--- /dev/null
+++ b/src/media-understanding/providers/google/audio.ts
@@ -0,0 +1,84 @@
+import type { AudioTranscriptionRequest, AudioTranscriptionResult } from "../../types.js";
+import { normalizeGoogleModelId } from "../../../agents/models-config.providers.js";
+import { fetchWithTimeout, normalizeBaseUrl, readErrorResponse } from "../shared.js";
+
+export const DEFAULT_GOOGLE_AUDIO_BASE_URL = "https://generativelanguage.googleapis.com/v1beta";
+const DEFAULT_GOOGLE_AUDIO_MODEL = "gemini-3-flash-preview";
+const DEFAULT_GOOGLE_AUDIO_PROMPT = "Transcribe the audio.";
+
+function resolveModel(model?: string): string {
+  const trimmed = model?.trim();
+  if (!trimmed) return DEFAULT_GOOGLE_AUDIO_MODEL;
+  return normalizeGoogleModelId(trimmed);
+}
+
+function resolvePrompt(prompt?: string): string {
+  const trimmed = prompt?.trim();
+  return trimmed || DEFAULT_GOOGLE_AUDIO_PROMPT;
+}
+
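+// Transcribes by posting the audio as inline base64 data to Gemini's
+// generateContent endpoint, then joining the first candidate's text parts.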
+export async function transcribeGeminiAudio(
+  params: AudioTranscriptionRequest,
+): Promise<AudioTranscriptionResult> {
+  const fetchFn = params.fetchFn ?? fetch;
+  const baseUrl = normalizeBaseUrl(params.baseUrl, DEFAULT_GOOGLE_AUDIO_BASE_URL);
+  const model = resolveModel(params.model);
+  const url = `${baseUrl}/models/${model}:generateContent`;
+
+  const headers = new Headers(params.headers);
+  if (!headers.has("content-type")) {
+    headers.set("content-type", "application/json");
+  }
+  if (!headers.has("x-goog-api-key")) {
+    headers.set("x-goog-api-key", params.apiKey);
+  }
+
+  const body = {
+    contents: [
+      {
+        role: "user",
+        parts: [
+          { text: resolvePrompt(params.prompt) },
+          {
+            inline_data: {
+              mime_type: params.mime ?? "audio/wav",
+              data: params.buffer.toString("base64"),
+            },
+          },
+        ],
+      },
+    ],
+  };
+
+  const res = await fetchWithTimeout(
+    url,
+    {
+      method: "POST",
+      headers,
+      body: JSON.stringify(body),
+    },
+    params.timeoutMs,
+    fetchFn,
+  );
+
+  if (!res.ok) {
+    const detail = await readErrorResponse(res);
+    const suffix = detail ? `: ${detail}` : "";
+    throw new Error(`Audio transcription failed (HTTP ${res.status})${suffix}`);
+  }
+
+  const payload = (await res.json()) as {
+    candidates?: Array<{
+      content?: { parts?: Array<{ text?: string }> };
+    }>;
+  };
+  const parts = payload.candidates?.[0]?.content?.parts ?? [];
+  const text = parts
+    .map((part) => part?.text?.trim())
+    .filter(Boolean)
+    .join("\n");
+  if (!text) {
+    throw new Error("Audio transcription response missing text");
+  }
+  return { text, model };
+}
diff --git a/src/media-understanding/providers/google/index.ts b/src/media-understanding/providers/google/index.ts
index 6b3d412ba8..50674aac39 100644
--- a/src/media-understanding/providers/google/index.ts
+++ b/src/media-understanding/providers/google/index.ts
@@ -1,10 +1,12 @@
 import type { MediaUnderstandingProvider } from "../../types.js";
 import { describeImageWithModel } from "../image.js";
+import { transcribeGeminiAudio } from "./audio.js";
 import { describeGeminiVideo } from "./video.js";
 
 export const googleProvider: MediaUnderstandingProvider = {
   id: "google",
   capabilities: ["image", "audio", "video"],
   describeImage: describeImageWithModel,
+  transcribeAudio: transcribeGeminiAudio,
   describeVideo: describeGeminiVideo,
 };
diff --git a/src/media-understanding/providers/openai/audio.ts b/src/media-understanding/providers/openai/audio.ts
index 65ac5735a0..acfe595a9e 100644
--- a/src/media-understanding/providers/openai/audio.ts
+++ b/src/media-understanding/providers/openai/audio.ts
@@ -4,7 +4,7 @@ import type { AudioTranscriptionRequest, AudioTranscriptionResult } from "../..
 import { fetchWithTimeout, normalizeBaseUrl, readErrorResponse } from "../shared.js";
 
 export const DEFAULT_OPENAI_AUDIO_BASE_URL = "https://api.openai.com/v1";
-const DEFAULT_OPENAI_AUDIO_MODEL = "whisper-1";
+const DEFAULT_OPENAI_AUDIO_MODEL = "gpt-4o-mini-transcribe";
 
 function resolveModel(model?: string): string {
   const trimmed = model?.trim();
diff --git a/src/media-understanding/runner.ts b/src/media-understanding/runner.ts
index eed797f261..2e9bccb083 100644
--- a/src/media-understanding/runner.ts
+++ b/src/media-understanding/runner.ts
@@ -1,3 +1,8 @@
+import { constants as fsConstants } from "node:fs";
+import fs from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+
 import type { ClawdbotConfig } from "../config/config.js";
 import type { MsgContext } from "../auto-reply/templating.js";
 import { applyTemplate } from "../auto-reply/templating.js";
@@ -16,9 +21,9 @@ import {
 } from "./defaults.js";
 import { isMediaUnderstandingSkipError, MediaUnderstandingSkipError } from "./errors.js";
 import {
-  resolveEntriesWithActiveFallback,
   resolveMaxBytes,
   resolveMaxChars,
+  resolveModelEntries,
   resolvePrompt,
   resolveScopeDecision,
   resolveTimeoutMs,
@@ -39,7 +44,15 @@ import {
 import { describeImageWithModel } from "./providers/image.js";
 import { estimateBase64Size, resolveVideoMaxBase64Bytes } from "./video.js";
 
-const AUTO_AUDIO_PROVIDERS = ["openai", "groq", "deepgram"] as const;
+const AUTO_AUDIO_KEY_PROVIDERS = ["openai", "groq", "deepgram", "google"] as const;
+const AUTO_IMAGE_KEY_PROVIDERS = ["openai", "anthropic", "google", "minimax"] as const;
+const AUTO_VIDEO_KEY_PROVIDERS = ["google"] as const;
+const DEFAULT_IMAGE_MODELS: Record<string, string> = {
+  openai: "gpt-5-mini",
+  anthropic: "claude-opus-4-5",
+  google: "gemini-3-flash-preview",
+  minimax: "MiniMax-VL-01",
+};
 
 export type ActiveMediaModel = {
   provider: string;
@@ -67,27 +80,363 @@ export function createMediaAttachmentCache(attachments: MediaAttachment[]): Medi
   return new MediaAttachmentCache(attachments);
 }
 
-async function resolveAutoAudioEntries(params: {
+const binaryCache = new Map<string, Promise<string | null>>();
+const geminiProbeCache = new Map<string, Promise<boolean>>();
+
+function expandHomeDir(value: string): string {
+  if (!value.startsWith("~")) return value;
+  const home = os.homedir();
+  if (value === "~") return home;
+  if (value.startsWith("~/")) return path.join(home, value.slice(2));
+  return value;
+}
+
+function hasPathSeparator(value: string): boolean {
+  return value.includes("/") || value.includes("\\");
+}
+
+function candidateBinaryNames(name: string): string[] {
+  if (process.platform !== "win32") return [name];
+  const ext = path.extname(name);
+  if (ext) return [name];
+  const pathext = (process.env.PATHEXT ?? ".EXE;.CMD;.BAT;.COM")
+    .split(";")
+    .map((item) => item.trim())
+    .filter(Boolean)
+    .map((item) => (item.startsWith(".") ? item : `.${item}`));
+  const unique = Array.from(new Set(pathext));
+  return [name, ...unique.map((item) => `${name}${item}`)];
+}
+
+async function isExecutable(filePath: string): Promise<boolean> {
+  try {
+    const stat = await fs.stat(filePath);
+    if (!stat.isFile()) return false;
+    if (process.platform === "win32") return true;
+    await fs.access(filePath, fsConstants.X_OK);
+    return true;
+  } catch {
+    return false;
+  }
+}
+
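+// Resolves a binary to a path: explicit paths get ~ expansion and are checked
+// directly; bare names are probed against each PATH entry, trying PATHEXT-style
+// suffixes on Windows. Lookups are cached as promises to dedupe concurrent calls.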
+async function findBinary(name: string): Promise<string | null> {
+  const cached = binaryCache.get(name);
+  if (cached) return cached;
+  const resolved = (async () => {
+    const direct = expandHomeDir(name.trim());
+    if (direct && hasPathSeparator(direct)) {
+      for (const candidate of candidateBinaryNames(direct)) {
+        if (await isExecutable(candidate)) return candidate;
+      }
+    }
+
+    const searchName = name.trim();
+    if (!searchName) return null;
+    const pathEntries = (process.env.PATH ?? "").split(path.delimiter);
+    const candidates = candidateBinaryNames(searchName);
+    for (const entryRaw of pathEntries) {
+      const entry = expandHomeDir(entryRaw.trim().replace(/^"(.*)"$/, "$1"));
+      if (!entry) continue;
+      for (const candidate of candidates) {
+        const fullPath = path.join(entry, candidate);
+        if (await isExecutable(fullPath)) return fullPath;
+      }
+    }
+
+    return null;
+  })();
+  binaryCache.set(name, resolved);
+  return resolved;
+}
+
+async function hasBinary(name: string): Promise<boolean> {
+  return Boolean(await findBinary(name));
+}
+
+async function fileExists(filePath?: string | null): Promise<boolean> {
+  if (!filePath) return false;
+  try {
+    await fs.stat(filePath);
+    return true;
+  } catch {
+    return false;
+  }
+}
+
+function extractLastJsonObject(raw: string): unknown {
+  const trimmed = raw.trim();
+  const start = trimmed.lastIndexOf("{");
+  if (start === -1) return null;
+  const slice = trimmed.slice(start);
+  try {
+    return JSON.parse(slice);
+  } catch {
+    return null;
+  }
+}
+
+function extractGeminiResponse(raw: string): string | null {
+  const payload = extractLastJsonObject(raw);
+  if (!payload || typeof payload !== "object") return null;
+  const response = (payload as { response?: unknown }).response;
+  if (typeof response !== "string") return null;
+  const trimmed = response.trim();
+  return trimmed || null;
+}
+
+function extractSherpaOnnxText(raw: string): string | null {
+  const tryParse = (value: string): string | null => {
+    const trimmed = value.trim();
+    if (!trimmed) return null;
+    const head = trimmed[0];
+    if (head !== "{" && head !== '"') return null;
+    try {
+      const parsed = JSON.parse(trimmed) as unknown;
+      if (typeof parsed === "string") {
+        return tryParse(parsed);
+      }
+      if (parsed && typeof parsed === "object") {
+        const text = (parsed as { text?: unknown }).text;
+        if (typeof text === "string" && text.trim()) {
+          return text.trim();
+        }
+      }
+    } catch {}
+    return null;
+  };
+
+  const direct = tryParse(raw);
+  if (direct) return direct;
+
+  const lines = raw
+    .split("\n")
+    .map((line) => line.trim())
+    .filter(Boolean);
+  for (let i = lines.length - 1; i >= 0; i -= 1) {
+    const parsed = tryParse(lines[i] ?? "");
+    if (parsed) return parsed;
+  }
+  return null;
+}
+
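+// Checks once (and caches) that the gemini CLI is actually usable, not merely
+// on PATH, by running a tiny JSON-mode prompt.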
+async function probeGeminiCli(): Promise<boolean> {
+  const cached = geminiProbeCache.get("gemini");
+  if (cached) return cached;
+  const resolved = (async () => {
+    if (!(await hasBinary("gemini"))) return false;
+    try {
+      const { stdout } = await runExec("gemini", ["--output-format", "json", "ok"], {
+        timeoutMs: 8000,
+      });
+      return Boolean(extractGeminiResponse(stdout) ?? stdout.toLowerCase().includes("ok"));
+    } catch {
+      return false;
+    }
+  })();
+  geminiProbeCache.set("gemini", resolved);
+  return resolved;
+}
+
+async function resolveLocalWhisperCppEntry(): Promise<MediaUnderstandingModelConfig | null> {
+  if (!(await hasBinary("whisper-cli"))) return null;
+  const envModel = process.env.WHISPER_CPP_MODEL?.trim();
+  const defaultModel = "/opt/homebrew/share/whisper-cpp/for-tests-ggml-tiny.bin";
+  const modelPath = envModel && (await fileExists(envModel)) ? envModel : defaultModel;
+  if (!(await fileExists(modelPath))) return null;
+  return {
+    type: "cli",
+    command: "whisper-cli",
+    args: ["-m", modelPath, "-otxt", "-of", "{{OutputBase}}", "-np", "-nt", "{{MediaPath}}"],
+  };
+}
+
+async function resolveLocalWhisperEntry(): Promise<MediaUnderstandingModelConfig | null> {
+  if (!(await hasBinary("whisper"))) return null;
+  return {
+    type: "cli",
+    command: "whisper",
+    args: [
+      "--model",
+      "turbo",
+      "--output_format",
+      "txt",
+      "--output_dir",
+      "{{OutputDir}}",
+      "--verbose",
+      "False",
+      "{{MediaPath}}",
+    ],
+  };
+}
+
+async function resolveSherpaOnnxEntry(): Promise<MediaUnderstandingModelConfig | null> {
+  if (!(await hasBinary("sherpa-onnx-offline"))) return null;
+  const modelDir = process.env.SHERPA_ONNX_MODEL_DIR?.trim();
+  if (!modelDir) return null;
+  const tokens = path.join(modelDir, "tokens.txt");
+  const encoder = path.join(modelDir, "encoder.onnx");
+  const decoder = path.join(modelDir, "decoder.onnx");
+  const joiner = path.join(modelDir, "joiner.onnx");
+  if (!(await fileExists(tokens))) return null;
+  if (!(await fileExists(encoder))) return null;
+  if (!(await fileExists(decoder))) return null;
+  if (!(await fileExists(joiner))) return null;
+  return {
+    type: "cli",
+    command: "sherpa-onnx-offline",
+    args: [
+      `--tokens=${tokens}`,
+      `--encoder=${encoder}`,
+      `--decoder=${decoder}`,
+      `--joiner=${joiner}`,
+      "{{MediaPath}}",
+    ],
+  };
+}
+
+async function resolveLocalAudioEntry(): Promise<MediaUnderstandingModelConfig | null> {
+  const sherpa = await resolveSherpaOnnxEntry();
+  if (sherpa) return sherpa;
+  const whisperCpp = await resolveLocalWhisperCppEntry();
+  if (whisperCpp) return whisperCpp;
+  return await resolveLocalWhisperEntry();
+}
+
+async function resolveGeminiCliEntry(
+  _capability: MediaUnderstandingCapability,
+): Promise<MediaUnderstandingModelConfig | null> {
+  if (!(await probeGeminiCli())) return null;
+  return {
+    type: "cli",
+    command: "gemini",
+    args: [
+      "--output-format",
+      "json",
+      "--allowed-tools",
+      "read_many_files",
+      "--include-directories",
+      "{{MediaDir}}",
+      "{{Prompt}}",
+      "Use read_many_files to read {{MediaPath}} and respond with only the text output.",
+    ],
+  };
+}
+
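+// Returns the first provider with a usable API key for the capability, trying
+// the active chat model's provider before the per-capability default lists.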
+async function resolveKeyEntry(params: {
   cfg: ClawdbotConfig;
   agentDir?: string;
   providerRegistry: ProviderRegistry;
-}): Promise<MediaUnderstandingModelConfig[]> {
-  const entries: MediaUnderstandingModelConfig[] = [];
-  for (const providerId of AUTO_AUDIO_PROVIDERS) {
-    const provider = getMediaUnderstandingProvider(providerId, params.providerRegistry);
-    if (!provider?.transcribeAudio) continue;
+  capability: MediaUnderstandingCapability;
+  activeModel?: ActiveMediaModel;
+}): Promise<MediaUnderstandingModelConfig | null> {
+  const { cfg, agentDir, providerRegistry, capability } = params;
+  const checkProvider = async (
+    providerId: string,
+    model?: string,
+  ): Promise<MediaUnderstandingModelConfig | null> => {
+    const provider = getMediaUnderstandingProvider(providerId, providerRegistry);
+    if (!provider) return null;
+    if (capability === "audio" && !provider.transcribeAudio) return null;
+    if (capability === "image" && !provider.describeImage) return null;
+    if (capability === "video" && !provider.describeVideo) return null;
     try {
-      await resolveApiKeyForProvider({
-        provider: providerId,
-        cfg: params.cfg,
-        agentDir: params.agentDir,
-      });
-      entries.push({ type: "provider", provider: providerId });
+      await resolveApiKeyForProvider({ provider: providerId, cfg, agentDir });
+      return { type: "provider" as const, provider: providerId, model };
     } catch {
-      continue;
+      return null;
     }
+  };
+
+  if (capability === "image") {
+    const activeProvider = params.activeModel?.provider?.trim();
+    if (activeProvider) {
+      const activeEntry = await checkProvider(activeProvider, params.activeModel?.model);
+      if (activeEntry) return activeEntry;
+    }
+    for (const providerId of AUTO_IMAGE_KEY_PROVIDERS) {
+      const model = DEFAULT_IMAGE_MODELS[providerId];
+      const entry = await checkProvider(providerId, model);
+      if (entry) return entry;
+    }
+    return null;
   }
-  return entries;
+
+  if (capability === "video") {
+    const activeProvider = params.activeModel?.provider?.trim();
+    if (activeProvider) {
+      const activeEntry = await checkProvider(activeProvider, params.activeModel?.model);
+      if (activeEntry) return activeEntry;
+    }
+    for (const providerId of AUTO_VIDEO_KEY_PROVIDERS) {
+      const entry = await checkProvider(providerId, undefined);
+      if (entry) return entry;
+    }
+    return null;
+  }
+
+  const activeProvider = params.activeModel?.provider?.trim();
+  if (activeProvider) {
+    const activeEntry = await checkProvider(activeProvider, params.activeModel?.model);
+    if (activeEntry) return activeEntry;
+  }
+  for (const providerId of AUTO_AUDIO_KEY_PROVIDERS) {
+    const entry = await checkProvider(providerId, undefined);
+    if (entry) return entry;
+  }
+  return null;
+}
+
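+// Auto-detection cascade when no media models are configured: the active
+// model's provider, then local transcribers (audio only), then the gemini CLI,
+// then any provider from the key lists above.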
+async function resolveAutoEntries(params: {
+  cfg: ClawdbotConfig;
+  agentDir?: string;
+  providerRegistry: ProviderRegistry;
+  capability: MediaUnderstandingCapability;
+  activeModel?: ActiveMediaModel;
+}): Promise<MediaUnderstandingModelConfig[]> {
+  const activeEntry = await resolveActiveModelEntry(params);
+  if (activeEntry) return [activeEntry];
+  if (params.capability === "audio") {
+    const localAudio = await resolveLocalAudioEntry();
+    if (localAudio) return [localAudio];
+  }
+  const gemini = await resolveGeminiCliEntry(params.capability);
+  if (gemini) return [gemini];
+  const keys = await resolveKeyEntry(params);
+  if (keys) return [keys];
+  return [];
+}
+
+async function resolveActiveModelEntry(params: {
+  cfg: ClawdbotConfig;
+  agentDir?: string;
+  providerRegistry: ProviderRegistry;
+  capability: MediaUnderstandingCapability;
+  activeModel?: ActiveMediaModel;
+}): Promise<MediaUnderstandingModelConfig | null> {
+  const activeProviderRaw = params.activeModel?.provider?.trim();
+  if (!activeProviderRaw) return null;
+  const providerId = normalizeMediaProviderId(activeProviderRaw);
+  if (!providerId) return null;
+  const provider = getMediaUnderstandingProvider(providerId, params.providerRegistry);
+  if (!provider) return null;
+  if (params.capability === "audio" && !provider.transcribeAudio) return null;
+  if (params.capability === "image" && !provider.describeImage) return null;
+  if (params.capability === "video" && !provider.describeVideo) return null;
+  try {
+    await resolveApiKeyForProvider({
+      provider: providerId,
+      cfg: params.cfg,
+      agentDir: params.agentDir,
+    });
+  } catch {
+    return null;
+  }
+  return {
+    type: "provider",
+    provider: providerId,
+    model: params.activeModel?.model,
+  };
 }
 
 function trimOutput(text: string, maxChars?: number): string {
@@ -96,6 +445,74 @@ function trimOutput(text: string, maxChars?: number): string {
   return trimmed.slice(0, maxChars).trim();
 }
 
+function commandBase(command: string): string {
+  return path.parse(command).name;
+}
+
+function findArgValue(args: string[], keys: string[]): string | undefined {
+  for (let i = 0; i < args.length; i += 1) {
+    if (keys.includes(args[i] ?? "")) {
+      const value = args[i + 1];
+      if (value) return value;
+    }
+  }
+  return undefined;
+}
+
+function hasArg(args: string[], keys: string[]): boolean {
+  return args.some((arg) => keys.includes(arg));
+}
+
+function resolveWhisperOutputPath(args: string[], mediaPath: string): string | null {
+  const outputDir = findArgValue(args, ["--output_dir", "-o"]);
+  const outputFormat = findArgValue(args, ["--output_format"]);
+  if (!outputDir || !outputFormat) return null;
+  const formats = outputFormat.split(",").map((value) => value.trim());
+  if (!formats.includes("txt")) return null;
+  const base = path.parse(mediaPath).name;
+  return path.join(outputDir, `${base}.txt`);
+}
+
+function resolveWhisperCppOutputPath(args: string[]): string | null {
+  if (!hasArg(args, ["-otxt", "--output-txt"])) return null;
+  const outputBase = findArgValue(args, ["-of", "--output-file"]);
+  if (!outputBase) return null;
+  return `${outputBase}.txt`;
+}
+
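+// Prefers the transcript file a CLI wrote (whisper/whisper-cli), then falls
+// back to tool-specific stdout parsing (gemini JSON, sherpa-onnx), then to
+// raw stdout.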
+async function resolveCliOutput(params: {
+  command: string;
+  args: string[];
+  stdout: string;
+  mediaPath: string;
+}): Promise<string> {
+  const commandId = commandBase(params.command);
+  const fileOutput =
+    commandId === "whisper-cli"
+      ? resolveWhisperCppOutputPath(params.args)
+      : commandId === "whisper"
+        ? resolveWhisperOutputPath(params.args, params.mediaPath)
+        : null;
+  if (fileOutput && (await fileExists(fileOutput))) {
+    try {
+      const content = await fs.readFile(fileOutput, "utf8");
+      if (content.trim()) return content.trim();
+    } catch {}
+  }
+
+  if (commandId === "gemini") {
+    const response = extractGeminiResponse(params.stdout);
+    if (response) return response;
+  }
+
+  if (commandId === "sherpa-onnx-offline") {
+    const response = extractSherpaOnnxText(params.stdout);
+    if (response) return response;
+  }
+
+  return params.stdout.trim();
+}
+
 type ProviderQuery = Record<string, string | undefined>;
 
 function normalizeProviderQuery(
@@ -422,32 +839,48 @@ async function runCliEntry(params: {
     maxBytes,
     timeoutMs,
   });
+  const outputDir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-cli-"));
+  const mediaPath = pathResult.path;
+  const outputBase = path.join(outputDir, path.parse(mediaPath).name);
   const templCtx: MsgContext = {
     ...ctx,
-    MediaPath: pathResult.path,
+    MediaPath: mediaPath,
+    MediaDir: path.dirname(mediaPath),
+    OutputDir: outputDir,
+    OutputBase: outputBase,
     Prompt: prompt,
     MaxChars: maxChars,
   };
   const argv = [command, ...args].map((part, index) =>
     index === 0 ? part : applyTemplate(part, templCtx),
   );
-  if (shouldLogVerbose()) {
-    logVerbose(`Media understanding via CLI: ${argv.join(" ")}`);
+  try {
+    if (shouldLogVerbose()) {
+      logVerbose(`Media understanding via CLI: ${argv.join(" ")}`);
+    }
+    const { stdout } = await runExec(argv[0], argv.slice(1), {
+      timeoutMs,
+      maxBuffer: CLI_OUTPUT_MAX_BUFFER,
+    });
+    const resolved = await resolveCliOutput({
+      command,
+      args: argv.slice(1),
+      stdout,
+      mediaPath,
+    });
+    const text = trimOutput(resolved, maxChars);
+    if (!text) return null;
+    return {
+      kind: capability === "audio" ? "audio.transcription" : `${capability}.description`,
+      attachmentIndex: params.attachmentIndex,
+      text,
+      provider: "cli",
+      model: command,
+    };
+  } finally {
+    await fs.rm(outputDir, { recursive: true, force: true }).catch(() => {});
   }
-  const { stdout } = await runExec(argv[0], argv.slice(1), {
-    timeoutMs,
-    maxBuffer: CLI_OUTPUT_MAX_BUFFER,
-  });
-  const text = trimOutput(stdout, maxChars);
-  if (!text) return null;
-  return {
-    kind: capability === "audio" ? "audio.transcription" : `${capability}.description`,
-    attachmentIndex: params.attachmentIndex,
-    text,
-    provider: "cli",
-    model: command,
-  };
 }
 
 async function runAttachmentEntries(params: {
@@ -581,19 +1014,20 @@ export async function runCapability(params: {
     };
   }
 
-  const entries = resolveEntriesWithActiveFallback({
+  const entries = resolveModelEntries({
     cfg,
     capability,
     config,
     providerRegistry: params.providerRegistry,
-    activeModel: params.activeModel,
   });
   let resolvedEntries = entries;
-  if (resolvedEntries.length === 0 && capability === "audio") {
-    resolvedEntries = await resolveAutoAudioEntries({
+  if (resolvedEntries.length === 0) {
+    resolvedEntries = await resolveAutoEntries({
       cfg,
       agentDir: params.agentDir,
       providerRegistry: params.providerRegistry,
+      capability,
+      activeModel: params.activeModel,
    });
  }
  if (resolvedEntries.length === 0) {