feat: improve media auto-detect

Peter Steinberger
2026-01-23 05:47:09 +00:00
parent 1d9f230be4
commit 2dfbd1c1f6
6 changed files with 561 additions and 38 deletions

View File

@@ -60,9 +60,12 @@ export type MsgContext = {
MediaPath?: string;
MediaUrl?: string;
MediaType?: string;
MediaDir?: string;
MediaPaths?: string[];
MediaUrls?: string[];
MediaTypes?: string[];
OutputDir?: string;
OutputBase?: string;
/** Remote host for SCP when media lives on a different machine (e.g., clawdbot@192.168.64.3). */
MediaRemoteHost?: string;
Transcript?: string;
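// The added MediaDir/OutputDir/OutputBase fields back the {{MediaDir}}, {{OutputDir}},
// and {{OutputBase}} placeholders that the auto-detected CLI transcribers in this commit
// template into their arguments (whisper --output_dir, whisper-cli -of, gemini --include-directories).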

View File

@@ -29,7 +29,7 @@ export const DEFAULT_PROMPT: Record<MediaUnderstandingCapability, string> = {
export const DEFAULT_VIDEO_MAX_BASE64_BYTES = 70 * MB;
export const DEFAULT_AUDIO_MODELS: Record<string, string> = {
groq: "whisper-large-v3-turbo",
-openai: "whisper-1",
openai: "gpt-4o-mini-transcribe",
deepgram: "nova-3",
};
export const CLI_OUTPUT_MAX_BUFFER = 5 * MB;

View File

@@ -0,0 +1,84 @@
import type { AudioTranscriptionRequest, AudioTranscriptionResult } from "../../types.js";
import { normalizeGoogleModelId } from "../../../agents/models-config.providers.js";
import { fetchWithTimeout, normalizeBaseUrl, readErrorResponse } from "../shared.js";
export const DEFAULT_GOOGLE_AUDIO_BASE_URL = "https://generativelanguage.googleapis.com/v1beta";
const DEFAULT_GOOGLE_AUDIO_MODEL = "gemini-3-flash-preview";
const DEFAULT_GOOGLE_AUDIO_PROMPT = "Transcribe the audio.";
function resolveModel(model?: string): string {
const trimmed = model?.trim();
if (!trimmed) return DEFAULT_GOOGLE_AUDIO_MODEL;
return normalizeGoogleModelId(trimmed);
}
function resolvePrompt(prompt?: string): string {
const trimmed = prompt?.trim();
return trimmed || DEFAULT_GOOGLE_AUDIO_PROMPT;
}
export async function transcribeGeminiAudio(
params: AudioTranscriptionRequest,
): Promise<AudioTranscriptionResult> {
const fetchFn = params.fetchFn ?? fetch;
const baseUrl = normalizeBaseUrl(params.baseUrl, DEFAULT_GOOGLE_AUDIO_BASE_URL);
const model = resolveModel(params.model);
const url = `${baseUrl}/models/${model}:generateContent`;
const headers = new Headers(params.headers);
if (!headers.has("content-type")) {
headers.set("content-type", "application/json");
}
if (!headers.has("x-goog-api-key")) {
headers.set("x-goog-api-key", params.apiKey);
}
const body = {
contents: [
{
role: "user",
parts: [
{ text: resolvePrompt(params.prompt) },
{
inline_data: {
mime_type: params.mime ?? "audio/wav",
data: params.buffer.toString("base64"),
},
},
],
},
],
};
const res = await fetchWithTimeout(
url,
{
method: "POST",
headers,
body: JSON.stringify(body),
},
params.timeoutMs,
fetchFn,
);
if (!res.ok) {
const detail = await readErrorResponse(res);
const suffix = detail ? `: ${detail}` : "";
throw new Error(`Audio transcription failed (HTTP ${res.status})${suffix}`);
}
const payload = (await res.json()) as {
candidates?: Array<{
content?: { parts?: Array<{ text?: string }> };
}>;
};
const parts = payload.candidates?.[0]?.content?.parts ?? [];
const text = parts
.map((part) => part?.text?.trim())
.filter(Boolean)
.join("\n");
if (!text) {
throw new Error("Audio transcription response missing text");
}
return { text, model };
}
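// Usage sketch, assuming the caller already holds the audio bytes; the file path and
// env var are hypothetical, not part of this commit:
//
//   const result = await transcribeGeminiAudio({
//     apiKey: process.env.GEMINI_API_KEY ?? "",
//     buffer: await fs.readFile("/tmp/voice-note.wav"),
//     mime: "audio/wav",
//   });
//   console.log(result.model, result.text); // "gemini-3-flash-preview", "<transcript>"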

View File

@@ -1,10 +1,12 @@
import type { MediaUnderstandingProvider } from "../../types.js";
import { describeImageWithModel } from "../image.js";
import { transcribeGeminiAudio } from "./audio.js";
import { describeGeminiVideo } from "./video.js";
export const googleProvider: MediaUnderstandingProvider = {
id: "google",
capabilities: ["image", "audio", "video"],
describeImage: describeImageWithModel,
transcribeAudio: transcribeGeminiAudio,
describeVideo: describeGeminiVideo,
};

View File

@@ -4,7 +4,7 @@ import type { AudioTranscriptionRequest, AudioTranscriptionResult } from "../../
import { fetchWithTimeout, normalizeBaseUrl, readErrorResponse } from "../shared.js";
export const DEFAULT_OPENAI_AUDIO_BASE_URL = "https://api.openai.com/v1";
-const DEFAULT_OPENAI_AUDIO_MODEL = "whisper-1";
const DEFAULT_OPENAI_AUDIO_MODEL = "gpt-4o-mini-transcribe";
function resolveModel(model?: string): string {
const trimmed = model?.trim();

View File

@@ -1,3 +1,8 @@
import { constants as fsConstants } from "node:fs";
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import type { ClawdbotConfig } from "../config/config.js";
import type { MsgContext } from "../auto-reply/templating.js";
import { applyTemplate } from "../auto-reply/templating.js";
@@ -16,9 +21,9 @@ import {
} from "./defaults.js";
import { isMediaUnderstandingSkipError, MediaUnderstandingSkipError } from "./errors.js";
import {
-resolveEntriesWithActiveFallback,
resolveMaxBytes,
resolveMaxChars,
resolveModelEntries,
resolvePrompt,
resolveScopeDecision,
resolveTimeoutMs,
@@ -39,7 +44,15 @@ import {
import { describeImageWithModel } from "./providers/image.js";
import { estimateBase64Size, resolveVideoMaxBase64Bytes } from "./video.js";
-const AUTO_AUDIO_PROVIDERS = ["openai", "groq", "deepgram"] as const;
const AUTO_AUDIO_KEY_PROVIDERS = ["openai", "groq", "deepgram", "google"] as const;
const AUTO_IMAGE_KEY_PROVIDERS = ["openai", "anthropic", "google", "minimax"] as const;
const AUTO_VIDEO_KEY_PROVIDERS = ["google"] as const;
const DEFAULT_IMAGE_MODELS: Record<string, string> = {
openai: "gpt-5-mini",
anthropic: "claude-opus-4-5",
google: "gemini-3-flash-preview",
minimax: "MiniMax-VL-01",
};
export type ActiveMediaModel = {
provider: string;
@@ -67,27 +80,363 @@ export function createMediaAttachmentCache(attachments: MediaAttachment[]): MediaAttachmentCache {
return new MediaAttachmentCache(attachments);
}
-async function resolveAutoAudioEntries(params: {
const binaryCache = new Map<string, Promise<string | null>>();
const geminiProbeCache = new Map<string, Promise<boolean>>();
function expandHomeDir(value: string): string {
if (!value.startsWith("~")) return value;
const home = os.homedir();
if (value === "~") return home;
if (value.startsWith("~/")) return path.join(home, value.slice(2));
return value;
}
function hasPathSeparator(value: string): boolean {
return value.includes("/") || value.includes("\\");
}
function candidateBinaryNames(name: string): string[] {
if (process.platform !== "win32") return [name];
const ext = path.extname(name);
if (ext) return [name];
const pathext = (process.env.PATHEXT ?? ".EXE;.CMD;.BAT;.COM")
.split(";")
.map((item) => item.trim())
.filter(Boolean)
.map((item) => (item.startsWith(".") ? item : `.${item}`));
const unique = Array.from(new Set(pathext));
return [name, ...unique.map((item) => `${name}${item}`)];
}
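// Example with the fallback PATHEXT above: on win32,
// candidateBinaryNames("ffmpeg") → ["ffmpeg", "ffmpeg.EXE", "ffmpeg.CMD", "ffmpeg.BAT", "ffmpeg.COM"];
// a name that already carries an extension (or any name off Windows) passes through as [name].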
async function isExecutable(filePath: string): Promise<boolean> {
try {
const stat = await fs.stat(filePath);
if (!stat.isFile()) return false;
if (process.platform === "win32") return true;
await fs.access(filePath, fsConstants.X_OK);
return true;
} catch {
return false;
}
}
async function findBinary(name: string): Promise<string | null> {
const cached = binaryCache.get(name);
if (cached) return cached;
const resolved = (async () => {
const direct = expandHomeDir(name.trim());
if (direct && hasPathSeparator(direct)) {
for (const candidate of candidateBinaryNames(direct)) {
if (await isExecutable(candidate)) return candidate;
}
}
const searchName = name.trim();
if (!searchName) return null;
const pathEntries = (process.env.PATH ?? "").split(path.delimiter);
const candidates = candidateBinaryNames(searchName);
for (const entryRaw of pathEntries) {
const entry = expandHomeDir(entryRaw.trim().replace(/^"(.*)"$/, "$1"));
if (!entry) continue;
for (const candidate of candidates) {
const fullPath = path.join(entry, candidate);
if (await isExecutable(fullPath)) return fullPath;
}
}
return null;
})();
binaryCache.set(name, resolved);
return resolved;
}
async function hasBinary(name: string): Promise<boolean> {
return Boolean(await findBinary(name));
}
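// Resolution sketch: names containing a path separator are checked directly (with ~
// expanded); bare names are joined against each PATH entry. The resolved promise is
// memoized per name in binaryCache, so repeated probes cost a single PATH walk, e.g.
// await findBinary("whisper-cli") → "/opt/homebrew/bin/whisper-cli" (path hypothetical).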
async function fileExists(filePath?: string | null): Promise<boolean> {
if (!filePath) return false;
try {
await fs.stat(filePath);
return true;
} catch {
return false;
}
}
function extractLastJsonObject(raw: string): unknown {
const trimmed = raw.trim();
const start = trimmed.lastIndexOf("{");
if (start === -1) return null;
const slice = trimmed.slice(start);
try {
return JSON.parse(slice);
} catch {
return null;
}
}
function extractGeminiResponse(raw: string): string | null {
const payload = extractLastJsonObject(raw);
if (!payload || typeof payload !== "object") return null;
const response = (payload as { response?: unknown }).response;
if (typeof response !== "string") return null;
const trimmed = response.trim();
return trimmed || null;
}
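// Output shape assumed by this parser: `gemini --output-format json` ends its output
// with a JSON object whose "response" field holds the model text, e.g.
// extractGeminiResponse('Loaded 1 file\n{"response":"OK"}') → "OK"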
function extractSherpaOnnxText(raw: string): string | null {
const tryParse = (value: string): string | null => {
const trimmed = value.trim();
if (!trimmed) return null;
const head = trimmed[0];
if (head !== "{" && head !== '"') return null;
try {
const parsed = JSON.parse(trimmed) as unknown;
if (typeof parsed === "string") {
return tryParse(parsed);
}
if (parsed && typeof parsed === "object") {
const text = (parsed as { text?: unknown }).text;
if (typeof text === "string" && text.trim()) {
return text.trim();
}
}
} catch {}
return null;
};
const direct = tryParse(raw);
if (direct) return direct;
const lines = raw
.split("\n")
.map((line) => line.trim())
.filter(Boolean);
for (let i = lines.length - 1; i >= 0; i -= 1) {
const parsed = tryParse(lines[i] ?? "");
if (parsed) return parsed;
}
return null;
}
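// Parse sketch: sherpa-onnx-offline interleaves logs with a JSON result, sometimes
// double-encoded as a string; the helper tries the whole output first, then scans
// lines bottom-up: extractSherpaOnnxText('log line\n{"text": " hi there "}') → "hi there"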
async function probeGeminiCli(): Promise<boolean> {
const cached = geminiProbeCache.get("gemini");
if (cached) return cached;
const resolved = (async () => {
if (!(await hasBinary("gemini"))) return false;
try {
const { stdout } = await runExec("gemini", ["--output-format", "json", "ok"], {
timeoutMs: 8000,
});
return Boolean(extractGeminiResponse(stdout) ?? stdout.toLowerCase().includes("ok"));
} catch {
return false;
}
})();
geminiProbeCache.set("gemini", resolved);
return resolved;
}
async function resolveLocalWhisperCppEntry(): Promise<MediaUnderstandingModelConfig | null> {
if (!(await hasBinary("whisper-cli"))) return null;
const envModel = process.env.WHISPER_CPP_MODEL?.trim();
const defaultModel = "/opt/homebrew/share/whisper-cpp/for-tests-ggml-tiny.bin";
const modelPath = envModel && (await fileExists(envModel)) ? envModel : defaultModel;
if (!(await fileExists(modelPath))) return null;
return {
type: "cli",
command: "whisper-cli",
args: ["-m", modelPath, "-otxt", "-of", "{{OutputBase}}", "-np", "-nt", "{{MediaPath}}"],
};
}
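// Rendered command sketch, with {{OutputBase}} filled from the per-run temp dir that
// runCliEntry creates below (paths hypothetical):
// whisper-cli -m /opt/homebrew/share/whisper-cpp/for-tests-ggml-tiny.bin -otxt -of /tmp/clawdbot-media-cli-x/note -np -nt /tmp/note.ogg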
async function resolveLocalWhisperEntry(): Promise<MediaUnderstandingModelConfig | null> {
if (!(await hasBinary("whisper"))) return null;
return {
type: "cli",
command: "whisper",
args: [
"--model",
"turbo",
"--output_format",
"txt",
"--output_dir",
"{{OutputDir}}",
"--verbose",
"False",
"{{MediaPath}}",
],
};
}
async function resolveSherpaOnnxEntry(): Promise<MediaUnderstandingModelConfig | null> {
if (!(await hasBinary("sherpa-onnx-offline"))) return null;
const modelDir = process.env.SHERPA_ONNX_MODEL_DIR?.trim();
if (!modelDir) return null;
const tokens = path.join(modelDir, "tokens.txt");
const encoder = path.join(modelDir, "encoder.onnx");
const decoder = path.join(modelDir, "decoder.onnx");
const joiner = path.join(modelDir, "joiner.onnx");
if (!(await fileExists(tokens))) return null;
if (!(await fileExists(encoder))) return null;
if (!(await fileExists(decoder))) return null;
if (!(await fileExists(joiner))) return null;
return {
type: "cli",
command: "sherpa-onnx-offline",
args: [
`--tokens=${tokens}`,
`--encoder=${encoder}`,
`--decoder=${decoder}`,
`--joiner=${joiner}`,
"{{MediaPath}}",
],
};
}
async function resolveLocalAudioEntry(): Promise<MediaUnderstandingModelConfig | null> {
const sherpa = await resolveSherpaOnnxEntry();
if (sherpa) return sherpa;
const whisperCpp = await resolveLocalWhisperCppEntry();
if (whisperCpp) return whisperCpp;
return await resolveLocalWhisperEntry();
}
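// Local transcriber precedence encoded above: sherpa-onnx-offline (when
// SHERPA_ONNX_MODEL_DIR holds tokens/encoder/decoder/joiner files), then
// whisper-cpp's whisper-cli, then the Python whisper CLI.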
async function resolveGeminiCliEntry(
_capability: MediaUnderstandingCapability,
): Promise<MediaUnderstandingModelConfig | null> {
if (!(await probeGeminiCli())) return null;
return {
type: "cli",
command: "gemini",
args: [
"--output-format",
"json",
"--allowed-tools",
"read_many_files",
"--include-directories",
"{{MediaDir}}",
"{{Prompt}}",
"Use read_many_files to read {{MediaPath}} and respond with only the text output.",
],
};
}
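// Rendered command sketch after templating (paths and prompt hypothetical):
// gemini --output-format json --allowed-tools read_many_files --include-directories /tmp/media \
//   "Transcribe the audio." "Use read_many_files to read /tmp/media/note.ogg and respond with only the text output."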
async function resolveKeyEntry(params: {
cfg: ClawdbotConfig;
agentDir?: string;
providerRegistry: ProviderRegistry;
-}): Promise<MediaUnderstandingModelConfig[]> {
-const entries: MediaUnderstandingModelConfig[] = [];
-for (const providerId of AUTO_AUDIO_PROVIDERS) {
-const provider = getMediaUnderstandingProvider(providerId, params.providerRegistry);
-if (!provider?.transcribeAudio) continue;
capability: MediaUnderstandingCapability;
activeModel?: ActiveMediaModel;
}): Promise<MediaUnderstandingModelConfig | null> {
const { cfg, agentDir, providerRegistry, capability } = params;
const checkProvider = async (
providerId: string,
model?: string,
): Promise<MediaUnderstandingModelConfig | null> => {
const provider = getMediaUnderstandingProvider(providerId, providerRegistry);
if (!provider) return null;
if (capability === "audio" && !provider.transcribeAudio) return null;
if (capability === "image" && !provider.describeImage) return null;
if (capability === "video" && !provider.describeVideo) return null;
try {
-await resolveApiKeyForProvider({
-provider: providerId,
-cfg: params.cfg,
-agentDir: params.agentDir,
-});
-entries.push({ type: "provider", provider: providerId });
await resolveApiKeyForProvider({ provider: providerId, cfg, agentDir });
return { type: "provider" as const, provider: providerId, model };
} catch {
-continue;
return null;
}
};
if (capability === "image") {
const activeProvider = params.activeModel?.provider?.trim();
if (activeProvider) {
const activeEntry = await checkProvider(activeProvider, params.activeModel?.model);
if (activeEntry) return activeEntry;
}
for (const providerId of AUTO_IMAGE_KEY_PROVIDERS) {
const model = DEFAULT_IMAGE_MODELS[providerId];
const entry = await checkProvider(providerId, model);
if (entry) return entry;
}
return null;
}
-return entries;
if (capability === "video") {
const activeProvider = params.activeModel?.provider?.trim();
if (activeProvider) {
const activeEntry = await checkProvider(activeProvider, params.activeModel?.model);
if (activeEntry) return activeEntry;
}
for (const providerId of AUTO_VIDEO_KEY_PROVIDERS) {
const entry = await checkProvider(providerId, undefined);
if (entry) return entry;
}
return null;
}
const activeProvider = params.activeModel?.provider?.trim();
if (activeProvider) {
const activeEntry = await checkProvider(activeProvider, params.activeModel?.model);
if (activeEntry) return activeEntry;
}
for (const providerId of AUTO_AUDIO_KEY_PROVIDERS) {
const entry = await checkProvider(providerId, undefined);
if (entry) return entry;
}
return null;
}
async function resolveAutoEntries(params: {
cfg: ClawdbotConfig;
agentDir?: string;
providerRegistry: ProviderRegistry;
capability: MediaUnderstandingCapability;
activeModel?: ActiveMediaModel;
}): Promise<MediaUnderstandingModelConfig[]> {
const activeEntry = await resolveActiveModelEntry(params);
if (activeEntry) return [activeEntry];
if (params.capability === "audio") {
const localAudio = await resolveLocalAudioEntry();
if (localAudio) return [localAudio];
}
const gemini = await resolveGeminiCliEntry(params.capability);
if (gemini) return [gemini];
const keys = await resolveKeyEntry(params);
if (keys) return [keys];
return [];
}
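// Net auto-detect order (from the three helpers above): the active model's provider
// when it supports the capability and a key resolves; for audio only, a local CLI
// transcriber; then the gemini CLI when the JSON probe succeeds; finally the first
// AUTO_*_KEY_PROVIDERS entry whose API key resolves.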
async function resolveActiveModelEntry(params: {
cfg: ClawdbotConfig;
agentDir?: string;
providerRegistry: ProviderRegistry;
capability: MediaUnderstandingCapability;
activeModel?: ActiveMediaModel;
}): Promise<MediaUnderstandingModelConfig | null> {
const activeProviderRaw = params.activeModel?.provider?.trim();
if (!activeProviderRaw) return null;
const providerId = normalizeMediaProviderId(activeProviderRaw);
if (!providerId) return null;
const provider = getMediaUnderstandingProvider(providerId, params.providerRegistry);
if (!provider) return null;
if (params.capability === "audio" && !provider.transcribeAudio) return null;
if (params.capability === "image" && !provider.describeImage) return null;
if (params.capability === "video" && !provider.describeVideo) return null;
try {
await resolveApiKeyForProvider({
provider: providerId,
cfg: params.cfg,
agentDir: params.agentDir,
});
} catch {
return null;
}
return {
type: "provider",
provider: providerId,
model: params.activeModel?.model,
};
}
function trimOutput(text: string, maxChars?: number): string {
@@ -96,6 +445,74 @@ function trimOutput(text: string, maxChars?: number): string {
return trimmed.slice(0, maxChars).trim();
}
function commandBase(command: string): string {
return path.parse(command).name;
}
function findArgValue(args: string[], keys: string[]): string | undefined {
for (let i = 0; i < args.length; i += 1) {
if (keys.includes(args[i] ?? "")) {
const value = args[i + 1];
if (value) return value;
}
}
return undefined;
}
function hasArg(args: string[], keys: string[]): boolean {
return args.some((arg) => keys.includes(arg));
}
function resolveWhisperOutputPath(args: string[], mediaPath: string): string | null {
const outputDir = findArgValue(args, ["--output_dir", "-o"]);
const outputFormat = findArgValue(args, ["--output_format"]);
if (!outputDir || !outputFormat) return null;
const formats = outputFormat.split(",").map((value) => value.trim());
if (!formats.includes("txt")) return null;
const base = path.parse(mediaPath).name;
return path.join(outputDir, `${base}.txt`);
}
function resolveWhisperCppOutputPath(args: string[]): string | null {
if (!hasArg(args, ["-otxt", "--output-txt"])) return null;
const outputBase = findArgValue(args, ["-of", "--output-file"]);
if (!outputBase) return null;
return `${outputBase}.txt`;
}
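// Examples of the output-path resolution above (paths hypothetical):
// resolveWhisperOutputPath(["--output_dir", "/tmp/out", "--output_format", "txt"], "/tmp/a.ogg") → "/tmp/out/a.txt"
// resolveWhisperCppOutputPath(["-otxt", "-of", "/tmp/out/a"]) → "/tmp/out/a.txt"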
async function resolveCliOutput(params: {
command: string;
args: string[];
stdout: string;
mediaPath: string;
}): Promise<string> {
const commandId = commandBase(params.command);
const fileOutput =
commandId === "whisper-cli"
? resolveWhisperCppOutputPath(params.args)
: commandId === "whisper"
? resolveWhisperOutputPath(params.args, params.mediaPath)
: null;
if (fileOutput && (await fileExists(fileOutput))) {
try {
const content = await fs.readFile(fileOutput, "utf8");
if (content.trim()) return content.trim();
} catch {}
}
if (commandId === "gemini") {
const response = extractGeminiResponse(params.stdout);
if (response) return response;
}
if (commandId === "sherpa-onnx-offline") {
const response = extractSherpaOnnxText(params.stdout);
if (response) return response;
}
return params.stdout.trim();
}
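// Output precedence: a .txt file written by whisper/whisper-cli wins, then the
// command-specific stdout parsers (gemini JSON, sherpa-onnx), then raw stdout.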
type ProviderQuery = Record<string, string | number | boolean>;
function normalizeProviderQuery(
@@ -422,32 +839,48 @@ async function runCliEntry(params: {
maxBytes,
timeoutMs,
});
const outputDir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-cli-"));
const mediaPath = pathResult.path;
const outputBase = path.join(outputDir, path.parse(mediaPath).name);
const templCtx: MsgContext = {
...ctx,
-MediaPath: pathResult.path,
MediaPath: mediaPath,
MediaDir: path.dirname(mediaPath),
OutputDir: outputDir,
OutputBase: outputBase,
Prompt: prompt,
MaxChars: maxChars,
};
const argv = [command, ...args].map((part, index) =>
index === 0 ? part : applyTemplate(part, templCtx),
);
-if (shouldLogVerbose()) {
-logVerbose(`Media understanding via CLI: ${argv.join(" ")}`);
try {
if (shouldLogVerbose()) {
logVerbose(`Media understanding via CLI: ${argv.join(" ")}`);
}
const { stdout } = await runExec(argv[0], argv.slice(1), {
timeoutMs,
maxBuffer: CLI_OUTPUT_MAX_BUFFER,
});
const resolved = await resolveCliOutput({
command,
args: argv.slice(1),
stdout,
mediaPath,
});
const text = trimOutput(resolved, maxChars);
if (!text) return null;
return {
kind: capability === "audio" ? "audio.transcription" : `${capability}.description`,
attachmentIndex: params.attachmentIndex,
text,
provider: "cli",
model: command,
};
} finally {
await fs.rm(outputDir, { recursive: true, force: true }).catch(() => {});
}
-const { stdout } = await runExec(argv[0], argv.slice(1), {
-timeoutMs,
-maxBuffer: CLI_OUTPUT_MAX_BUFFER,
-});
-const text = trimOutput(stdout, maxChars);
-if (!text) return null;
-return {
-kind: capability === "audio" ? "audio.transcription" : `${capability}.description`,
-attachmentIndex: params.attachmentIndex,
-text,
-provider: "cli",
-model: command,
-};
}
async function runAttachmentEntries(params: {
@@ -581,19 +1014,20 @@ export async function runCapability(params: {
};
}
-const entries = resolveEntriesWithActiveFallback({
const entries = resolveModelEntries({
cfg,
capability,
config,
providerRegistry: params.providerRegistry,
activeModel: params.activeModel,
});
let resolvedEntries = entries;
-if (resolvedEntries.length === 0 && capability === "audio") {
-resolvedEntries = await resolveAutoAudioEntries({
if (resolvedEntries.length === 0) {
resolvedEntries = await resolveAutoEntries({
cfg,
agentDir: params.agentDir,
providerRegistry: params.providerRegistry,
capability,
activeModel: params.activeModel,
});
}
if (resolvedEntries.length === 0) {