From e1a3bab7e507b98c3de72de819e15fad96b971be Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 20 Dec 2025 12:53:09 +0000 Subject: [PATCH] feat(skills): add media/transcription helpers --- skills/brave-search/SKILL.md | 30 ++- skills/brave-search/scripts/content.mjs | 53 ++++++ skills/brave-search/scripts/search.mjs | 80 ++++++++ skills/nano-pdf/SKILL.md | 19 ++ skills/openai-image-gen/SKILL.md | 20 +- skills/openai-image-gen/scripts/gen.py | 172 ++++++++++++++++++ skills/openai-whisper-api/SKILL.md | 42 +++++ .../openai-whisper-api/scripts/transcribe.sh | 85 +++++++++ skills/video-frames/SKILL.md | 28 +++ skills/video-frames/scripts/frame.sh | 81 +++++++++ 10 files changed, 579 insertions(+), 31 deletions(-) create mode 100644 skills/brave-search/scripts/content.mjs create mode 100644 skills/brave-search/scripts/search.mjs create mode 100644 skills/nano-pdf/SKILL.md create mode 100644 skills/openai-image-gen/scripts/gen.py create mode 100644 skills/openai-whisper-api/SKILL.md create mode 100644 skills/openai-whisper-api/scripts/transcribe.sh create mode 100644 skills/video-frames/SKILL.md create mode 100644 skills/video-frames/scripts/frame.sh diff --git a/skills/brave-search/SKILL.md b/skills/brave-search/SKILL.md index 08d3ff65bb..12c8ebec1a 100644 --- a/skills/brave-search/SKILL.md +++ b/skills/brave-search/SKILL.md @@ -1,35 +1,29 @@ --- name: brave-search description: Web search and content extraction via Brave Search API. -metadata: {"clawdis":{"requires":{"bins":["node","npm"],"env":["BRAVE_API_KEY"]},"primaryEnv":"BRAVE_API_KEY","install":[{"id":"node-brew","kind":"brew","formula":"node","bins":["node","npm"],"label":"Install Node.js (brew)"}]}} +metadata: {"clawdis":{"requires":{"bins":["node"],"env":["BRAVE_API_KEY"]},"primaryEnv":"BRAVE_API_KEY"}} --- # Brave Search -Headless web search and content extraction using Brave Search. No browser required. 
- -## Setup (run once) - -```bash -cd ~/Projects/agent-scripts/skills/brave-search -npm ci -``` - -Needs env: `BRAVE_API_KEY`. +Headless web search (and lightweight content extraction) using Brave Search API. No browser required. ## Search ```bash -./search.js "query" # Basic search (5 results) -./search.js "query" -n 10 # More results -./search.js "query" --content # Include page content as markdown -./search.js "query" -n 3 --content # Combined +node {baseDir}/scripts/search.mjs "query" +node {baseDir}/scripts/search.mjs "query" -n 10 +node {baseDir}/scripts/search.mjs "query" --content +node {baseDir}/scripts/search.mjs "query" -n 3 --content ``` -## Extract Page Content +## Extract a page ```bash -./content.js https://example.com/article +node {baseDir}/scripts/content.mjs "https://example.com/article" ``` -Fetches a URL and extracts readable content as markdown. +Notes: +- Needs `BRAVE_API_KEY`. +- Content extraction is best-effort (good for articles; not for app-like sites). + - If a site is blocked or too JS-heavy, prefer the `summarize` skill (it can use a Firecrawl fallback). 
diff --git a/skills/brave-search/scripts/content.mjs b/skills/brave-search/scripts/content.mjs
new file mode 100644
index 0000000000..8a2b617e9c
--- /dev/null
+++ b/skills/brave-search/scripts/content.mjs
@@ -0,0 +1,72 @@
+#!/usr/bin/env node
+import { realpathSync } from "node:fs";
+import { pathToFileURL } from "node:url";
+
+/** Print CLI usage to stderr and exit with status 2. */
+function usage() {
+  console.error(`Usage: content.mjs <url>`);
+  process.exit(2);
+}
+
+/**
+ * Fetch a URL and reduce its HTML to a short markdown bullet list.
+ *
+ * Lightweight, dependency-free extraction: drops script/style/nav/footer/
+ * header blocks, strips the remaining tags, decodes a few common entities,
+ * and keeps at most the first 30 paragraphs.
+ *
+ * @param {string} url - URL to fetch.
+ * @returns {Promise<string>} One `- paragraph` line per paragraph, or a
+ *   `> Fetch failed (<status>).` note plus up to 2000 characters of cleaned
+ *   body text when the response is not OK.
+ */
+export async function fetchAsMarkdown(url) {
+  const resp = await fetch(url, {
+    headers: { "User-Agent": "clawdis-brave-search/1.0" },
+  });
+  const html = await resp.text();
+
+  const cleaned = html
+    .replace(/<script[\s\S]*?<\/script>/gi, " ")
+    .replace(/<style[\s\S]*?<\/style>/gi, " ")
+    .replace(/<(nav|footer|header)[\s\S]*?<\/\1>/gi, " ")
+    .replace(/<br\s*\/?>/gi, "\n")
+    .replace(/<\/p>/gi, "\n\n")
+    .replace(/<\/div>/gi, "\n")
+    .replace(/<[^>]+>/g, " ")
+    .replace(/&nbsp;/g, " ")
+    .replace(/&lt;/g, "<")
+    .replace(/&gt;/g, ">")
+    .replace(/&quot;/g, '"')
+    .replace(/&#39;/g, "'")
+    // Decode `&amp;` last so `&amp;lt;` stays the literal text `&lt;`.
+    .replace(/&amp;/g, "&")
+    // Strip trailing blanks per line. Using `\s+\n` here would also swallow
+    // the blank lines that separate paragraphs.
+    .replace(/[ \t]+\n/g, "\n")
+    .replace(/\n{3,}/g, "\n\n")
+    .replace(/[ \t]{2,}/g, " ")
+    .trim();
+
+  if (!resp.ok) {
+    return `> Fetch failed (${resp.status}).\n\n${cleaned.slice(0, 2000)}\n`;
+  }
+
+  const paras = cleaned
+    .split("\n\n")
+    .map((p) => p.trim())
+    .filter(Boolean)
+    .slice(0, 30);
+
+  return paras.map((p) => `- ${p}`).join("\n") + "\n";
+}
+
+// Run the CLI only when this file is the entry point. search.mjs imports
+// fetchAsMarkdown from here, and an unguarded CLI would treat the search
+// query in argv as a URL to fetch.
+const entry = process.argv[1];
+if (entry && import.meta.url === pathToFileURL(realpathSync(entry)).href) {
+  const args = process.argv.slice(2);
+  if (args.length === 0 || args[0] === "-h" || args[0] === "--help") usage();
+  process.stdout.write(await fetchAsMarkdown(args[0]));
+}
diff --git a/skills/brave-search/scripts/search.mjs b/skills/brave-search/scripts/search.mjs
new file mode 100644
index 0000000000..abda0863f4
--- /dev/null
+++ b/skills/brave-search/scripts/search.mjs
@@ -0,0 +1,98 @@
+#!/usr/bin/env node
+import { readFileSync } from "node:fs";
+import { fetchAsMarkdown } from "./content.mjs";
+
+/** Print CLI usage to stderr and exit with status 2. */
+function usage() {
+  console.error(`Usage: search.mjs "query" [-n 5] [--content]`);
+  process.exit(2);
+}
+
+const args = process.argv.slice(2);
+if (args.length === 0 || args[0] === "-h" || args[0] === "--help") usage();
+
+const query = args[0];
+let n = 5;
+let withContent = false;
+
+for (let i = 1; i < args.length; i++) {
+  const a = args[i];
+  if (a === "-n") {
+    const raw = args[i + 1] ?? "5";
+    n = Number.parseInt(raw, 10);
+    // NaN would be sent to the API as count=NaN, and a value below 1 makes
+    // the slice() on the results drop entries, so reject both up front.
+    if (!Number.isInteger(n) || n < 1) {
+      console.error(`Invalid value for -n: ${raw}`);
+      usage();
+    }
+    i++;
+    continue;
+  }
+  if (a === "--content") {
+    withContent = true;
+    continue;
+  }
+  console.error(`Unknown arg: ${a}`);
+  usage();
+}
+
+const apiKey = (process.env.BRAVE_API_KEY ?? "").trim();
+if (!apiKey) {
+  console.error("Missing BRAVE_API_KEY");
+  process.exit(1);
+}
+
+// Request at most 20 results, and slice the response by the same number.
+const count = Math.min(n, 20);
+
+const endpoint = new URL("https://api.search.brave.com/res/v1/web/search");
+endpoint.searchParams.set("q", query);
+endpoint.searchParams.set("count", String(count));
+endpoint.searchParams.set("text_decorations", "false");
+endpoint.searchParams.set("safesearch", "moderate");
+
+const resp = await fetch(endpoint, {
+  headers: {
+    Accept: "application/json",
+    "X-Subscription-Token": apiKey,
+  },
+});
+
+if (!resp.ok) {
+  const text = await resp.text().catch(() => "");
+  throw new Error(`Brave Search failed (${resp.status}): ${text}`);
+}
+
+const data = await resp.json();
+const results = (data?.web?.results ?? []).slice(0, count);
+
+const lines = [];
+for (const r of results) {
+  const title = String(r?.title ?? "").trim();
+  const url = String(r?.url ?? "").trim();
+  const desc = String(r?.description ?? "").trim();
+  if (!title || !url) continue;
+  lines.push(`- ${title}\n ${url}${desc ? `\n ${desc}` : ""}`);
+}
+
+process.stdout.write(lines.join("\n\n") + "\n");
+
+if (!withContent) process.exit(0);
+
+process.stdout.write("\n---\n\n");
+for (const r of results) {
+  const title = String(r?.title ?? "").trim();
+  const url = String(r?.url ?? "").trim();
+  if (!url) continue;
+  process.stdout.write(`# ${title || url}\n${url}\n\n`);
+  let text;
+  try {
+    text = await fetchAsMarkdown(url);
+  } catch (err) {
+    // Extraction is best-effort: report the failure inline and continue so
+    // one unreachable page does not abort the remaining results.
+    text = `> Fetch failed (${err?.message ?? err}).\n`;
+  }
+  process.stdout.write(text.trimEnd() + "\n\n");
+}
diff --git a/skills/nano-pdf/SKILL.md b/skills/nano-pdf/SKILL.md
new file mode 100644
index 0000000000..2ff85b9f6f
--- /dev/null
+++ b/skills/nano-pdf/SKILL.md
@@ -0,0 +1,19 @@
+---
+name: nano-pdf
+description: Edit PDFs with natural-language instructions using the nano-pdf CLI.
+metadata: {"clawdis":{"requires":{"bins":["nano-pdf"]},"install":[{"id":"pipx","kind":"shell","command":"python3 -m pip install --user pipx && python3 -m pipx ensurepath && pipx install nano-pdf","bins":["nano-pdf"],"label":"Install nano-pdf (pipx)"},{"id":"pip","kind":"shell","command":"python3 -m pip install --user nano-pdf","bins":["nano-pdf"],"label":"Install nano-pdf (pip --user)"}]}}
+---
+
+# nano-pdf
+
+Use `nano-pdf` to apply edits to a specific page in a PDF using a natural-language instruction.
+
+## Quick start
+
+```bash
+nano-pdf edit deck.pdf 1 "Change the title to 'Q3 Results' and fix the typo in the subtitle"
+```
+
+Notes:
+- Page numbers are 0-based or 1-based depending on the tool’s version/config; if the result looks off by one, retry with the other.
+- Always sanity-check the output PDF before sending it out.
diff --git a/skills/openai-image-gen/SKILL.md b/skills/openai-image-gen/SKILL.md
index b423b949c2..a6d1cab0d7 100644
--- a/skills/openai-image-gen/SKILL.md
+++ b/skills/openai-image-gen/SKILL.md
@@ -6,31 +6,25 @@ metadata: {"clawdis":{"requires":{"bins":["python3"],"env":["OPENAI_API_KEY"]},"
 
 # OpenAI Image Gen
 
-Generate a handful of "random but structured" prompts and render them via OpenAI Images API.
-
-## Setup
-
-- Needs env: `OPENAI_API_KEY`
+Generate a handful of “random but structured” prompts and render them via the OpenAI Images API.
 
 ## Run
 
-From any directory (outputs to `~/Projects/tmp/...` when present; else `./tmp/...`):
-
 ```bash
-python3 ~/Projects/agent-scripts/skills/openai-image-gen/scripts/gen.py
-open ~/Projects/tmp/openai-image-gen-*/index.html
+python3 {baseDir}/scripts/gen.py
+open ./tmp/openai-image-gen-*/index.html
 ```
 
 Useful flags:
 
 ```bash
-python3 ~/Projects/agent-scripts/skills/openai-image-gen/scripts/gen.py --count 16 --model gpt-image-1.5
-python3 ~/Projects/agent-scripts/skills/openai-image-gen/scripts/gen.py --prompt "ultra-detailed studio photo of a lobster astronaut" --count 4
-python3 ~/Projects/agent-scripts/skills/openai-image-gen/scripts/gen.py --size 1536x1024 --quality high --out-dir ./out/images
+python3 {baseDir}/scripts/gen.py --count 16 --model gpt-image-1
+python3 {baseDir}/scripts/gen.py --prompt "ultra-detailed studio photo of a lobster astronaut" --count 4
+python3 {baseDir}/scripts/gen.py --size 1536x1024 --quality high --out-dir ./out/images
 ```
 
 ## Output
 
 - `*.png` images
-- `prompts.json` (prompt to file mapping)
+- `prompts.json` (prompt → file mapping)
 - `index.html` (thumbnail gallery)
diff --git a/skills/openai-image-gen/scripts/gen.py b/skills/openai-image-gen/scripts/gen.py
new file mode 100644
index 0000000000..097b71c745
--- /dev/null
+++ b/skills/openai-image-gen/scripts/gen.py
@@ -0,0 +1,189 @@
+#!/usr/bin/env python3
+import argparse
+import base64
+import datetime as dt
+import html
+import json
+import os
+import random
+import re
+import sys
+import urllib.error
+import urllib.request
+from pathlib import Path
+
+
+def slugify(text: str) -> str:
+    """Return a lowercase ``a-z0-9`` slug of ``text``, or ``"image"`` if none remains."""
+    text = text.lower().strip()
+    text = re.sub(r"[^a-z0-9]+", "-", text)
+    text = re.sub(r"-{2,}", "-", text).strip("-")
+    return text or "image"
+
+
+def default_out_dir() -> Path:
+    """Return ``./tmp/openai-image-gen-<timestamp>``; creates ``./tmp`` if needed."""
+    now = dt.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
+    base = Path("./tmp")
+    base.mkdir(parents=True, exist_ok=True)
+    return base / f"openai-image-gen-{now}"
+
+
+def pick_prompts(count: int) -> list[str]:
+    """Build ``count`` random prompts of the form "<style> of <subject>, <lighting>"."""
+    subjects = [
+        "a lobster astronaut",
+        "a brutalist lighthouse",
+        "a cozy reading nook",
+        "a cyberpunk noodle shop",
+        "a Vienna street at dusk",
+        "a minimalist product photo",
+        "a surreal underwater library",
+    ]
+    styles = [
+        "ultra-detailed studio photo",
+        "35mm film still",
+        "isometric illustration",
+        "editorial photography",
+        "soft watercolor",
+        "architectural render",
+        "high-contrast monochrome",
+    ]
+    lighting = [
+        "golden hour",
+        "overcast soft light",
+        "neon lighting",
+        "dramatic rim light",
+        "candlelight",
+        "foggy atmosphere",
+    ]
+    return [
+        f"{random.choice(styles)} of {random.choice(subjects)}, {random.choice(lighting)}"
+        for _ in range(count)
+    ]
+
+
+def request_images(
+    api_key: str,
+    prompt: str,
+    model: str,
+    size: str,
+    quality: str,
+) -> dict:
+    """Request one image from the OpenAI Images API and return the decoded JSON.
+
+    Raises ``RuntimeError`` with the status code and response body when the
+    API answers with an HTTP error.
+    """
+    url = "https://api.openai.com/v1/images/generations"
+    payload = {
+        "model": model,
+        "prompt": prompt,
+        "size": size,
+        "quality": quality,
+        "n": 1,
+    }
+    # The GPT image models always return base64 and reject `response_format`;
+    # only the DALL-E models need it to switch from URLs to base64.
+    if model.startswith("dall-e"):
+        payload["response_format"] = "b64_json"
+    req = urllib.request.Request(
+        url,
+        method="POST",
+        headers={
+            "Authorization": f"Bearer {api_key}",
+            "Content-Type": "application/json",
+        },
+        data=json.dumps(payload).encode("utf-8"),
+    )
+    try:
+        with urllib.request.urlopen(req, timeout=300) as resp:
+            return json.loads(resp.read().decode("utf-8"))
+    except urllib.error.HTTPError as e:
+        detail = e.read().decode("utf-8", errors="replace")
+        raise RuntimeError(f"OpenAI Images API failed ({e.code}): {detail}") from e
+
+
+def write_gallery(out_dir: Path, items: list[dict]) -> None:
+    """Write ``index.html`` in ``out_dir`` with one thumbnail per item.
+
+    Each item is a dict with ``prompt`` and ``file`` keys. Values are
+    HTML-escaped because prompts are free text and may contain ``<`` or ``&``.
+    """
+    figures = []
+    for it in items:
+        src = html.escape(it["file"], quote=True)
+        caption = html.escape(it["prompt"])
+        figures.append(
+            f"""
+<figure>
+  <a href="{src}"><img src="{src}" loading="lazy" /></a>
+  <figcaption>{caption}</figcaption>
+</figure>
+""".strip()
+        )
+    thumbs = "\n".join(figures)
+    page = f"""<!doctype html>
+<meta charset="utf-8" />
+<title>openai-image-gen</title>
+<style>
+  body {{ margin: 24px; font: 14px/1.4 system-ui, sans-serif; }}
+  .grid {{ display: grid; grid-template-columns: repeat(auto-fill, minmax(240px, 1fr)); gap: 16px; }}
+  figure {{ margin: 0; }}
+  img {{ display: block; width: 100%; height: auto; }}
+  figcaption {{ margin-top: 8px; color: #555; }}
+</style>
+<h1>openai-image-gen</h1>
+<p>Output: <code>{html.escape(out_dir.as_posix())}</code></p>
+<div class="grid">
+{thumbs}
+</div>
+"""
+    (out_dir / "index.html").write_text(page, encoding="utf-8")
+
+
+def main() -> int:
+    """Parse CLI flags, generate the images, and write prompts.json and index.html."""
+    ap = argparse.ArgumentParser(description="Generate images via OpenAI Images API.")
+    ap.add_argument("--prompt", help="Single prompt. If omitted, random prompts are generated.")
+    ap.add_argument("--count", type=int, default=8, help="How many images to generate.")
+    ap.add_argument("--model", default="gpt-image-1", help="Image model id.")
+    ap.add_argument("--size", default="1024x1024", help="Image size (e.g. 1024x1024, 1536x1024).")
+    ap.add_argument("--quality", default="high", help="Image quality (varies by model).")
+    ap.add_argument("--out-dir", default="", help="Output directory (default: ./tmp/openai-image-gen-<timestamp>).")
+    args = ap.parse_args()
+    if args.count < 1:
+        ap.error("--count must be at least 1")
+
+    api_key = (os.environ.get("OPENAI_API_KEY") or "").strip()
+    if not api_key:
+        print("Missing OPENAI_API_KEY", file=sys.stderr)
+        return 2
+
+    out_dir = Path(args.out_dir).expanduser() if args.out_dir else default_out_dir()
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    prompts = [args.prompt] * args.count if args.prompt else pick_prompts(args.count)
+
+    items: list[dict] = []
+    for idx, prompt in enumerate(prompts, start=1):
+        print(f"[{idx}/{len(prompts)}] {prompt}")
+        res = request_images(api_key, prompt, args.model, args.size, args.quality)
+        # `data` may be missing, null, or an empty list on an unexpected response.
+        data = res.get("data") or [{}]
+        b64 = data[0].get("b64_json")
+        if not b64:
+            raise RuntimeError(f"Unexpected response: {json.dumps(res)[:400]}")
+        png = base64.b64decode(b64)
+        filename = f"{idx:03d}-{slugify(prompt)[:40]}.png"
+        (out_dir / filename).write_bytes(png)
+        items.append({"prompt": prompt, "file": filename})
+
+    (out_dir / "prompts.json").write_text(json.dumps(items, indent=2), encoding="utf-8")
+    write_gallery(out_dir, items)
+    print(f"\nWrote: {(out_dir / 'index.html').as_posix()}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/skills/openai-whisper-api/SKILL.md b/skills/openai-whisper-api/SKILL.md
new file mode 100644
index 0000000000..72a7766d0c
--- /dev/null
+++ b/skills/openai-whisper-api/SKILL.md
@@ -0,0 +1,42 @@
+---
+name: openai-whisper-api
+description: Transcribe audio via OpenAI Audio Transcriptions API (Whisper).
+metadata: {"clawdis":{"requires":{"bins":["curl"],"env":["OPENAI_API_KEY"]},"primaryEnv":"OPENAI_API_KEY"}}
+---
+
+# OpenAI Whisper API (curl)
+
+Transcribe an audio file via OpenAI’s `/v1/audio/transcriptions` endpoint.
+
+## Quick start
+
+```bash
+{baseDir}/scripts/transcribe.sh /path/to/audio.m4a
+```
+
+Defaults:
+- Model: `whisper-1`
+- Output: the input path with its extension replaced by `.txt` (`.json` with `--json`)
+
+## Useful flags
+
+```bash
+{baseDir}/scripts/transcribe.sh /path/to/audio.ogg --model whisper-1 --out /tmp/transcript.txt
+{baseDir}/scripts/transcribe.sh /path/to/audio.m4a --language en
+{baseDir}/scripts/transcribe.sh /path/to/audio.m4a --prompt "Speaker names: Peter, Daniel"
+{baseDir}/scripts/transcribe.sh /path/to/audio.m4a --json --out /tmp/transcript.json
+```
+
+## API key
+
+Set `OPENAI_API_KEY`, or configure it in `~/.clawdis/clawdis.json`:
+
+```json5
+{
+  skills: {
+    "openai-whisper-api": {
+      apiKey: "OPENAI_KEY_HERE"
+    }
+  }
+}
+```
diff --git a/skills/openai-whisper-api/scripts/transcribe.sh b/skills/openai-whisper-api/scripts/transcribe.sh
new file mode 100755
index 0000000000..551c7b473e
--- /dev/null
+++ b/skills/openai-whisper-api/scripts/transcribe.sh
@@ -0,0 +1,122 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+usage() {
+  cat >&2 <<'EOF'
+Usage:
+  transcribe.sh <audio-file> [--model whisper-1] [--out /path/to/out.txt] [--language en] [--prompt "hint"] [--json]
+EOF
+  exit 2
+}
+
+# Abort with usage when an option is missing its value. Without this check a
+# bare `shift 2` fails under `set -e` and the script exits with no message.
+need_value() {
+  if [[ $# -lt 2 ]]; then
+    echo "Missing value for $1" >&2
+    usage
+  fi
+}
+
+if [[ "${1:-}" == "" || "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then
+  usage
+fi
+
+in="${1:-}"
+shift || true
+
+model="whisper-1"
+out=""
+language=""
+prompt=""
+response_format="text"
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --model)
+      need_value "$@"
+      model="$2"
+      shift 2
+      ;;
+    --out)
+      need_value "$@"
+      out="$2"
+      shift 2
+      ;;
+    --language)
+      need_value "$@"
+      language="$2"
+      shift 2
+      ;;
+    --prompt)
+      need_value "$@"
+      prompt="$2"
+      shift 2
+      ;;
+    --json)
+      response_format="json"
+      shift 1
+      ;;
+    *)
+      echo "Unknown arg: $1" >&2
+      usage
+      ;;
+  esac
+done
+
+if [[ ! -f "$in" ]]; then
+  echo "File not found: $in" >&2
+  exit 1
+fi
+
+if [[ "${OPENAI_API_KEY:-}" == "" ]]; then
+  echo "Missing OPENAI_API_KEY" >&2
+  exit 1
+fi
+
+if [[ "$out" == "" ]]; then
+  base="${in%.*}"
+  if [[ "$response_format" == "json" ]]; then
+    out="${base}.json"
+  else
+    out="${base}.txt"
+  fi
+fi
+
+mkdir -p "$(dirname "$out")"
+
+# --form-string sends values literally; with -F a value that starts with
+# '@' or '<' (e.g. a prompt) would be read from a file instead.
+form=(
+  -F "file=@${in}"
+  --form-string "model=${model}"
+  --form-string "response_format=${response_format}"
+)
+if [[ "$language" != "" ]]; then
+  form+=(--form-string "language=${language}")
+fi
+if [[ "$prompt" != "" ]]; then
+  form+=(--form-string "prompt=${prompt}")
+fi
+
+# Capture the HTTP status so an API error body is not saved as a transcript.
+status="$(
+  curl -sS https://api.openai.com/v1/audio/transcriptions \
+    -H "Authorization: Bearer $OPENAI_API_KEY" \
+    -H "Accept: application/json" \
+    "${form[@]}" \
+    -o "$out" \
+    -w '%{http_code}'
+)"
+
+if [[ "$status" != 2* ]]; then
+  echo "Transcription failed (HTTP ${status}):" >&2
+  if [[ -s "$out" ]]; then
+    cat "$out" >&2
+    echo >&2
+  fi
+  rm -f "$out"
+  exit 1
+fi
+
+echo "$out"
diff --git a/skills/video-frames/SKILL.md b/skills/video-frames/SKILL.md
new file mode 100644
index 0000000000..40ca891134
--- /dev/null
+++ b/skills/video-frames/SKILL.md
@@ -0,0 +1,28 @@
+---
+name: video-frames
+description: Extract frames or short clips from videos using ffmpeg.
+metadata: {"clawdis":{"requires":{"bins":["ffmpeg"]},"install":[{"id":"brew","kind":"brew","formula":"ffmpeg","bins":["ffmpeg"],"label":"Install ffmpeg (brew)"}]}}
+---
+
+# Video Frames (ffmpeg)
+
+Extract a single frame from a video, or create quick thumbnails for inspection.
+
+## Quick start
+
+First frame:
+
+```bash
+{baseDir}/scripts/frame.sh /path/to/video.mp4 --out /tmp/frame.jpg
+```
+
+At a timestamp:
+
+```bash
+{baseDir}/scripts/frame.sh /path/to/video.mp4 --time 00:00:10 --out /tmp/frame-10s.jpg
+```
+
+## Notes
+
+- Prefer `--time` for “what is happening around here?”.
+- Use a `.jpg` for quick share; use `.png` for crisp UI frames.
diff --git a/skills/video-frames/scripts/frame.sh b/skills/video-frames/scripts/frame.sh
new file mode 100755
index 0000000000..31b3adb34c
--- /dev/null
+++ b/skills/video-frames/scripts/frame.sh
@@ -0,0 +1,104 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+usage() {
+  cat >&2 <<'EOF'
+Usage:
+  frame.sh <video-file> [--time HH:MM:SS] [--index N] --out /path/to/frame.jpg
+
+With neither --time nor --index, the first frame is extracted.
+If both are given, --index is used.
+
+Examples:
+  frame.sh video.mp4 --out /tmp/frame.jpg
+  frame.sh video.mp4 --time 00:00:10 --out /tmp/frame-10s.jpg
+  frame.sh video.mp4 --index 0 --out /tmp/frame0.png
+EOF
+  exit 2
+}
+
+# Abort with usage when an option is missing its value. Without this check a
+# bare `shift 2` fails under `set -e` and the script exits with no message.
+need_value() {
+  if [[ $# -lt 2 ]]; then
+    echo "Missing value for $1" >&2
+    usage
+  fi
+}
+
+if [[ "${1:-}" == "" || "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then
+  usage
+fi
+
+in="${1:-}"
+shift || true
+
+time=""
+index=""
+out=""
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --time)
+      need_value "$@"
+      time="$2"
+      shift 2
+      ;;
+    --index)
+      need_value "$@"
+      index="$2"
+      shift 2
+      ;;
+    --out)
+      need_value "$@"
+      out="$2"
+      shift 2
+      ;;
+    *)
+      echo "Unknown arg: $1" >&2
+      usage
+      ;;
+  esac
+done
+
+if [[ ! -f "$in" ]]; then
+  echo "File not found: $in" >&2
+  exit 1
+fi
+
+if [[ "$out" == "" ]]; then
+  echo "Missing --out" >&2
+  usage
+fi
+
+# The index is spliced into an ffmpeg filter expression, so accept digits only.
+if [[ "$index" != "" && ! "$index" =~ ^[0-9]+$ ]]; then
+  echo "Invalid --index (expected a non-negative integer): $index" >&2
+  exit 1
+fi
+
+mkdir -p "$(dirname "$out")"
+
+if [[ "$index" == "" && "$time" != "" ]]; then
+  ffmpeg -hide_banner -loglevel error -y \
+    -ss "$time" \
+    -i "$in" \
+    -frames:v 1 \
+    "$out"
+else
+  # Select by frame number; with no --index this is frame 0.
+  ffmpeg -hide_banner -loglevel error -y \
+    -i "$in" \
+    -vf "select=eq(n\\,${index:-0})" \
+    -frames:v 1 \
+    "$out"
+fi
+
+# ffmpeg may exit 0 without writing a file when no frame matched, e.g. an
+# index past the end of the video.
+if [[ ! -s "$out" ]]; then
+  echo "No frame written: $out" >&2
+  exit 1
+fi
+
+echo "$out"