From 426503e062346345df029b79224614e4f06fd37a Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Wed, 10 Dec 2025 00:46:50 +0000 Subject: [PATCH] infra: use flock gateway lock --- .npmrc | 2 +- docs/gateway-lock.md | 28 ++++++++++ package.json | 2 + src/gateway/server.ts | 4 +- src/infra/gateway-lock.ts | 104 +++++++++++++++----------------------- 5 files changed, 75 insertions(+), 65 deletions(-) create mode 100644 docs/gateway-lock.md diff --git a/.npmrc b/.npmrc index b9915fc663..0be5f78940 100644 --- a/.npmrc +++ b/.npmrc @@ -1 +1 @@ -allow-build-scripts=@whiskeysockets/baileys,sharp,esbuild,protobufjs +allow-build-scripts=@whiskeysockets/baileys,sharp,esbuild,protobufjs,fs-ext diff --git a/docs/gateway-lock.md b/docs/gateway-lock.md new file mode 100644 index 0000000000..97ee6d061e --- /dev/null +++ b/docs/gateway-lock.md @@ -0,0 +1,28 @@ +--- +summary: "Gateway lock strategy using POSIX flock and PID file" +read_when: + - Running or debugging the gateway process + - Investigating single-instance enforcement +--- +# Gateway lock + +Last updated: 2025-12-10 + +## Why +- Ensure only one gateway instance runs per host. +- Survive crashes/SIGKILL without leaving a blocking stale lock. +- Keep the PID visible for observability and manual debugging. + +## Mechanism +- Uses a single lock file (default `${os.tmpdir()}/clawdis-gateway.lock`, e.g. `/var/folders/.../clawdis-gateway.lock` on macOS) opened once per process. +- An exclusive, non-blocking POSIX `flock` is taken on the file descriptor. The kernel releases the lock automatically on any process exit, including crashes and SIGKILL. +- The PID is written into the same file after locking; the lock (not file existence) is the source of truth. +- On graceful shutdown, we best-effort unlock, close, and unlink the file to reduce crumbs, but correctness does not rely on cleanup. 
+ +## Error surface +- If another instance holds the lock, startup throws `GatewayLockError("another gateway instance is already running")`. +- Unexpected `flock` failures surface as `GatewayLockError("failed to acquire gateway lock: …")`. + +## Operational notes +- The lock file may remain on disk after abnormal exits; this is expected and harmless because the kernel lock is gone. +- To inspect, `cat` the lock file (default `${os.tmpdir()}/clawdis-gateway.lock`, i.e. `/tmp/clawdis-gateway.lock` on most Linux hosts) to see the last PID. Do not delete the file while a gateway is running: the running process keeps its lock on the now-unlinked file, but a new instance would create and lock a fresh file at the same path, so two gateways could run at once. diff --git a/package.json b/package.json index 79eceec808..4e41b1d0e5 100644 --- a/package.json +++ b/package.json @@ -45,6 +45,7 @@ "detect-libc": "^2.1.2", "dotenv": "^17.2.3", "express": "^5.2.1", + "fs-ext": "^2.1.1", "grammy": "^1.38.4", "json5": "^2.2.3", "qrcode-terminal": "^0.12.0", @@ -59,6 +60,7 @@ "@mariozechner/mini-lit": "0.2.1", "@types/body-parser": "^1.19.6", "@types/express": "^5.0.6", + "@types/fs-ext": "^2.0.3", "@types/node": "^24.10.2", "@types/qrcode-terminal": "^0.12.2", "@types/ws": "^8.18.1", diff --git a/src/gateway/server.ts b/src/gateway/server.ts index 052add7b1c..b1938b09a7 100644 --- a/src/gateway/server.ts +++ b/src/gateway/server.ts @@ -667,7 +667,9 @@ export async function startGatewayServer(port = 18789): Promise { if (cached && now - cached.ts < HEALTH_REFRESH_INTERVAL_MS) { respond(true, cached, undefined, { cached: true }); void refreshHealthSnapshot({ probe: true }).catch((err) => - logError(`background health refresh failed: ${formatError(err)}`), + logError( + `background health refresh failed: ${formatError(err)}`, + ), ); break; } diff --git a/src/infra/gateway-lock.ts b/src/infra/gateway-lock.ts index 90cdb892ee..5d818793f8 100644 --- a/src/infra/gateway-lock.ts +++ b/src/infra/gateway-lock.ts @@ -1,100 +1,78 @@ import fs from "node:fs"; -import net from "node:net"; import os from "node:os"; import path from "node:path"; +import { flockSync } from
"fs-ext"; + const DEFAULT_LOCK_PATH = path.join(os.tmpdir(), "clawdis-gateway.lock"); export class GatewayLockError extends Error {} type ReleaseFn = () => Promise; +const SIGNALS: NodeJS.Signals[] = ["SIGINT", "SIGTERM", "SIGHUP"]; + /** - * Acquire an exclusive single-instance lock for the gateway using a Unix domain socket. + * Acquire an exclusive gateway lock using POSIX flock and write the PID into the same file. * - * Why a socket? If the process crashes or is SIGKILLed, the socket file remains but - * the next start will detect ECONNREFUSED when connecting and clean the stale path - * before retrying. This keeps the lock self-healing without manual pidfile cleanup. + * Kernel locks are released automatically when the process exits or is SIGKILLed, so the + * lock cannot become stale. A best-effort unlink on shutdown keeps the path clean, but + * correctness relies solely on the kernel lock. */ export async function acquireGatewayLock( lockPath = DEFAULT_LOCK_PATH, ): Promise { - // Fast path: try to listen on the lock path. - const attemptListen = (): Promise => - new Promise((resolve, reject) => { - const server = net.createServer(); + fs.mkdirSync(path.dirname(lockPath), { recursive: true }); - server.once("error", async (err: NodeJS.ErrnoException) => { - if (err.code !== "EADDRINUSE") { - reject(new GatewayLockError(`lock listen failed: ${err.message}`)); - return; - } + const fd = fs.openSync(lockPath, "w+"); + try { + flockSync(fd, "exnb"); + } catch (err) { + fs.closeSync(fd); + const code = (err as NodeJS.ErrnoException).code; + if (code === "EWOULDBLOCK" || code === "EAGAIN") { + throw new GatewayLockError("another gateway instance is already running"); + } + throw new GatewayLockError( + `failed to acquire gateway lock: ${(err as Error).message}`, + ); + } - // Something is already bound. Try to connect to see if it is alive. 
- const client = net.connect({ path: lockPath }); - - client.once("connect", () => { - client.destroy(); - reject( - new GatewayLockError("another gateway instance is already running"), - ); - }); - - client.once("error", (connErr: NodeJS.ErrnoException) => { - // Nothing is listening -> stale socket file. Remove and retry once. - if (connErr.code === "ECONNREFUSED" || connErr.code === "ENOENT") { - try { - fs.rmSync(lockPath, { force: true }); - } catch (rmErr) { - reject( - new GatewayLockError( - `failed to clean stale lock at ${lockPath}: ${String(rmErr)}`, - ), - ); - return; - } - attemptListen().then(resolve, reject); - return; - } - - reject( - new GatewayLockError( - `failed to connect to existing lock (${lockPath}): ${connErr.message}`, - ), - ); - }); - }); - - server.listen(lockPath, () => resolve(server)); - }); - - const server = await attemptListen(); + fs.ftruncateSync(fd, 0); + fs.writeSync(fd, `${process.pid}\n`, 0, "utf8"); + fs.fsyncSync(fd); let released = false; const release = async (): Promise => { if (released) return; released = true; - await new Promise((resolve) => server.close(() => resolve())); + try { + flockSync(fd, "un"); + } catch { + /* ignore unlock errors */ + } + try { + fs.closeSync(fd); + } catch { + /* ignore close errors */ + } try { fs.rmSync(lockPath, { force: true }); } catch { - /* ignore */ + /* ignore unlink errors */ } }; - const cleanupSignals: NodeJS.Signals[] = ["SIGINT", "SIGTERM", "SIGHUP"]; - const handleSignal = async () => { - await release(); + const handleSignal = () => { + void release(); process.exit(0); }; - for (const sig of cleanupSignals) { - process.once(sig, () => { - void handleSignal(); - }); + for (const sig of SIGNALS) { + process.once(sig, handleSignal); } + process.once("exit", () => { - // Exit handler must be sync-safe; release is async but close+rm are fast. void release(); });