infra: use flock gateway lock

This commit is contained in:
Peter Steinberger
2025-12-10 00:46:50 +00:00
parent b1834b7cf8
commit 426503e062
5 changed files with 75 additions and 65 deletions

2
.npmrc
View File

@@ -1 +1 @@
allow-build-scripts=@whiskeysockets/baileys,sharp,esbuild,protobufjs
allow-build-scripts=@whiskeysockets/baileys,sharp,esbuild,protobufjs,fs-ext

28
docs/gateway-lock.md Normal file
View File

@@ -0,0 +1,28 @@
---
summary: "Gateway lock strategy using a flock(2) advisory lock and a PID file"
read_when:
- Running or debugging the gateway process
- Investigating single-instance enforcement
---
# Gateway lock
Last updated: 2025-12-10
## Why
- Ensure only one gateway instance runs per host.
- Survive crashes/SIGKILL without leaving a blocking stale lock.
- Keep the PID visible for observability and manual debugging.
## Mechanism
- Uses a single lock file (default `${os.tmpdir()}/clawdis-gateway.lock`, e.g. `/var/folders/.../clawdis-gateway.lock` on macOS) opened once per process.
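- An exclusive, non-blocking `flock(2)` advisory lock is taken on the file descriptor (`flock` is a BSD-style lock available on Linux and macOS; it is not part of POSIX proper). The kernel releases the lock automatically when the process exits for any reason, including crashes and SIGKILL.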
- The PID is written into the same file after locking; the lock (not file existence) is the source of truth.
- On graceful shutdown, we best-effort unlock, close, and unlink the file so that no stray lock file is left behind, but correctness does not rely on this cleanup.
## Error surface
- If another instance holds the lock, startup throws `GatewayLockError("another gateway instance is already running")`.
- Unexpected `flock` failures surface as `GatewayLockError("failed to acquire gateway lock: …")`.
## Operational notes
- The lock file may remain on disk after abnormal exits; this is expected and harmless because the kernel lock is gone.
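- To inspect the current holder, `cat` the lock file to see the last PID written. The default path is `${os.tmpdir()}/clawdis-gateway.lock`, which is typically `/tmp/clawdis-gateway.lock` on Linux but a per-user `/var/folders/.../clawdis-gateway.lock` path on macOS. Do not delete the file while a gateway is running: the running process keeps its lock on the now-unlinked file, but a newly started instance would create and lock a fresh file at the same path, so two gateways could end up running at once.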

View File

@@ -45,6 +45,7 @@
"detect-libc": "^2.1.2",
"dotenv": "^17.2.3",
"express": "^5.2.1",
"fs-ext": "^2.1.1",
"grammy": "^1.38.4",
"json5": "^2.2.3",
"qrcode-terminal": "^0.12.0",
@@ -59,6 +60,7 @@
"@mariozechner/mini-lit": "0.2.1",
"@types/body-parser": "^1.19.6",
"@types/express": "^5.0.6",
"@types/fs-ext": "^2.0.3",
"@types/node": "^24.10.2",
"@types/qrcode-terminal": "^0.12.2",
"@types/ws": "^8.18.1",

View File

@@ -667,7 +667,9 @@ export async function startGatewayServer(port = 18789): Promise<GatewayServer> {
if (cached && now - cached.ts < HEALTH_REFRESH_INTERVAL_MS) {
respond(true, cached, undefined, { cached: true });
void refreshHealthSnapshot({ probe: true }).catch((err) =>
logError(`background health refresh failed: ${formatError(err)}`),
logError(
`background health refresh failed: ${formatError(err)}`,
),
);
break;
}

View File

@@ -1,100 +1,78 @@
import fs from "node:fs";
import net from "node:net";
import os from "node:os";
import path from "node:path";
import { flockSync } from "fs-ext";
const DEFAULT_LOCK_PATH = path.join(os.tmpdir(), "clawdis-gateway.lock");
export class GatewayLockError extends Error {}
type ReleaseFn = () => Promise<void>;
const SIGNALS: NodeJS.Signals[] = ["SIGINT", "SIGTERM", "SIGHUP"];
/**
* Acquire an exclusive single-instance lock for the gateway using a Unix domain socket.
* Acquire an exclusive gateway lock using POSIX flock and write the PID into the same file.
*
* Why a socket? If the process crashes or is SIGKILLed, the socket file remains but
* the next start will detect ECONNREFUSED when connecting and clean the stale path
* before retrying. This keeps the lock self-healing without manual pidfile cleanup.
* Kernel locks are released automatically when the process exits or is SIGKILLed, so the
* lock cannot become stale. A best-effort unlink on shutdown keeps the path clean, but
* correctness relies solely on the kernel lock.
*/
export async function acquireGatewayLock(
lockPath = DEFAULT_LOCK_PATH,
): Promise<ReleaseFn> {
// Fast path: try to listen on the lock path.
const attemptListen = (): Promise<net.Server> =>
new Promise((resolve, reject) => {
const server = net.createServer();
fs.mkdirSync(path.dirname(lockPath), { recursive: true });
server.once("error", async (err: NodeJS.ErrnoException) => {
if (err.code !== "EADDRINUSE") {
reject(new GatewayLockError(`lock listen failed: ${err.message}`));
return;
}
const fd = fs.openSync(lockPath, "w+");
try {
flockSync(fd, "exnb");
} catch (err) {
fs.closeSync(fd);
const code = (err as NodeJS.ErrnoException).code;
if (code === "EWOULDBLOCK" || code === "EAGAIN") {
throw new GatewayLockError("another gateway instance is already running");
}
throw new GatewayLockError(
`failed to acquire gateway lock: ${(err as Error).message}`,
);
}
// Something is already bound. Try to connect to see if it is alive.
const client = net.connect({ path: lockPath });
client.once("connect", () => {
client.destroy();
reject(
new GatewayLockError("another gateway instance is already running"),
);
});
client.once("error", (connErr: NodeJS.ErrnoException) => {
// Nothing is listening -> stale socket file. Remove and retry once.
if (connErr.code === "ECONNREFUSED" || connErr.code === "ENOENT") {
try {
fs.rmSync(lockPath, { force: true });
} catch (rmErr) {
reject(
new GatewayLockError(
`failed to clean stale lock at ${lockPath}: ${String(rmErr)}`,
),
);
return;
}
attemptListen().then(resolve, reject);
return;
}
reject(
new GatewayLockError(
`failed to connect to existing lock (${lockPath}): ${connErr.message}`,
),
);
});
});
server.listen(lockPath, () => resolve(server));
});
const server = await attemptListen();
fs.ftruncateSync(fd, 0);
fs.writeSync(fd, `${process.pid}\n`, 0, "utf8");
fs.fsyncSync(fd);
let released = false;
const release = async (): Promise<void> => {
if (released) return;
released = true;
await new Promise<void>((resolve) => server.close(() => resolve()));
try {
flockSync(fd, "un");
} catch {
/* ignore unlock errors */
}
try {
fs.closeSync(fd);
} catch {
/* ignore close errors */
}
try {
fs.rmSync(lockPath, { force: true });
} catch {
/* ignore */
/* ignore unlink errors */
}
};
const cleanupSignals: NodeJS.Signals[] = ["SIGINT", "SIGTERM", "SIGHUP"];
const handleSignal = async () => {
await release();
const handleSignal = () => {
void release();
process.exit(0);
};
for (const sig of cleanupSignals) {
process.once(sig, () => {
void handleSignal();
});
for (const sig of SIGNALS) {
process.once(sig, handleSignal);
}
process.once("exit", () => {
// Exit handler must be sync-safe; release is async but close+rm are fast.
void release();
});