Compare commits

...

7 Commits

Author SHA1 Message Date
Nimrod Gutman
9f52a62e70 fix(gateway): harden tailscale cleanup recovery 2026-03-16 14:33:52 +02:00
Nimrod Gutman
02069176bc fix(gateway): soften tailscale cleanup finalization 2026-03-16 12:16:18 +02:00
Nimrod Gutman
97b4f72d6b fix(gateway): serialize tailscale cleanup handoff 2026-03-16 12:03:01 +02:00
Nimrod Gutman
d6f949b648 fix(gateway): harden tailscale owner fallback 2026-03-16 11:42:21 +02:00
Nimrod Gutman
f0a773a716 fix(gateway): harden tailscale lock recovery 2026-03-16 11:23:33 +02:00
Nimrod Gutman
dd570f9b7a fix(gateway): harden tailscale ownership rollback 2026-03-16 11:08:55 +02:00
Nimrod Gutman
c47caaa198 fix(gateway): guard tailscale serve cleanup ownership 2026-03-16 10:59:35 +02:00
2 changed files with 685 additions and 5 deletions

View File

@ -0,0 +1,359 @@
import { beforeEach, describe, expect, it, vi } from "vitest";
// Shared mock functions that stand in for the Tailscale helpers used by the
// module under test. They are built inside vi.hoisted so the vi.mock factory
// for "../infra/tailscale.js" can reference them, and so each test can inspect
// or override them (for example with mockRejectedValueOnce).
const tailscaleState = vi.hoisted(() => ({
enableServe: vi.fn(async (_port: number) => {}),
disableServe: vi.fn(async () => {}),
enableFunnel: vi.fn(async (_port: number) => {}),
disableFunnel: vi.fn(async () => {}),
getHost: vi.fn(async () => "gateway.tailnet.ts.net"),
}));
// Replace the Tailscale infra module with thin wrappers. Each wrapper looks up
// its mock on tailscaleState when it is called, so implementations queued on
// those mocks by a test are the ones that actually run.
vi.mock("../infra/tailscale.js", () => ({
enableTailscaleServe: (port: number) => tailscaleState.enableServe(port),
disableTailscaleServe: () => tailscaleState.disableServe(),
enableTailscaleFunnel: (port: number) => tailscaleState.enableFunnel(port),
disableTailscaleFunnel: () => tailscaleState.disableFunnel(),
getTailnetHostname: () => tailscaleState.getHost(),
}));
import { startGatewayTailscaleExposure } from "./server-tailscale.js";
/**
 * In-memory owner store used by the tests in place of the file-backed one.
 *
 * Every claim gets a fresh pid (starting at `process.pid`) and a token derived
 * from it. Liveness is tracked with an explicit `alive` flag instead of probing
 * the operating system; `markCurrentOwnerDead` flips that flag so a test can
 * simulate an owner that went away without cleaning up.
 */
function createOwnerStore() {
  type FakeOwner = {
    token: string;
    mode: "serve" | "funnel";
    port: number;
    pid: number;
    claimedAt: string;
    phase: "active" | "cleaning";
    cleanupStartedAt?: string;
    alive: boolean;
  };

  // All timestamps in the fake are pinned to the Unix epoch.
  const epoch = new Date(0).toISOString();
  let holder: FakeOwner | null = null;
  let lastPid = process.pid - 1;

  /** Unconditionally takes ownership and reports who held it before. */
  async function claim(mode: "serve" | "funnel", port: number) {
    const previousOwner = holder;
    lastPid += 1;
    const owner = {
      token: `owner-${lastPid}`,
      mode,
      port,
      pid: lastPid,
      claimedAt: epoch,
      phase: "active" as const,
      alive: true,
    };
    holder = owner;
    return { owner, previousOwner };
  }

  /** Swaps the record only when `token` matches the current holder. */
  async function replaceIfCurrent(token: string, nextOwner: FakeOwner | null) {
    const isCurrent = holder?.token === token;
    if (isCurrent) {
      holder = nextOwner;
    }
    return isCurrent;
  }

  /**
   * Runs `cleanup` when the caller is the holder, or when the holder has been
   * marked dead. The record is cleared only after `cleanup` resolves.
   */
  async function runCleanupIfCurrentOwner(token: string, cleanup: () => Promise<void>) {
    if (holder === null) {
      return false;
    }
    const heldByLiveStranger = holder.alive && holder.token !== token;
    if (heldByLiveStranger) {
      return false;
    }
    holder = { ...holder, phase: "cleaning", cleanupStartedAt: epoch };
    await cleanup();
    holder = null;
    return true;
  }

  /** Flags the current holder (if any) as no longer alive. */
  function markCurrentOwnerDead() {
    holder = holder && { ...holder, alive: false };
  }

  return { claim, replaceIfCurrent, runCleanupIfCurrentOwner, markCurrentOwnerDead };
}
// One entry per exposure mode, pairing the mode name with the enable/disable
// mocks that mode is expected to drive. Order is serve first, then funnel.
const modeCases = (["serve", "funnel"] as const).map((mode) => {
  const isServe = mode === "serve";
  return {
    mode,
    enableMock: isServe ? tailscaleState.enableServe : tailscaleState.enableFunnel,
    disableMock: isServe ? tailscaleState.disableServe : tailscaleState.disableFunnel,
  };
});
// Runs the same ownership scenarios once for "serve" and once for "funnel".
describe.each(modeCases)(
  "startGatewayTailscaleExposure ($mode)",
  ({ mode, enableMock, disableMock }) => {
    type ExposureOptions = Parameters<typeof startGatewayTailscaleExposure>[0];

    // Message logged when a cleanup hook finds it no longer owns the exposure.
    const skippedMessage = `${mode} cleanup skipped: not the current owner`;

    /** Fresh logger spies so each scenario only sees its own messages. */
    const createLog = () => ({ info: vi.fn(), warn: vi.fn() });

    /** Starts exposure with the options every scenario shares. */
    const expose = (
      ownerStore: ExposureOptions["ownerStore"],
      logTailscale: ExposureOptions["logTailscale"],
    ) =>
      startGatewayTailscaleExposure({
        tailscaleMode: mode,
        resetOnExit: true,
        port: 18789,
        logTailscale,
        ownerStore,
      });

    beforeEach(() => {
      vi.restoreAllMocks();
      vi.clearAllMocks();
    });

    it("skips stale cleanup after a newer gateway takes ownership", async () => {
      const store = createOwnerStore();
      const log = createLog();
      const cleanupA = await expose(store, log);
      const cleanupB = await expose(store, log);

      // A was superseded by B, so A must not reset Tailscale.
      await cleanupA?.();
      expect(disableMock).not.toHaveBeenCalled();
      expect(log.info).toHaveBeenCalledWith(skippedMessage);

      await cleanupB?.();
      expect(disableMock).toHaveBeenCalledTimes(1);
    });

    it("restores the previous live owner after a takeover startup failure", async () => {
      const store = createOwnerStore();
      const log = createLog();
      const cleanupA = await expose(store, log);

      enableMock.mockRejectedValueOnce(new Error("boom"));
      const cleanupB = await expose(store, log);
      expect(cleanupB).not.toBeNull();
      expect(log.warn).toHaveBeenCalledWith(`${mode} failed: boom`);

      // B failed to start, so ownership went back to A.
      await cleanupB?.();
      expect(disableMock).not.toHaveBeenCalled();
      expect(log.info).toHaveBeenCalledWith(skippedMessage);

      await cleanupA?.();
      expect(disableMock).toHaveBeenCalledTimes(1);
    });

    it("keeps the failed owner when the previous owner is already gone", async () => {
      const store = createOwnerStore();
      const log = createLog();
      const cleanupA = await expose(store, log);

      // Make the first owner's pid (process.pid in the fake store) look dead.
      vi.spyOn(process, "kill").mockImplementation(((pid: number) => {
        if (pid !== process.pid) {
          return true;
        }
        const gone = new Error("gone") as NodeJS.ErrnoException;
        gone.code = "ESRCH";
        throw gone;
      }) as typeof process.kill);

      enableMock.mockRejectedValueOnce(new Error("boom"));
      const cleanupB = await expose(store, log);
      expect(cleanupB).not.toBeNull();
      expect(log.warn).toHaveBeenCalledWith(`${mode} failed: boom`);

      await cleanupA?.();
      expect(disableMock).not.toHaveBeenCalled();
      expect(log.info).toHaveBeenCalledWith(skippedMessage);

      await cleanupB?.();
      expect(disableMock).toHaveBeenCalledTimes(1);
    });

    it("reclaims cleanup from a dead newer owner before exit", async () => {
      const store = createOwnerStore();
      const log = createLog();
      const cleanupA = await expose(store, log);
      await expose(store, log);

      store.markCurrentOwnerDead();
      await cleanupA?.();

      expect(disableMock).toHaveBeenCalledTimes(1);
      expect(log.info).not.toHaveBeenCalledWith(skippedMessage);
    });

    it("falls back to unguarded cleanup when the ownership guard cannot claim", async () => {
      const log = createLog();
      const cleanup = await expose(
        {
          async claim() {
            throw new Error("lock dir unavailable");
          },
          async replaceIfCurrent() {
            return false;
          },
          async runCleanupIfCurrentOwner() {
            return false;
          },
        },
        log,
      );

      expect(cleanup).not.toBeNull();
      expect(enableMock).toHaveBeenCalledTimes(1);
      expect(log.warn).toHaveBeenCalledWith(
        `${mode} ownership guard unavailable: lock dir unavailable`,
      );

      await cleanup?.();
      expect(disableMock).toHaveBeenCalledTimes(1);
    });

    it("skips unguarded fallback while a previous cleanup is still in progress", async () => {
      const log = createLog();
      const cleanup = await expose(
        {
          async claim() {
            const busy = new Error("busy");
            busy.name = "TailscaleExposureCleanupInProgressError";
            throw busy;
          },
          async replaceIfCurrent() {
            return false;
          },
          async runCleanupIfCurrentOwner() {
            return false;
          },
        },
        log,
      );

      expect(cleanup).toBeNull();
      expect(enableMock).not.toHaveBeenCalled();
      expect(disableMock).not.toHaveBeenCalled();
      expect(log.warn).toHaveBeenCalledWith(
        `${mode} ownership cleanup still in progress; skipping external exposure`,
      );
    });

    it("falls back to a direct reset when guarded cleanup bookkeeping fails", async () => {
      const log = createLog();
      const cleanup = await expose(
        {
          async claim() {
            return {
              owner: {
                token: "owner-1",
                mode,
                port: 18789,
                pid: process.pid,
                claimedAt: new Date(0).toISOString(),
                phase: "active" as const,
              },
              previousOwner: null,
            };
          },
          async replaceIfCurrent() {
            return true;
          },
          async runCleanupIfCurrentOwner() {
            throw new Error("lock dir unavailable");
          },
        },
        log,
      );

      await cleanup?.();

      expect(disableMock).toHaveBeenCalledTimes(1);
      expect(log.warn).toHaveBeenCalledWith(`${mode} cleanup failed: lock dir unavailable`);
      expect(log.warn).toHaveBeenCalledWith(
        `${mode} cleanup guard failed; applied direct reset fallback`,
      );
    });
  },
);

View File

@ -1,3 +1,7 @@
import { randomUUID } from "node:crypto";
import fs from "node:fs/promises";
import path from "node:path";
import { resolveGatewayLockDir } from "../config/paths.js";
import {
disableTailscaleFunnel,
disableTailscaleServe,
@ -6,17 +10,293 @@ import {
getTailnetHostname,
} from "../infra/tailscale.js";
/**
 * How the gateway should be exposed through Tailscale. With "off",
 * startGatewayTailscaleExposure returns null without touching Tailscale.
 */
type GatewayTailscaleMode = "off" | "serve" | "funnel";
/**
 * Record describing which gateway process currently owns the Tailscale
 * exposure. The file-backed owner store serializes it as JSON.
 */
type TailscaleExposureOwnerRecord = {
// Identifier of the claim; stores compare it to decide whether a caller is
// still the current owner.
token: string;
// Exposure mode that was claimed ("off" is never recorded).
mode: Exclude<GatewayTailscaleMode, "off">;
// Port that was passed to the claim.
port: number;
// PID associated with the record; its liveness decides whether another
// process may take the record over.
pid: number;
// When the claim was made (the file-backed store writes an ISO-8601 string).
claimedAt: string;
// "cleaning" marks a reset in progress; "active" is the normal state.
phase: "active" | "cleaning";
// When the record entered the "cleaning" phase, if it has.
cleanupStartedAt?: string;
};
/**
 * Storage contract for exposure ownership. startGatewayTailscaleExposure
 * accepts an implementation through its ownerStore option and otherwise uses
 * the file-backed store.
 */
type TailscaleExposureOwnerStore = {
// Takes ownership for the given mode and port. Resolves with the new record
// and whichever record it replaced (null when there was none). The
// file-backed store rejects with a TailscaleExposureCleanupInProgressError
// when a live process is still cleaning up.
claim(
mode: Exclude<GatewayTailscaleMode, "off">,
port: number,
): Promise<{
owner: TailscaleExposureOwnerRecord;
previousOwner: TailscaleExposureOwnerRecord | null;
}>;
// Replaces the stored record with nextOwner (or removes it when null), but
// only if the stored token equals `token`. Resolves to whether it did so.
replaceIfCurrent(token: string, nextOwner: TailscaleExposureOwnerRecord | null): Promise<boolean>;
// Runs `cleanup` if the caller may clean up the current record; resolves to
// false when cleanup was skipped.
runCleanupIfCurrentOwner(token: string, cleanup: () => Promise<void>): Promise<boolean>;
};
/**
 * Reports whether a process with the given PID appears to exist.
 *
 * @param pid - Process id to probe; typically read back from a JSON record.
 * @returns `true` when the signal-0 probe succeeds or fails for any reason
 *   other than ESRCH; `false` when the process does not exist or `pid` cannot
 *   identify a single process.
 */
function isPidAlive(pid: number): boolean {
  // Zero and negative values address process groups rather than one process,
  // and non-integers make process.kill throw a TypeError. Without this guard
  // both cases would be reported as "alive", letting a corrupt record block
  // recovery indefinitely.
  if (!Number.isInteger(pid) || pid <= 0) {
    return false;
  }
  try {
    // Signal 0 performs the existence/permission check without delivering a signal.
    process.kill(pid, 0);
    return true;
  } catch (err) {
    // ESRCH means no such process. Other failures (such as EPERM) are treated
    // as "alive" so a process we merely cannot signal is not declared dead.
    return (err as NodeJS.ErrnoException | undefined)?.code !== "ESRCH";
  }
}
/**
 * Builds the error thrown when a claim is refused because another process is
 * still cleaning up. The error is identified by its `name`, not by a subclass.
 *
 * @param pid - PID of the process that is still cleaning up.
 */
function createCleanupInProgressError(pid: number): Error {
  const message = `previous cleanup still in progress (pid ${pid})`;
  return Object.assign(new Error(message), {
    name: "TailscaleExposureCleanupInProgressError",
  });
}
/**
 * Type guard for the "cleanup still in progress" error. Matches any Error
 * instance whose `name` is the cleanup-in-progress marker.
 */
function isCleanupInProgressError(err: unknown): err is Error {
  if (!(err instanceof Error)) {
    return false;
  }
  return err.name === "TailscaleExposureCleanupInProgressError";
}
/**
 * Creates the file-backed owner store used to coordinate Tailscale exposure
 * between gateway processes that share the same lock directory.
 *
 * Two files live under `resolveGatewayLockDir()`:
 * - `tailscale-exposure-owner.json` holds the current owner record.
 * - `tailscale-exposure-owner.lock` is created with the exclusive "wx" flag and
 *   serializes every read-modify-write of the owner record.
 *
 * @returns A store whose methods take the lock file around each state change.
 */
function createTailscaleExposureOwnerStore(): TailscaleExposureOwnerStore {
  const ownerFilePath = path.join(resolveGatewayLockDir(), "tailscale-exposure-owner.json");
  const ownerLockPath = path.join(resolveGatewayLockDir(), "tailscale-exposure-owner.lock");
  // Delay between attempts to take the lock or re-check a pending cleanup.
  const lockRetryMs = 25;
  // A lock file older than this is inspected to see whether its holder is gone.
  const lockStaleMs = 60_000;
  // How long claim() waits for a live process to finish cleaning up.
  const cleanupClaimWaitMs = 20_000;
  // Memoized mkdir so the lock directory is created at most once per store.
  let ensureLockDirReady: Promise<void> | null = null;

  /** Reads and validates the owner record; resolves to null when absent or malformed. */
  async function readOwner(): Promise<TailscaleExposureOwnerRecord | null> {
    try {
      const raw = await fs.readFile(ownerFilePath, "utf8");
      const parsed = JSON.parse(raw);
      if (
        parsed &&
        typeof parsed === "object" &&
        typeof parsed.token === "string" &&
        typeof parsed.mode === "string" &&
        typeof parsed.port === "number" &&
        typeof parsed.pid === "number" &&
        typeof parsed.claimedAt === "string"
      ) {
        return {
          ...(parsed as Omit<TailscaleExposureOwnerRecord, "phase">),
          // Records without a recognizable phase are treated as active.
          phase: parsed.phase === "cleaning" ? "cleaning" : "active",
          cleanupStartedAt:
            typeof parsed.cleanupStartedAt === "string" ? parsed.cleanupStartedAt : undefined,
        };
      }
    } catch {
      // ENOENT means the file does not exist yet. Any other parse/read error is
      // also ignored so the ownership guard remains best-effort and non-fatal.
    }
    return null;
  }

  /** Resolves after `ms` milliseconds. */
  async function sleep(ms: number) {
    await new Promise((resolve) => setTimeout(resolve, ms));
  }

  /** Creates the lock directory once; a failed attempt is forgotten so it can be retried. */
  function ensureLockDir() {
    if (!ensureLockDirReady) {
      ensureLockDirReady = fs
        .mkdir(path.dirname(ownerLockPath), { recursive: true })
        .catch((err: unknown) => {
          ensureLockDirReady = null;
          throw err;
        });
    }
    return ensureLockDirReady;
  }

  /**
   * Removes the lock file when it is older than `lockStaleMs` and its recorded
   * holder is not alive (or its contents cannot be read). Never throws.
   */
  async function breakStaleLock() {
    try {
      const stat = await fs.stat(ownerLockPath);
      if (Date.now() - stat.mtimeMs < lockStaleMs) {
        return;
      }
      try {
        const raw = await fs.readFile(ownerLockPath, "utf8");
        const parsed = JSON.parse(raw) as { pid?: unknown };
        if (typeof parsed.pid === "number" && isPidAlive(parsed.pid)) {
          return;
        }
      } catch {
        // Unreadable lock state is treated as stale so a dead holder cannot block recovery.
      }
      await fs.unlink(ownerLockPath).catch(() => {});
    } catch {
      // Ignore malformed or unreadable lock state and retry.
    }
  }

  /**
   * Runs `fn` while holding the lock file, retrying every `lockRetryMs` until
   * the exclusive create succeeds. The lock is released even when `fn` throws.
   */
  async function withOwnerLock<T>(fn: () => Promise<T>): Promise<T> {
    await ensureLockDir();
    while (true) {
      try {
        // "wx" fails with EEXIST when another holder already created the file.
        const handle = await fs.open(ownerLockPath, "wx");
        try {
          await handle.writeFile(
            JSON.stringify({ pid: process.pid, acquiredAt: new Date().toISOString() }),
          );
          return await fn();
        } finally {
          await handle.close().catch(() => {});
          await fs.unlink(ownerLockPath).catch(() => {});
        }
      } catch (err) {
        if ((err as NodeJS.ErrnoException | undefined)?.code !== "EEXIST") {
          throw err;
        }
        await breakStaleLock();
        await sleep(lockRetryMs);
      }
    }
  }

  /** Deletes the owner record, tolerating a file that is already gone. */
  async function deleteOwnerFile() {
    await fs.unlink(ownerFilePath).catch((err: unknown) => {
      if ((err as NodeJS.ErrnoException | undefined)?.code !== "ENOENT") {
        throw err;
      }
    });
  }

  return {
    /**
     * Writes a new active owner record for this process. While a live process
     * is mid-cleanup the claim is retried for up to `cleanupClaimWaitMs`, after
     * which it rejects with a TailscaleExposureCleanupInProgressError.
     */
    async claim(mode, port) {
      const claimDeadline = Date.now() + cleanupClaimWaitMs;
      while (true) {
        const result = await withOwnerLock(async () => {
          const previousOwner = await readOwner();
          if (previousOwner?.phase === "cleaning" && isPidAlive(previousOwner.pid)) {
            if (Date.now() < claimDeadline) {
              return { type: "wait" as const };
            }
            return { type: "blocked" as const, previousOwner };
          }
          const owner: TailscaleExposureOwnerRecord = {
            token: randomUUID(),
            mode,
            port,
            pid: process.pid,
            claimedAt: new Date().toISOString(),
            phase: "active",
          };
          await fs.writeFile(ownerFilePath, JSON.stringify(owner), "utf8");
          return { type: "claimed" as const, owner, previousOwner };
        });
        if (result.type === "claimed") {
          return result;
        }
        if (result.type === "blocked") {
          throw createCleanupInProgressError(result.previousOwner.pid);
        }
        // "wait": sleep outside the lock so the cleaning process can finish.
        await sleep(lockRetryMs);
      }
    },

    /** Replaces (or deletes, when `nextOwner` is null) the record if `token` is current. */
    async replaceIfCurrent(token, nextOwner) {
      return await withOwnerLock(async () => {
        const current = await readOwner();
        if (current?.token !== token) {
          return false;
        }
        if (nextOwner) {
          await fs.writeFile(ownerFilePath, JSON.stringify(nextOwner), "utf8");
        } else {
          await deleteOwnerFile();
        }
        return true;
      });
    },

    /**
     * Runs `cleanup` when the caller owns the record, or when the recorded
     * owner's process is no longer alive. Resolves to false when skipped and
     * rethrows whatever `cleanup` throws after rolling the record back.
     */
    async runCleanupIfCurrentOwner(token, cleanup) {
      const reservation = await withOwnerLock(async () => {
        const current = await readOwner();
        if (!current) {
          return null;
        }
        if (current.token !== token && isPidAlive(current.pid)) {
          return null;
        }
        // If a newer owner died without cleaning up, reclaim its stale exposure
        // before this older process exits so resetOnExit still clears Tailscale.
        // Mark cleanup in progress before releasing the lock so overlapping
        // startups cannot claim exposure until this reset finishes or fails.
        const cleanupOwner: TailscaleExposureOwnerRecord = {
          ...current,
          // Record the process that is actually running the reset. When the
          // record was reclaimed from a dead owner, keeping that dead pid would
          // make claim() see no live cleaner and start exposure mid-reset.
          pid: process.pid,
          phase: "cleaning",
          cleanupStartedAt: new Date().toISOString(),
        };
        await fs.writeFile(ownerFilePath, JSON.stringify(cleanupOwner), "utf8");
        return { cleanupOwner, reclaimedFromPid: current.pid };
      });
      if (!reservation) {
        return false;
      }
      const { cleanupOwner, reclaimedFromPid } = reservation;
      // Compare against the token that was written to disk. For a reclaimed
      // record that is the dead owner's token, not the caller's, so matching on
      // the caller's token would never finalize or roll back the record.
      const stillOurCleanup = (current: TailscaleExposureOwnerRecord | null) =>
        current?.token === cleanupOwner.token && current.phase === "cleaning";
      try {
        await cleanup();
      } catch (err) {
        // Roll the record back to its pre-cleanup state so a later attempt can retry.
        await withOwnerLock(async () => {
          const current = await readOwner();
          if (!stillOurCleanup(current)) {
            return;
          }
          await fs.writeFile(
            ownerFilePath,
            JSON.stringify({
              ...cleanupOwner,
              pid: reclaimedFromPid,
              phase: "active",
              cleanupStartedAt: undefined,
            }),
            "utf8",
          );
        }).catch(() => {});
        throw err;
      }
      await withOwnerLock(async () => {
        const current = await readOwner();
        if (!stillOurCleanup(current)) {
          return;
        }
        await deleteOwnerFile();
      }).catch(() => {
        // Tailscale cleanup already succeeded. If owner-file deletion fails, leave
        // the stale cleaning record for best-effort recovery after this process exits.
      });
      return true;
    },
  };
}
export async function startGatewayTailscaleExposure(params: {
tailscaleMode: "off" | "serve" | "funnel";
tailscaleMode: GatewayTailscaleMode;
resetOnExit?: boolean;
port: number;
controlUiBasePath?: string;
logTailscale: { info: (msg: string) => void; warn: (msg: string) => void };
ownerStore?: TailscaleExposureOwnerStore;
}): Promise<(() => Promise<void>) | null> {
if (params.tailscaleMode === "off") {
return null;
}
const ownerStore = params.ownerStore ?? createTailscaleExposureOwnerStore();
let owner: TailscaleExposureOwnerRecord | null = null;
let previousOwner: TailscaleExposureOwnerRecord | null = null;
try {
({ owner, previousOwner } = await ownerStore.claim(params.tailscaleMode, params.port));
} catch (err) {
if (isCleanupInProgressError(err)) {
params.logTailscale.warn(
`${params.tailscaleMode} ownership cleanup still in progress; skipping external exposure`,
);
return null;
}
params.logTailscale.warn(
`${params.tailscaleMode} ownership guard unavailable: ${err instanceof Error ? err.message : String(err)}`,
);
}
try {
if (params.tailscaleMode === "serve") {
await enableTailscaleServe(params.port);
@ -33,6 +313,18 @@ export async function startGatewayTailscaleExposure(params: {
params.logTailscale.info(`${params.tailscaleMode} enabled`);
}
} catch (err) {
if (owner) {
// Restore the previous owner's original token so its own cleanup hook can
// still prove ownership later, or keep this failed owner if it needs to
// reset any partial/orphaned Tailscale state on exit.
const nextOwner =
previousOwner && isPidAlive(previousOwner.pid)
? previousOwner
: params.resetOnExit
? owner
: null;
await ownerStore.replaceIfCurrent(owner.token, nextOwner).catch(() => {});
}
params.logTailscale.warn(
`${params.tailscaleMode} failed: ${err instanceof Error ? err.message : String(err)}`,
);
@ -42,17 +334,46 @@ export async function startGatewayTailscaleExposure(params: {
return null;
}
return async () => {
try {
const disableExposure = async () => {
if (params.tailscaleMode === "serve") {
await disableTailscaleServe();
} else {
await disableTailscaleFunnel();
}
};
return async () => {
try {
if (owner) {
// A failed enable can still leave behind partial or orphaned Tailscale
// config, so shutdown cleanup intentionally re-runs the global reset.
const cleanedUp = await ownerStore.runCleanupIfCurrentOwner(owner.token, async () => {
await disableExposure();
});
if (!cleanedUp) {
params.logTailscale.info(
`${params.tailscaleMode} cleanup skipped: not the current owner`,
);
}
return;
}
await disableExposure();
} catch (err) {
params.logTailscale.warn(
`${params.tailscaleMode} cleanup failed: ${err instanceof Error ? err.message : String(err)}`,
);
if (!owner) {
return;
}
try {
await disableExposure();
params.logTailscale.warn(
`${params.tailscaleMode} cleanup guard failed; applied direct reset fallback`,
);
} catch {
// The direct reset failed too; keep the original warning above.
}
}
};
}