fix(gateway): harden tailscale ownership rollback

This commit is contained in:
Nimrod Gutman 2026-03-16 11:08:55 +02:00
parent c47caaa198
commit dd570f9b7a
2 changed files with 166 additions and 59 deletions

View File

@ -19,28 +19,42 @@ vi.mock("../infra/tailscale.js", () => ({
import { startGatewayTailscaleExposure } from "./server-tailscale.js";
function createOwnerStore() {
let currentToken: string | null = null;
let currentOwner: null | {
token: string;
mode: "serve" | "funnel";
port: number;
pid: number;
claimedAt: string;
} = null;
let nextId = 0;
return {
async claim(mode: "serve" | "funnel", port: number) {
const record = {
const previousOwner = currentOwner;
const owner = {
token: `owner-${++nextId}`,
mode,
port,
pid: nextId,
claimedAt: new Date(0).toISOString(),
};
currentToken = record.token;
return record;
currentOwner = owner;
return { owner, previousOwner };
},
async isCurrentOwner(token: string) {
return currentToken === token;
},
async clearIfCurrentOwner(token: string) {
if (currentToken === token) {
currentToken = null;
async replaceIfCurrent(token: string, nextOwner: typeof currentOwner | null) {
if (currentOwner?.token !== token) {
return false;
}
currentOwner = nextOwner;
return true;
},
async runCleanupIfCurrentOwner(token: string, cleanup: () => Promise<void>) {
if (currentOwner?.token !== token) {
return false;
}
await cleanup();
currentOwner = null;
return true;
},
};
}
@ -74,23 +88,20 @@ describe("startGatewayTailscaleExposure", () => {
await cleanupA?.();
expect(tailscaleState.disableServe).not.toHaveBeenCalled();
expect(logTailscale.info).toHaveBeenCalledWith(
"serve cleanup skipped: newer gateway owns Tailscale exposure",
);
expect(logTailscale.info).toHaveBeenCalledWith("serve cleanup skipped: not the current owner");
await cleanupB?.();
expect(tailscaleState.disableServe).toHaveBeenCalledTimes(1);
});
it("clears ownership after a startup failure", async () => {
it("restores the previous owner after a takeover startup failure", async () => {
const ownerStore = createOwnerStore();
const logTailscale = {
info: vi.fn(),
warn: vi.fn(),
};
tailscaleState.enableServe.mockRejectedValueOnce(new Error("boom"));
const cleanup = await startGatewayTailscaleExposure({
const cleanupA = await startGatewayTailscaleExposure({
tailscaleMode: "serve",
resetOnExit: true,
port: 18789,
@ -98,10 +109,24 @@ describe("startGatewayTailscaleExposure", () => {
ownerStore,
});
expect(cleanup).not.toBeNull();
tailscaleState.enableServe.mockRejectedValueOnce(new Error("boom"));
const cleanupB = await startGatewayTailscaleExposure({
tailscaleMode: "serve",
resetOnExit: true,
port: 18789,
logTailscale,
ownerStore,
});
expect(cleanupB).not.toBeNull();
expect(logTailscale.warn).toHaveBeenCalledWith("serve failed: boom");
await cleanup?.();
await cleanupB?.();
expect(tailscaleState.disableServe).not.toHaveBeenCalled();
expect(logTailscale.info).toHaveBeenCalledWith("serve cleanup skipped: not the current owner");
await cleanupA?.();
expect(tailscaleState.disableServe).toHaveBeenCalledTimes(1);
});
});

View File

@ -24,13 +24,19 @@ type TailscaleExposureOwnerStore = {
claim(
mode: Exclude<GatewayTailscaleMode, "off">,
port: number,
): Promise<TailscaleExposureOwnerRecord>;
isCurrentOwner(token: string): Promise<boolean>;
clearIfCurrentOwner(token: string): Promise<void>;
): Promise<{
owner: TailscaleExposureOwnerRecord;
previousOwner: TailscaleExposureOwnerRecord | null;
}>;
replaceIfCurrent(token: string, nextOwner: TailscaleExposureOwnerRecord | null): Promise<boolean>;
runCleanupIfCurrentOwner(token: string, cleanup: () => Promise<void>): Promise<boolean>;
};
function createTailscaleExposureOwnerStore(): TailscaleExposureOwnerStore {
const ownerFilePath = path.join(resolveGatewayLockDir(), "tailscale-exposure-owner.json");
const ownerLockPath = path.join(resolveGatewayLockDir(), "tailscale-exposure-owner.lock");
const lockRetryMs = 25;
const lockStaleMs = 60_000;
async function readOwner(): Promise<TailscaleExposureOwnerRecord | null> {
try {
@ -47,42 +53,120 @@ function createTailscaleExposureOwnerStore(): TailscaleExposureOwnerStore {
) {
return parsed as TailscaleExposureOwnerRecord;
}
} catch (err) {
if ((err as NodeJS.ErrnoException | undefined)?.code !== "ENOENT") {
// Ignore malformed or unreadable ownership state and continue.
}
} catch {
// ENOENT means the file does not exist yet. Any other parse/read error is
// also ignored so the ownership guard remains best-effort and non-fatal.
}
return null;
}
return {
async claim(mode, port) {
const record: TailscaleExposureOwnerRecord = {
token: randomUUID(),
mode,
port,
pid: process.pid,
claimedAt: new Date().toISOString(),
};
await fs.mkdir(path.dirname(ownerFilePath), { recursive: true });
await fs.writeFile(ownerFilePath, JSON.stringify(record), "utf8");
return record;
},
async isCurrentOwner(token) {
const current = await readOwner();
return current?.token === token;
},
async clearIfCurrentOwner(token) {
if (!(await this.isCurrentOwner(token))) {
async function sleep(ms: number) {
await new Promise((resolve) => setTimeout(resolve, ms));
}
function isPidAlive(pid: number): boolean {
try {
process.kill(pid, 0);
return true;
} catch (err) {
return (err as NodeJS.ErrnoException | undefined)?.code !== "ESRCH";
}
}
async function breakStaleLock() {
try {
const [raw, stat] = await Promise.all([
fs.readFile(ownerLockPath, "utf8").catch(() => null),
fs.stat(ownerLockPath),
]);
if (Date.now() - stat.mtimeMs < lockStaleMs) {
return;
}
const parsed = raw ? JSON.parse(raw) : null;
const pid = typeof parsed?.pid === "number" ? parsed.pid : null;
if (pid !== null && isPidAlive(pid)) {
return;
}
await fs.unlink(ownerLockPath).catch(() => {});
} catch (err) {
if ((err as NodeJS.ErrnoException | undefined)?.code !== "ENOENT") {
// Ignore malformed or unreadable lock state and retry.
}
}
}
async function withOwnerLock<T>(fn: () => Promise<T>): Promise<T> {
await fs.mkdir(path.dirname(ownerLockPath), { recursive: true });
while (true) {
try {
await fs.unlink(ownerFilePath);
const handle = await fs.open(ownerLockPath, "wx");
try {
await handle.writeFile(
JSON.stringify({ pid: process.pid, acquiredAt: new Date().toISOString() }),
);
return await fn();
} finally {
await handle.close().catch(() => {});
await fs.unlink(ownerLockPath).catch(() => {});
}
} catch (err) {
if ((err as NodeJS.ErrnoException | undefined)?.code !== "ENOENT") {
if ((err as NodeJS.ErrnoException | undefined)?.code !== "EEXIST") {
throw err;
}
await breakStaleLock();
await sleep(lockRetryMs);
}
}
}
async function deleteOwnerFile() {
await fs.unlink(ownerFilePath).catch((err: unknown) => {
if ((err as NodeJS.ErrnoException | undefined)?.code !== "ENOENT") {
throw err;
}
});
}
return {
async claim(mode, port) {
return await withOwnerLock(async () => {
const previousOwner = await readOwner();
const owner: TailscaleExposureOwnerRecord = {
token: randomUUID(),
mode,
port,
pid: process.pid,
claimedAt: new Date().toISOString(),
};
await fs.writeFile(ownerFilePath, JSON.stringify(owner), "utf8");
return { owner, previousOwner };
});
},
async replaceIfCurrent(token, nextOwner) {
return await withOwnerLock(async () => {
const current = await readOwner();
if (current?.token !== token) {
return false;
}
if (nextOwner) {
await fs.writeFile(ownerFilePath, JSON.stringify(nextOwner), "utf8");
} else {
await deleteOwnerFile();
}
return true;
});
},
async runCleanupIfCurrentOwner(token, cleanup) {
return await withOwnerLock(async () => {
const current = await readOwner();
if (current?.token !== token) {
return false;
}
await cleanup();
await deleteOwnerFile();
return true;
});
},
};
}
@ -100,7 +184,7 @@ export async function startGatewayTailscaleExposure(params: {
}
const ownerStore = params.ownerStore ?? createTailscaleExposureOwnerStore();
const owner = await ownerStore.claim(params.tailscaleMode, params.port);
const { owner, previousOwner } = await ownerStore.claim(params.tailscaleMode, params.port);
try {
if (params.tailscaleMode === "serve") {
@ -118,7 +202,7 @@ export async function startGatewayTailscaleExposure(params: {
params.logTailscale.info(`${params.tailscaleMode} enabled`);
}
} catch (err) {
await ownerStore.clearIfCurrentOwner(owner.token).catch(() => {});
await ownerStore.replaceIfCurrent(owner.token, previousOwner).catch(() => {});
params.logTailscale.warn(
`${params.tailscaleMode} failed: ${err instanceof Error ? err.message : String(err)}`,
);
@ -130,18 +214,16 @@ export async function startGatewayTailscaleExposure(params: {
return async () => {
try {
if (!(await ownerStore.isCurrentOwner(owner.token))) {
params.logTailscale.info(
`${params.tailscaleMode} cleanup skipped: newer gateway owns Tailscale exposure`,
);
return;
const cleanedUp = await ownerStore.runCleanupIfCurrentOwner(owner.token, async () => {
if (params.tailscaleMode === "serve") {
await disableTailscaleServe();
} else {
await disableTailscaleFunnel();
}
});
if (!cleanedUp) {
params.logTailscale.info(`${params.tailscaleMode} cleanup skipped: not the current owner`);
}
if (params.tailscaleMode === "serve") {
await disableTailscaleServe();
} else {
await disableTailscaleFunnel();
}
await ownerStore.clearIfCurrentOwner(owner.token);
} catch (err) {
params.logTailscale.warn(
`${params.tailscaleMode} cleanup failed: ${err instanceof Error ? err.message : String(err)}`,