fix(gateway): harden tailscale cleanup recovery

This commit is contained in:
Nimrod Gutman 2026-03-16 14:33:52 +02:00
parent 02069176bc
commit 9f52a62e70
2 changed files with 119 additions and 19 deletions

View File

@ -27,6 +27,7 @@ function createOwnerStore() {
claimedAt: string;
phase: "active" | "cleaning";
cleanupStartedAt?: string;
alive: boolean;
} = null;
let nextPid = process.pid - 1;
@ -40,6 +41,7 @@ function createOwnerStore() {
pid: nextPid,
claimedAt: new Date(0).toISOString(),
phase: "active" as const,
alive: true,
};
currentOwner = owner;
return { owner, previousOwner };
@ -52,7 +54,10 @@ function createOwnerStore() {
return true;
},
async runCleanupIfCurrentOwner(token: string, cleanup: () => Promise<void>) {
if (currentOwner?.token !== token) {
if (!currentOwner) {
return false;
}
if (currentOwner.token !== token && currentOwner.alive) {
return false;
}
currentOwner = {
@ -64,6 +69,11 @@ function createOwnerStore() {
currentOwner = null;
return true;
},
markCurrentOwnerDead() {
if (currentOwner) {
currentOwner = { ...currentOwner, alive: false };
}
},
};
}
@ -204,6 +214,36 @@ describe.each(modeCases)(
expect(disableMock).toHaveBeenCalledTimes(1);
});
it("reclaims cleanup from a dead newer owner before exit", async () => {
const ownerStore = createOwnerStore();
const logTailscale = {
info: vi.fn(),
warn: vi.fn(),
};
const cleanupA = await startGatewayTailscaleExposure({
tailscaleMode: mode,
resetOnExit: true,
port: 18789,
logTailscale,
ownerStore,
});
await startGatewayTailscaleExposure({
tailscaleMode: mode,
resetOnExit: true,
port: 18789,
logTailscale,
ownerStore,
});
ownerStore.markCurrentOwnerDead();
await cleanupA?.();
expect(disableMock).toHaveBeenCalledTimes(1);
expect(logTailscale.info).not.toHaveBeenCalledWith(
`${mode} cleanup skipped: not the current owner`,
);
});
it("falls back to unguarded cleanup when the ownership guard cannot claim", async () => {
const logTailscale = {
info: vi.fn(),
@ -271,5 +311,49 @@ describe.each(modeCases)(
`${mode} ownership cleanup still in progress; skipping external exposure`,
);
});
it("falls back to a direct reset when guarded cleanup bookkeeping fails", async () => {
const logTailscale = {
info: vi.fn(),
warn: vi.fn(),
};
const cleanup = await startGatewayTailscaleExposure({
tailscaleMode: mode,
resetOnExit: true,
port: 18789,
logTailscale,
ownerStore: {
async claim() {
return {
owner: {
token: "owner-1",
mode,
port: 18789,
pid: process.pid,
claimedAt: new Date(0).toISOString(),
phase: "active" as const,
},
previousOwner: null,
};
},
async replaceIfCurrent() {
return true;
},
async runCleanupIfCurrentOwner() {
throw new Error("lock dir unavailable");
},
},
});
await cleanup?.();
expect(disableMock).toHaveBeenCalledTimes(1);
expect(logTailscale.warn).toHaveBeenCalledWith(
`${mode} cleanup failed: lock dir unavailable`,
);
expect(logTailscale.warn).toHaveBeenCalledWith(
`${mode} cleanup guard failed; applied direct reset fallback`,
);
});
},
);

View File

@ -160,17 +160,12 @@ function createTailscaleExposureOwnerStore(): TailscaleExposureOwnerStore {
return {
async claim(mode, port) {
const claimDeadline = Date.now() + cleanupClaimWaitMs;
while (true) {
const result = await withOwnerLock(async () => {
const previousOwner = await readOwner();
if (previousOwner?.phase === "cleaning" && isPidAlive(previousOwner.pid)) {
const cleanupStartedAtMs = Date.parse(
previousOwner.cleanupStartedAt ?? previousOwner.claimedAt,
);
const cleanupAgeMs = Number.isFinite(cleanupStartedAtMs)
? Date.now() - cleanupStartedAtMs
: Number.POSITIVE_INFINITY;
if (cleanupAgeMs < cleanupClaimWaitMs) {
if (Date.now() < claimDeadline) {
return { type: "wait" as const };
}
return { type: "blocked" as const, previousOwner };
@ -214,9 +209,14 @@ function createTailscaleExposureOwnerStore(): TailscaleExposureOwnerStore {
async runCleanupIfCurrentOwner(token, cleanup) {
const cleanupOwner = await withOwnerLock(async () => {
const current = await readOwner();
if (current?.token !== token) {
if (!current) {
return null;
}
if (current.token !== token && isPidAlive(current.pid)) {
return null;
}
// If a newer owner died without cleaning up, reclaim its stale exposure
// before this older process exits so resetOnExit still clears Tailscale.
// Mark cleanup in progress before releasing the lock so overlapping
// startups cannot claim exposure until this reset finishes or fails.
const nextOwner: TailscaleExposureOwnerRecord = {
@ -314,6 +314,9 @@ export async function startGatewayTailscaleExposure(params: {
}
} catch (err) {
if (owner) {
// Restore the previous owner's original token so its own cleanup hook can
// still prove ownership later, or keep this failed owner if it needs to
// reset any partial/orphaned Tailscale state on exit.
const nextOwner =
previousOwner && isPidAlive(previousOwner.pid)
? previousOwner
@ -331,15 +334,21 @@ export async function startGatewayTailscaleExposure(params: {
return null;
}
const disableExposure = async () => {
if (params.tailscaleMode === "serve") {
await disableTailscaleServe();
} else {
await disableTailscaleFunnel();
}
};
return async () => {
try {
if (owner) {
// A failed enable can still leave behind partial or orphaned Tailscale
// config, so shutdown cleanup intentionally re-runs the global reset.
const cleanedUp = await ownerStore.runCleanupIfCurrentOwner(owner.token, async () => {
if (params.tailscaleMode === "serve") {
await disableTailscaleServe();
} else {
await disableTailscaleFunnel();
}
await disableExposure();
});
if (!cleanedUp) {
params.logTailscale.info(
@ -349,15 +358,22 @@ export async function startGatewayTailscaleExposure(params: {
return;
}
if (params.tailscaleMode === "serve") {
await disableTailscaleServe();
} else {
await disableTailscaleFunnel();
}
await disableExposure();
} catch (err) {
params.logTailscale.warn(
`${params.tailscaleMode} cleanup failed: ${err instanceof Error ? err.message : String(err)}`,
);
if (!owner) {
return;
}
try {
await disableExposure();
params.logTailscale.warn(
`${params.tailscaleMode} cleanup guard failed; applied direct reset fallback`,
);
} catch {
// The direct reset failed too; keep the original warning above.
}
}
};
}