From 97b4f72d6bfe1af33eeb44212606d7292680818d Mon Sep 17 00:00:00 2001 From: Nimrod Gutman Date: Mon, 16 Mar 2026 12:03:01 +0200 Subject: [PATCH] fix(gateway): serialize tailscale cleanup handoff --- src/gateway/server-tailscale.test.ts | 50 ++++++++++- src/gateway/server-tailscale.ts | 127 ++++++++++++++++++++++----- 2 files changed, 150 insertions(+), 27 deletions(-) diff --git a/src/gateway/server-tailscale.test.ts b/src/gateway/server-tailscale.test.ts index b7cf1127bcc..1fef3839453 100644 --- a/src/gateway/server-tailscale.test.ts +++ b/src/gateway/server-tailscale.test.ts @@ -25,18 +25,21 @@ function createOwnerStore() { port: number; pid: number; claimedAt: string; + phase: "active" | "cleaning"; + cleanupStartedAt?: string; } = null; - let nextId = 0; + let nextPid = process.pid - 1; return { async claim(mode: "serve" | "funnel", port: number) { const previousOwner = currentOwner; const owner = { - token: `owner-${++nextId}`, + token: `owner-${++nextPid}`, mode, port, - pid: nextId, + pid: nextPid, claimedAt: new Date(0).toISOString(), + phase: "active" as const, }; currentOwner = owner; return { owner, previousOwner }; @@ -52,6 +55,11 @@ function createOwnerStore() { if (currentOwner?.token !== token) { return false; } + currentOwner = { + ...currentOwner, + phase: "cleaning", + cleanupStartedAt: new Date(0).toISOString(), + }; await cleanup(); currentOwner = null; return true; @@ -166,7 +174,7 @@ describe.each(modeCases)( }); vi.spyOn(process, "kill").mockImplementation(((pid: number) => { - if (pid === 1) { + if (pid === process.pid) { const err = new Error("gone") as NodeJS.ErrnoException; err.code = "ESRCH"; throw err; @@ -229,5 +237,39 @@ describe.each(modeCases)( await cleanup?.(); expect(disableMock).toHaveBeenCalledTimes(1); }); + + it("skips unguarded fallback while a previous cleanup is still in progress", async () => { + const logTailscale = { + info: vi.fn(), + warn: vi.fn(), + }; + + const cleanup = await startGatewayTailscaleExposure({ + tailscaleMode: mode, + resetOnExit: true, + port: 18789, + logTailscale, + ownerStore: { + async claim() { + const err = new Error("busy"); + err.name = "TailscaleExposureCleanupInProgressError"; + throw err; + }, + async replaceIfCurrent() { + return false; + }, + async runCleanupIfCurrentOwner() { + return false; + }, + }, + }); + + expect(cleanup).toBeNull(); + expect(enableMock).not.toHaveBeenCalled(); + expect(disableMock).not.toHaveBeenCalled(); + expect(logTailscale.warn).toHaveBeenCalledWith( + `${mode} ownership cleanup still in progress; skipping external exposure`, + ); + }); }, ); diff --git a/src/gateway/server-tailscale.ts b/src/gateway/server-tailscale.ts index c83f5cbd745..dba699efa22 100644 --- a/src/gateway/server-tailscale.ts +++ b/src/gateway/server-tailscale.ts @@ -18,6 +18,8 @@ type TailscaleExposureOwnerRecord = { port: number; pid: number; claimedAt: string; + phase: "active" | "cleaning"; + cleanupStartedAt?: string; }; type TailscaleExposureOwnerStore = { @@ -41,11 +43,22 @@ function isPidAlive(pid: number): boolean { } } +function createCleanupInProgressError(pid: number): Error { + const err = new Error(`previous cleanup still in progress (pid ${pid})`); + err.name = "TailscaleExposureCleanupInProgressError"; + return err; +} + +function isCleanupInProgressError(err: unknown): err is Error { + return err instanceof Error && err.name === "TailscaleExposureCleanupInProgressError"; +} + function createTailscaleExposureOwnerStore(): TailscaleExposureOwnerStore { const ownerFilePath = path.join(resolveGatewayLockDir(), "tailscale-exposure-owner.json"); const ownerLockPath = path.join(resolveGatewayLockDir(), "tailscale-exposure-owner.lock"); const lockRetryMs = 25; const lockStaleMs = 60_000; + const cleanupClaimWaitMs = 20_000; async function readOwner(): Promise { try { @@ -60,7 +73,12 @@ function createTailscaleExposureOwnerStore(): TailscaleExposureOwnerStore { typeof parsed.pid === "number" && typeof parsed.claimedAt === "string" ) { - return parsed as TailscaleExposureOwnerRecord; + return { + ...(parsed as Omit), + phase: parsed.phase === "cleaning" ? "cleaning" : "active", + cleanupStartedAt: + typeof parsed.cleanupStartedAt === "string" ? parsed.cleanupStartedAt : undefined, + }; } } catch { // ENOENT means the file does not exist yet. Any other parse/read error is @@ -89,10 +107,8 @@ function createTailscaleExposureOwnerStore(): TailscaleExposureOwnerStore { // Unreadable lock state is treated as stale so a dead holder cannot block recovery. } await fs.unlink(ownerLockPath).catch(() => {}); - } catch (err) { - if ((err as NodeJS.ErrnoException | undefined)?.code !== "ENOENT") { - // Ignore malformed or unreadable lock state and retry. - } + } catch { + // Ignore malformed or unreadable lock state and retry. } } @@ -131,18 +147,42 @@ function createTailscaleExposureOwnerStore(): TailscaleExposureOwnerStore { return { async claim(mode, port) { - return await withOwnerLock(async () => { - const previousOwner = await readOwner(); - const owner: TailscaleExposureOwnerRecord = { - token: randomUUID(), - mode, - port, - pid: process.pid, - claimedAt: new Date().toISOString(), - }; - await fs.writeFile(ownerFilePath, JSON.stringify(owner), "utf8"); - return { owner, previousOwner }; - }); + while (true) { + const result = await withOwnerLock(async () => { + const previousOwner = await readOwner(); + if (previousOwner?.phase === "cleaning" && isPidAlive(previousOwner.pid)) { + const cleanupStartedAtMs = Date.parse( + previousOwner.cleanupStartedAt ?? previousOwner.claimedAt, + ); + const cleanupAgeMs = Number.isFinite(cleanupStartedAtMs) + ? Date.now() - cleanupStartedAtMs + : Number.POSITIVE_INFINITY; + if (cleanupAgeMs < cleanupClaimWaitMs) { + return { type: "wait" as const }; + } + return { type: "blocked" as const, previousOwner }; + } + + const owner: TailscaleExposureOwnerRecord = { + token: randomUUID(), + mode, + port, + pid: process.pid, + claimedAt: new Date().toISOString(), + phase: "active", + }; + await fs.writeFile(ownerFilePath, JSON.stringify(owner), "utf8"); + return { type: "claimed" as const, owner, previousOwner }; + }); + + if (result.type === "claimed") { + return result; + } + if (result.type === "blocked") { + throw createCleanupInProgressError(result.previousOwner.pid); + } + await sleep(lockRetryMs); + } }, async replaceIfCurrent(token, nextOwner) { return await withOwnerLock(async () => { @@ -159,18 +199,53 @@ function createTailscaleExposureOwnerStore(): TailscaleExposureOwnerStore { }); }, async runCleanupIfCurrentOwner(token, cleanup) { - const shouldRunCleanup = await withOwnerLock(async () => { + const cleanupOwner = await withOwnerLock(async () => { const current = await readOwner(); if (current?.token !== token) { - return false; + return null; } - await deleteOwnerFile(); - return true; + // Mark cleanup in progress before releasing the lock so overlapping + // startups cannot claim exposure until this reset finishes or fails. + const nextOwner: TailscaleExposureOwnerRecord = { + ...current, + phase: "cleaning", + cleanupStartedAt: new Date().toISOString(), + }; + await fs.writeFile(ownerFilePath, JSON.stringify(nextOwner), "utf8"); + return nextOwner; }); - if (!shouldRunCleanup) { + if (!cleanupOwner) { return false; } - await cleanup(); + + try { + await cleanup(); + } catch (err) { + await withOwnerLock(async () => { + const current = await readOwner(); + if (current?.token !== token || current.phase !== "cleaning") { + return; + } + await fs.writeFile( + ownerFilePath, + JSON.stringify({ + ...cleanupOwner, + phase: "active", + cleanupStartedAt: undefined, + }), + "utf8", + ); + }).catch(() => {}); + throw err; + } + + await withOwnerLock(async () => { + const current = await readOwner(); + if (current?.token !== token || current.phase !== "cleaning") { + return; + } + await deleteOwnerFile(); + }); return true; }, }; @@ -195,6 +270,12 @@ export async function startGatewayTailscaleExposure(params: { try { ({ owner, previousOwner } = await ownerStore.claim(params.tailscaleMode, params.port)); } catch (err) { + if (isCleanupInProgressError(err)) { + params.logTailscale.warn( + `${params.tailscaleMode} ownership cleanup still in progress; skipping external exposure`, + ); + return null; + } params.logTailscale.warn( `${params.tailscaleMode} ownership guard unavailable: ${err instanceof Error ? err.message : String(err)}`, );