fix(gateway): harden tailscale ownership rollback
This commit is contained in:
parent
c47caaa198
commit
dd570f9b7a
@ -19,28 +19,42 @@ vi.mock("../infra/tailscale.js", () => ({
|
||||
import { startGatewayTailscaleExposure } from "./server-tailscale.js";
|
||||
|
||||
function createOwnerStore() {
|
||||
let currentToken: string | null = null;
|
||||
let currentOwner: null | {
|
||||
token: string;
|
||||
mode: "serve" | "funnel";
|
||||
port: number;
|
||||
pid: number;
|
||||
claimedAt: string;
|
||||
} = null;
|
||||
let nextId = 0;
|
||||
|
||||
return {
|
||||
async claim(mode: "serve" | "funnel", port: number) {
|
||||
const record = {
|
||||
const previousOwner = currentOwner;
|
||||
const owner = {
|
||||
token: `owner-${++nextId}`,
|
||||
mode,
|
||||
port,
|
||||
pid: nextId,
|
||||
claimedAt: new Date(0).toISOString(),
|
||||
};
|
||||
currentToken = record.token;
|
||||
return record;
|
||||
currentOwner = owner;
|
||||
return { owner, previousOwner };
|
||||
},
|
||||
async isCurrentOwner(token: string) {
|
||||
return currentToken === token;
|
||||
},
|
||||
async clearIfCurrentOwner(token: string) {
|
||||
if (currentToken === token) {
|
||||
currentToken = null;
|
||||
async replaceIfCurrent(token: string, nextOwner: typeof currentOwner | null) {
|
||||
if (currentOwner?.token !== token) {
|
||||
return false;
|
||||
}
|
||||
currentOwner = nextOwner;
|
||||
return true;
|
||||
},
|
||||
async runCleanupIfCurrentOwner(token: string, cleanup: () => Promise<void>) {
|
||||
if (currentOwner?.token !== token) {
|
||||
return false;
|
||||
}
|
||||
await cleanup();
|
||||
currentOwner = null;
|
||||
return true;
|
||||
},
|
||||
};
|
||||
}
|
||||
@ -74,23 +88,20 @@ describe("startGatewayTailscaleExposure", () => {
|
||||
|
||||
await cleanupA?.();
|
||||
expect(tailscaleState.disableServe).not.toHaveBeenCalled();
|
||||
expect(logTailscale.info).toHaveBeenCalledWith(
|
||||
"serve cleanup skipped: newer gateway owns Tailscale exposure",
|
||||
);
|
||||
expect(logTailscale.info).toHaveBeenCalledWith("serve cleanup skipped: not the current owner");
|
||||
|
||||
await cleanupB?.();
|
||||
expect(tailscaleState.disableServe).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
|
||||
it("clears ownership after a startup failure", async () => {
|
||||
it("restores the previous owner after a takeover startup failure", async () => {
|
||||
const ownerStore = createOwnerStore();
|
||||
const logTailscale = {
|
||||
info: vi.fn(),
|
||||
warn: vi.fn(),
|
||||
};
|
||||
tailscaleState.enableServe.mockRejectedValueOnce(new Error("boom"));
|
||||
|
||||
const cleanup = await startGatewayTailscaleExposure({
|
||||
const cleanupA = await startGatewayTailscaleExposure({
|
||||
tailscaleMode: "serve",
|
||||
resetOnExit: true,
|
||||
port: 18789,
|
||||
@ -98,10 +109,24 @@ describe("startGatewayTailscaleExposure", () => {
|
||||
ownerStore,
|
||||
});
|
||||
|
||||
expect(cleanup).not.toBeNull();
|
||||
tailscaleState.enableServe.mockRejectedValueOnce(new Error("boom"));
|
||||
|
||||
const cleanupB = await startGatewayTailscaleExposure({
|
||||
tailscaleMode: "serve",
|
||||
resetOnExit: true,
|
||||
port: 18789,
|
||||
logTailscale,
|
||||
ownerStore,
|
||||
});
|
||||
|
||||
expect(cleanupB).not.toBeNull();
|
||||
expect(logTailscale.warn).toHaveBeenCalledWith("serve failed: boom");
|
||||
|
||||
await cleanup?.();
|
||||
await cleanupB?.();
|
||||
expect(tailscaleState.disableServe).not.toHaveBeenCalled();
|
||||
expect(logTailscale.info).toHaveBeenCalledWith("serve cleanup skipped: not the current owner");
|
||||
|
||||
await cleanupA?.();
|
||||
expect(tailscaleState.disableServe).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
});
|
||||
|
||||
@ -24,13 +24,19 @@ type TailscaleExposureOwnerStore = {
|
||||
claim(
|
||||
mode: Exclude<GatewayTailscaleMode, "off">,
|
||||
port: number,
|
||||
): Promise<TailscaleExposureOwnerRecord>;
|
||||
isCurrentOwner(token: string): Promise<boolean>;
|
||||
clearIfCurrentOwner(token: string): Promise<void>;
|
||||
): Promise<{
|
||||
owner: TailscaleExposureOwnerRecord;
|
||||
previousOwner: TailscaleExposureOwnerRecord | null;
|
||||
}>;
|
||||
replaceIfCurrent(token: string, nextOwner: TailscaleExposureOwnerRecord | null): Promise<boolean>;
|
||||
runCleanupIfCurrentOwner(token: string, cleanup: () => Promise<void>): Promise<boolean>;
|
||||
};
|
||||
|
||||
function createTailscaleExposureOwnerStore(): TailscaleExposureOwnerStore {
|
||||
const ownerFilePath = path.join(resolveGatewayLockDir(), "tailscale-exposure-owner.json");
|
||||
const ownerLockPath = path.join(resolveGatewayLockDir(), "tailscale-exposure-owner.lock");
|
||||
const lockRetryMs = 25;
|
||||
const lockStaleMs = 60_000;
|
||||
|
||||
async function readOwner(): Promise<TailscaleExposureOwnerRecord | null> {
|
||||
try {
|
||||
@ -47,42 +53,120 @@ function createTailscaleExposureOwnerStore(): TailscaleExposureOwnerStore {
|
||||
) {
|
||||
return parsed as TailscaleExposureOwnerRecord;
|
||||
}
|
||||
} catch (err) {
|
||||
if ((err as NodeJS.ErrnoException | undefined)?.code !== "ENOENT") {
|
||||
// Ignore malformed or unreadable ownership state and continue.
|
||||
}
|
||||
} catch {
|
||||
// ENOENT means the file does not exist yet. Any other parse/read error is
|
||||
// also ignored so the ownership guard remains best-effort and non-fatal.
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
return {
|
||||
async claim(mode, port) {
|
||||
const record: TailscaleExposureOwnerRecord = {
|
||||
token: randomUUID(),
|
||||
mode,
|
||||
port,
|
||||
pid: process.pid,
|
||||
claimedAt: new Date().toISOString(),
|
||||
};
|
||||
await fs.mkdir(path.dirname(ownerFilePath), { recursive: true });
|
||||
await fs.writeFile(ownerFilePath, JSON.stringify(record), "utf8");
|
||||
return record;
|
||||
},
|
||||
async isCurrentOwner(token) {
|
||||
const current = await readOwner();
|
||||
return current?.token === token;
|
||||
},
|
||||
async clearIfCurrentOwner(token) {
|
||||
if (!(await this.isCurrentOwner(token))) {
|
||||
async function sleep(ms: number) {
|
||||
await new Promise((resolve) => setTimeout(resolve, ms));
|
||||
}
|
||||
|
||||
function isPidAlive(pid: number): boolean {
|
||||
try {
|
||||
process.kill(pid, 0);
|
||||
return true;
|
||||
} catch (err) {
|
||||
return (err as NodeJS.ErrnoException | undefined)?.code !== "ESRCH";
|
||||
}
|
||||
}
|
||||
|
||||
async function breakStaleLock() {
|
||||
try {
|
||||
const [raw, stat] = await Promise.all([
|
||||
fs.readFile(ownerLockPath, "utf8").catch(() => null),
|
||||
fs.stat(ownerLockPath),
|
||||
]);
|
||||
if (Date.now() - stat.mtimeMs < lockStaleMs) {
|
||||
return;
|
||||
}
|
||||
const parsed = raw ? JSON.parse(raw) : null;
|
||||
const pid = typeof parsed?.pid === "number" ? parsed.pid : null;
|
||||
if (pid !== null && isPidAlive(pid)) {
|
||||
return;
|
||||
}
|
||||
await fs.unlink(ownerLockPath).catch(() => {});
|
||||
} catch (err) {
|
||||
if ((err as NodeJS.ErrnoException | undefined)?.code !== "ENOENT") {
|
||||
// Ignore malformed or unreadable lock state and retry.
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async function withOwnerLock<T>(fn: () => Promise<T>): Promise<T> {
|
||||
await fs.mkdir(path.dirname(ownerLockPath), { recursive: true });
|
||||
|
||||
while (true) {
|
||||
try {
|
||||
await fs.unlink(ownerFilePath);
|
||||
const handle = await fs.open(ownerLockPath, "wx");
|
||||
try {
|
||||
await handle.writeFile(
|
||||
JSON.stringify({ pid: process.pid, acquiredAt: new Date().toISOString() }),
|
||||
);
|
||||
return await fn();
|
||||
} finally {
|
||||
await handle.close().catch(() => {});
|
||||
await fs.unlink(ownerLockPath).catch(() => {});
|
||||
}
|
||||
} catch (err) {
|
||||
if ((err as NodeJS.ErrnoException | undefined)?.code !== "ENOENT") {
|
||||
if ((err as NodeJS.ErrnoException | undefined)?.code !== "EEXIST") {
|
||||
throw err;
|
||||
}
|
||||
await breakStaleLock();
|
||||
await sleep(lockRetryMs);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async function deleteOwnerFile() {
|
||||
await fs.unlink(ownerFilePath).catch((err: unknown) => {
|
||||
if ((err as NodeJS.ErrnoException | undefined)?.code !== "ENOENT") {
|
||||
throw err;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
return {
|
||||
async claim(mode, port) {
|
||||
return await withOwnerLock(async () => {
|
||||
const previousOwner = await readOwner();
|
||||
const owner: TailscaleExposureOwnerRecord = {
|
||||
token: randomUUID(),
|
||||
mode,
|
||||
port,
|
||||
pid: process.pid,
|
||||
claimedAt: new Date().toISOString(),
|
||||
};
|
||||
await fs.writeFile(ownerFilePath, JSON.stringify(owner), "utf8");
|
||||
return { owner, previousOwner };
|
||||
});
|
||||
},
|
||||
async replaceIfCurrent(token, nextOwner) {
|
||||
return await withOwnerLock(async () => {
|
||||
const current = await readOwner();
|
||||
if (current?.token !== token) {
|
||||
return false;
|
||||
}
|
||||
if (nextOwner) {
|
||||
await fs.writeFile(ownerFilePath, JSON.stringify(nextOwner), "utf8");
|
||||
} else {
|
||||
await deleteOwnerFile();
|
||||
}
|
||||
return true;
|
||||
});
|
||||
},
|
||||
async runCleanupIfCurrentOwner(token, cleanup) {
|
||||
return await withOwnerLock(async () => {
|
||||
const current = await readOwner();
|
||||
if (current?.token !== token) {
|
||||
return false;
|
||||
}
|
||||
await cleanup();
|
||||
await deleteOwnerFile();
|
||||
return true;
|
||||
});
|
||||
},
|
||||
};
|
||||
}
|
||||
@ -100,7 +184,7 @@ export async function startGatewayTailscaleExposure(params: {
|
||||
}
|
||||
|
||||
const ownerStore = params.ownerStore ?? createTailscaleExposureOwnerStore();
|
||||
const owner = await ownerStore.claim(params.tailscaleMode, params.port);
|
||||
const { owner, previousOwner } = await ownerStore.claim(params.tailscaleMode, params.port);
|
||||
|
||||
try {
|
||||
if (params.tailscaleMode === "serve") {
|
||||
@ -118,7 +202,7 @@ export async function startGatewayTailscaleExposure(params: {
|
||||
params.logTailscale.info(`${params.tailscaleMode} enabled`);
|
||||
}
|
||||
} catch (err) {
|
||||
await ownerStore.clearIfCurrentOwner(owner.token).catch(() => {});
|
||||
await ownerStore.replaceIfCurrent(owner.token, previousOwner).catch(() => {});
|
||||
params.logTailscale.warn(
|
||||
`${params.tailscaleMode} failed: ${err instanceof Error ? err.message : String(err)}`,
|
||||
);
|
||||
@ -130,18 +214,16 @@ export async function startGatewayTailscaleExposure(params: {
|
||||
|
||||
return async () => {
|
||||
try {
|
||||
if (!(await ownerStore.isCurrentOwner(owner.token))) {
|
||||
params.logTailscale.info(
|
||||
`${params.tailscaleMode} cleanup skipped: newer gateway owns Tailscale exposure`,
|
||||
);
|
||||
return;
|
||||
const cleanedUp = await ownerStore.runCleanupIfCurrentOwner(owner.token, async () => {
|
||||
if (params.tailscaleMode === "serve") {
|
||||
await disableTailscaleServe();
|
||||
} else {
|
||||
await disableTailscaleFunnel();
|
||||
}
|
||||
});
|
||||
if (!cleanedUp) {
|
||||
params.logTailscale.info(`${params.tailscaleMode} cleanup skipped: not the current owner`);
|
||||
}
|
||||
if (params.tailscaleMode === "serve") {
|
||||
await disableTailscaleServe();
|
||||
} else {
|
||||
await disableTailscaleFunnel();
|
||||
}
|
||||
await ownerStore.clearIfCurrentOwner(owner.token);
|
||||
} catch (err) {
|
||||
params.logTailscale.warn(
|
||||
`${params.tailscaleMode} cleanup failed: ${err instanceof Error ? err.message : String(err)}`,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user