fix(cron): preserve reload repairs in manual run and apply paths

This commit is contained in:
create 2026-03-20 16:25:26 +08:00
parent a42cd9473c
commit c3338cdb9f
5 changed files with 219 additions and 56 deletions

View File

@ -2,6 +2,7 @@ import fs from "node:fs/promises";
import { describe, expect, it, vi } from "vitest";
import { setupCronServiceSuite, writeCronStoreSnapshot } from "./service.test-harness.js";
import { recomputeNextRuns, recomputeNextRunsForMaintenance } from "./service/jobs.js";
import { run } from "./service/ops.js";
import { createCronServiceState } from "./service/state.js";
import { ensureLoaded } from "./service/store.js";
import type { CronJob } from "./types.js";
@ -302,4 +303,97 @@ describe("forceReload repairs externally changed cron schedules", () => {
recomputeNextRunsForMaintenance(state);
expect(state.store?.jobs[0]?.state.scheduleErrorCount).toBe(2);
});
it("keeps forceReload repairs when manual-run snapshot is merged back", async () => {
const store = await makeStorePath();
let nowMs = Date.parse("2026-03-19T01:44:00.000Z");
const jobId = "manual-run-reload-merge";
const staleNextRunAtMs = Date.parse("2026-03-19T23:30:00.000Z");
const createJob = (params: {
expr: string;
enabled: boolean;
nextRunAtMs?: number;
lastStatus?: CronJob["state"]["lastStatus"];
}): CronJob => ({
id: jobId,
name: "manual run reload merge",
enabled: params.enabled,
createdAtMs: Date.parse("2026-03-18T00:30:00.000Z"),
updatedAtMs: Date.parse("2026-03-19T01:44:00.000Z"),
schedule: { kind: "cron", expr: params.expr, tz: "Asia/Shanghai", staggerMs: 0 },
sessionTarget: "isolated",
wakeMode: "next-heartbeat",
payload: { kind: "agentTurn", message: "tick" },
state: {
nextRunAtMs: params.nextRunAtMs,
lastStatus: params.lastStatus,
},
});
await writeCronStoreSnapshot({
storePath: store.storePath,
jobs: [
createJob({
expr: "30 23 * * *",
enabled: true,
nextRunAtMs: staleNextRunAtMs,
}),
],
});
const runIsolatedAgentJob = vi.fn(async () => {
await writeCronStoreSnapshot({
storePath: store.storePath,
jobs: [
createJob({
expr: "30 8 * * *",
enabled: false,
nextRunAtMs: staleNextRunAtMs,
lastStatus: "error",
}),
],
});
nowMs += 500;
return { status: "ok" as const, summary: "done" };
});
const state = createCronServiceState({
cronEnabled: true,
storePath: store.storePath,
log: noopLogger,
nowMs: () => nowMs,
enqueueSystemEvent: vi.fn(),
requestHeartbeatNow: vi.fn(),
runIsolatedAgentJob,
});
const result = await run(state, jobId, "force");
expect(result).toEqual({ ok: true, ran: true });
expect(runIsolatedAgentJob).toHaveBeenCalledTimes(1);
const merged = state.store?.jobs.find((job) => job.id === jobId);
expect(merged?.schedule).toEqual({
kind: "cron",
expr: "30 8 * * *",
tz: "Asia/Shanghai",
staggerMs: 0,
});
expect(merged?.enabled).toBe(false);
expect(merged?.state.nextRunAtMs).toBeUndefined();
expect(merged?.state.lastStatus).toBe("ok");
expect(merged?.state.lastRunAtMs).toBeDefined();
const persisted = JSON.parse(await fs.readFile(store.storePath, "utf8")) as {
jobs?: Array<{
id: string;
enabled?: boolean;
state?: { nextRunAtMs?: number; lastStatus?: string };
}>;
};
const persistedJob = persisted.jobs?.find((job) => job.id === jobId);
expect(persistedJob?.enabled).toBe(false);
expect(persistedJob?.state?.nextRunAtMs).toBeUndefined();
expect(persistedJob?.state?.lastStatus).toBe("ok");
});
});

View File

@ -22,7 +22,7 @@ import {
createNoopLogger,
createRunningCronServiceState,
} from "./service.test-harness.js";
import { computeJobNextRunAtMs } from "./service/jobs.js";
import { computeJobNextRunAtMs, recomputeNextRunsForMaintenance } from "./service/jobs.js";
import { enqueueRun, run } from "./service/ops.js";
import { createCronServiceState, type CronEvent } from "./service/state.js";
import {
@ -1748,6 +1748,48 @@ describe("Cron issue regressions", () => {
expect(job.enabled).toBe(true);
});
it("does not double-count reload schedule errors in apply path before maintenance recompute", () => {
const startedAt = Date.parse("2026-03-02T12:10:00.000Z");
const endedAt = startedAt + 25;
const job = createIsolatedRegressionJob({
id: "apply-result-reload-dedupe-30905",
name: "apply-result-reload-dedupe-30905",
scheduledAt: startedAt,
schedule: { kind: "cron", expr: "0 7 * * *", tz: "Invalid/Timezone" },
payload: { kind: "agentTurn", message: "ping" },
state: {
nextRunAtMs: undefined,
runningAtMs: startedAt - 500,
scheduleErrorCount: 1,
lastError: "schedule error: previous",
},
});
const state = createRunningCronServiceState({
storePath: "/tmp/cron-30905-reload-dedupe.json",
log: noopLogger as never,
nowMs: () => endedAt,
jobs: [job],
});
state.skipNextReloadRepairRecomputeJobIds = new Set([job.id]);
applyJobResult(state, job, {
status: "ok",
delivered: true,
startedAt,
endedAt,
});
expect(job.state.scheduleErrorCount).toBe(1);
expect(state.skipNextReloadRepairRecomputeJobIds?.has(job.id)).toBe(true);
recomputeNextRunsForMaintenance(state);
expect(job.state.scheduleErrorCount).toBe(1);
expect(state.skipNextReloadRepairRecomputeJobIds?.has(job.id)).toBe(false);
recomputeNextRunsForMaintenance(state);
expect(job.state.scheduleErrorCount).toBe(2);
});
it("force run preserves 'every' anchor while recording manual lastRunAtMs", () => {
const nowMs = Date.now();
const everyMs = 24 * 60 * 60 * 1_000;

View File

@ -413,12 +413,19 @@ function walkSchedulableJobs(
return changed;
}
function consumeSkipNextReloadRepairRecompute(state: CronServiceState, jobId: string): boolean {
export function hasSkipNextReloadRepairRecompute(state: CronServiceState, jobId: string): boolean {
const pending = state.skipNextReloadRepairRecomputeJobIds;
if (!pending?.has(jobId)) {
return pending?.has(jobId) === true;
}
export function consumeSkipNextReloadRepairRecompute(
state: CronServiceState,
jobId: string,
): boolean {
if (!hasSkipNextReloadRepairRecompute(state, jobId)) {
return false;
}
pending.delete(jobId);
state.skipNextReloadRepairRecomputeJobIds?.delete(jobId);
return true;
}

View File

@ -51,7 +51,6 @@ function mergeManualRunSnapshotAfterReload(params: {
state: CronServiceState;
jobId: string;
snapshot: {
enabled: boolean;
updatedAtMs: number;
state: CronJob["state"];
} | null;
@ -71,9 +70,25 @@ function mergeManualRunSnapshotAfterReload(params: {
if (!reloaded) {
return;
}
reloaded.enabled = params.snapshot.enabled;
reloaded.updatedAtMs = params.snapshot.updatedAtMs;
reloaded.state = params.snapshot.state;
const preservedEnabled = reloaded.enabled;
const preservedNextRunAtMs = reloaded.state.nextRunAtMs;
const preservedScheduleErrorCount = reloaded.state.scheduleErrorCount;
const preservedScheduleErrorText = reloaded.state.lastError;
reloaded.updatedAtMs = Math.max(reloaded.updatedAtMs, params.snapshot.updatedAtMs);
reloaded.state = {
...reloaded.state,
...params.snapshot.state,
};
// Keep externally reloaded schedule/enable repairs even when a manual run
// snapshot was captured before forceReload.
reloaded.enabled = preservedEnabled;
reloaded.state.nextRunAtMs = preservedNextRunAtMs;
if (preservedScheduleErrorCount !== undefined) {
reloaded.state.scheduleErrorCount = preservedScheduleErrorCount;
reloaded.state.lastError = preservedScheduleErrorText;
}
}
async function ensureLoadedForRead(state: CronServiceState) {
@ -519,7 +534,6 @@ async function finishPreparedManualRun(
const postRunSnapshot = shouldDelete
? null
: {
enabled: job.enabled,
updatedAtMs: job.updatedAtMs,
state: structuredClone(job.state),
};

View File

@ -15,6 +15,7 @@ import type {
import {
computeJobPreviousRunAtMs,
computeJobNextRunAtMs,
hasSkipNextReloadRepairRecompute,
nextWakeAtMs,
recomputeNextRunsForMaintenance,
recordScheduleComputeError,
@ -368,6 +369,7 @@ export function applyJobResult(
const shouldDelete =
job.schedule.kind === "at" && job.deleteAfterRun === true && result.status === "ok";
const skipImmediateScheduleRecompute = hasSkipNextReloadRepairRecompute(state, job.id);
if (!shouldDelete) {
if (job.schedule.kind === "at") {
@ -416,54 +418,58 @@ export function applyJobResult(
} else if (result.status === "error" && job.enabled) {
// Apply exponential backoff for errored jobs to prevent retry storms.
const backoff = errorBackoffMs(job.state.consecutiveErrors ?? 1);
let normalNext: number | undefined;
try {
normalNext =
opts?.preserveSchedule && job.schedule.kind === "every"
? computeNextWithPreservedLastRun(result.endedAt)
: computeJobNextRunAtMs(job, result.endedAt);
} catch (err) {
// If the schedule expression/timezone throws (croner edge cases),
// record the schedule error (auto-disables after repeated failures)
// and fall back to backoff-only schedule so the state update is not lost.
recordScheduleComputeError({ state, job, err });
}
const backoffNext = result.endedAt + backoff;
// Use whichever is later: the natural next run or the backoff delay.
job.state.nextRunAtMs =
normalNext !== undefined ? Math.max(normalNext, backoffNext) : backoffNext;
state.deps.log.info(
{
jobId: job.id,
consecutiveErrors: job.state.consecutiveErrors,
backoffMs: backoff,
nextRunAtMs: job.state.nextRunAtMs,
},
"cron: applying error backoff",
);
} else if (job.enabled) {
let naturalNext: number | undefined;
try {
naturalNext =
opts?.preserveSchedule && job.schedule.kind === "every"
? computeNextWithPreservedLastRun(result.endedAt)
: computeJobNextRunAtMs(job, result.endedAt);
} catch (err) {
// If the schedule expression/timezone throws (croner edge cases),
// record the schedule error (auto-disables after repeated failures)
// so a persistent throw doesn't cause a MIN_REFIRE_GAP_MS hot loop.
recordScheduleComputeError({ state, job, err });
}
if (job.schedule.kind === "cron") {
// Safety net: ensure the next fire is at least MIN_REFIRE_GAP_MS
// after the current run ended. Prevents spin-loops when the
// schedule computation lands in the same second due to
// timezone/croner edge cases (see #17821).
const minNext = result.endedAt + MIN_REFIRE_GAP_MS;
if (!skipImmediateScheduleRecompute) {
let normalNext: number | undefined;
try {
normalNext =
opts?.preserveSchedule && job.schedule.kind === "every"
? computeNextWithPreservedLastRun(result.endedAt)
: computeJobNextRunAtMs(job, result.endedAt);
} catch (err) {
// If the schedule expression/timezone throws (croner edge cases),
// record the schedule error (auto-disables after repeated failures)
// and fall back to backoff-only schedule so the state update is not lost.
recordScheduleComputeError({ state, job, err });
}
const backoffNext = result.endedAt + backoff;
// Use whichever is later: the natural next run or the backoff delay.
job.state.nextRunAtMs =
naturalNext !== undefined ? Math.max(naturalNext, minNext) : minNext;
} else {
job.state.nextRunAtMs = naturalNext;
normalNext !== undefined ? Math.max(normalNext, backoffNext) : backoffNext;
state.deps.log.info(
{
jobId: job.id,
consecutiveErrors: job.state.consecutiveErrors,
backoffMs: backoff,
nextRunAtMs: job.state.nextRunAtMs,
},
"cron: applying error backoff",
);
}
} else if (job.enabled) {
if (!skipImmediateScheduleRecompute) {
let naturalNext: number | undefined;
try {
naturalNext =
opts?.preserveSchedule && job.schedule.kind === "every"
? computeNextWithPreservedLastRun(result.endedAt)
: computeJobNextRunAtMs(job, result.endedAt);
} catch (err) {
// If the schedule expression/timezone throws (croner edge cases),
// record the schedule error (auto-disables after repeated failures)
// so a persistent throw doesn't cause a MIN_REFIRE_GAP_MS hot loop.
recordScheduleComputeError({ state, job, err });
}
if (job.schedule.kind === "cron") {
// Safety net: ensure the next fire is at least MIN_REFIRE_GAP_MS
// after the current run ended. Prevents spin-loops when the
// schedule computation lands in the same second due to
// timezone/croner edge cases (see #17821).
const minNext = result.endedAt + MIN_REFIRE_GAP_MS;
job.state.nextRunAtMs =
naturalNext !== undefined ? Math.max(naturalNext, minNext) : minNext;
} else {
job.state.nextRunAtMs = naturalNext;
}
}
} else {
job.state.nextRunAtMs = undefined;