From 8bf3c37c6c78cc83bfe66c65b0d6174105c038d6 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sun, 22 Feb 2026 19:50:34 +0100 Subject: [PATCH] fix(cron): keep watchdog timer armed during ticks --- CHANGELOG.md | 1 + .../service.rearm-timer-when-running.test.ts | 73 ++++++++++++++++++- src/cron/service/timer.ts | 23 ++++-- 3 files changed, 88 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 95a54c4b928..11ff315cd66 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -43,6 +43,7 @@ Docs: https://docs.openclaw.ai - Cron: honor `cron.maxConcurrentRuns` in the timer loop so due jobs can execute up to the configured parallelism instead of always running serially. (#11595) Thanks @Takhoffman. - Cron/Isolation: force fresh session IDs for isolated cron runs so `sessionTarget="isolated"` executions never reuse prior run context. (#23470) Thanks @echoVic. - Cron/Service: execute manual `cron.run` jobs outside the cron lock (while still persisting started/finished state atomically) so `cron.list` and `cron.status` remain responsive during long forced runs. (#23628) Thanks @dsgraves. +- Cron/Timer: keep a watchdog recheck timer armed while `onTimer` is actively executing so the scheduler continues polling even if a due-run tick stalls for an extended period. (#23628) Thanks @dsgraves. - Agents/Compaction: restore embedded compaction safeguard/context-pruning extension loading in production by wiring bundled extension factories into the resource loader instead of runtime file-path resolution. (#22349) Thanks @Glucksberg. - Feishu/Media: for inbound video messages that include both `file_key` (video) and `image_key` (thumbnail), prefer `file_key` when downloading media so video attachments are saved instead of silently failing on thumbnail keys. (#23633) - Hooks/Cron: suppress duplicate main-session events for delivered hook turns and mark `SILENT_REPLY_TOKEN` (`NO_REPLY`) early exits as delivered to prevent hook context pollution. 
(#20678) Thanks @JonathanWorks. diff --git a/src/cron/service.rearm-timer-when-running.test.ts b/src/cron/service.rearm-timer-when-running.test.ts index 6dfb0284a1e..aac531d85f5 100644 --- a/src/cron/service.rearm-timer-when-running.test.ts +++ b/src/cron/service.rearm-timer-when-running.test.ts @@ -1,9 +1,12 @@ +import fs from "node:fs/promises"; +import path from "node:path"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; import { - createCronStoreHarness, createNoopLogger, + createCronStoreHarness, createRunningCronServiceState, } from "./service.test-harness.js"; +import { createCronServiceState } from "./service/state.js"; import { onTimer } from "./service/timer.js"; import type { CronJob } from "./types.js"; @@ -31,6 +34,14 @@ function createDueRecurringJob(params: { }; } +function createDeferred<T>() { + let resolve!: (value: T) => void; + const promise = new Promise<T>((res) => { + resolve = res; + }); + return { promise, resolve }; +} + describe("CronService - timer re-arm when running (#12025)", () => { beforeEach(() => { noopLogger.debug.mockClear(); @@ -81,4 +92,64 @@ describe("CronService - timer re-arm when running (#12025)", () => { timeoutSpy.mockRestore(); await store.cleanup(); }); + + it("arms a watchdog timer while a timer tick is still executing", async () => { + const timeoutSpy = vi.spyOn(globalThis, "setTimeout"); + const store = await makeStorePath(); + const now = Date.parse("2026-02-06T10:05:00.000Z"); + const deferredRun = createDeferred<{ status: "ok"; summary: string }>(); + + await fs.mkdir(path.dirname(store.storePath), { recursive: true }); + await fs.writeFile( + store.storePath, + JSON.stringify( + { + version: 1, + jobs: [ + createDueRecurringJob({ + id: "long-running-job", + nowMs: now, + nextRunAtMs: now, + }), + ], + }, + null, + 2, + ), + "utf-8", + ); + + const state = createCronServiceState({ + storePath: store.storePath, + cronEnabled: true, + log: noopLogger, + nowMs: () => now, + enqueueSystemEvent: 
vi.fn(), + requestHeartbeatNow: vi.fn(), + runIsolatedAgentJob: vi.fn(async () => await deferredRun.promise), + }); + + let settled = false; + const timerPromise = onTimer(state); + void timerPromise.finally(() => { + settled = true; + }); + + await Promise.resolve(); + expect(settled).toBe(false); + expect(state.running).toBe(true); + expect(state.timer).not.toBeNull(); + + const delays = timeoutSpy.mock.calls + .map(([, delay]) => delay) + .filter((d): d is number => typeof d === "number"); + expect(delays).toContain(60_000); + + deferredRun.resolve({ status: "ok", summary: "done" }); + await timerPromise; + expect(state.running).toBe(false); + + timeoutSpy.mockRestore(); + await store.cleanup(); + }); }); diff --git a/src/cron/service/timer.ts b/src/cron/service/timer.ts index 5b334d3a8e0..a99d2acec65 100644 --- a/src/cron/service/timer.ts +++ b/src/cron/service/timer.ts @@ -221,6 +221,17 @@ export function armTimer(state: CronServiceState) { ); } +function armRunningRecheckTimer(state: CronServiceState) { + if (state.timer) { + clearTimeout(state.timer); + } + state.timer = setTimeout(() => { + void onTimer(state).catch((err) => { + state.deps.log.error({ err: String(err) }, "cron: timer tick failed"); + }); + }, MAX_TIMER_DELAY_MS); +} + export async function onTimer(state: CronServiceState) { if (state.running) { // Re-arm the timer so the scheduler keeps ticking even when a job is @@ -233,17 +244,13 @@ export async function onTimer(state: CronServiceState) { // zero-delay hot-loop when past-due jobs are waiting for the current // execution to finish. // See: https://github.com/openclaw/openclaw/issues/12025 - if (state.timer) { - clearTimeout(state.timer); - } - state.timer = setTimeout(() => { - void onTimer(state).catch((err) => { - state.deps.log.error({ err: String(err) }, "cron: timer tick failed"); - }); - }, MAX_TIMER_DELAY_MS); + armRunningRecheckTimer(state); return; } state.running = true; + // Keep a watchdog timer armed while a tick is executing. 
If execution hangs + // (for example in a provider call), the scheduler still wakes to re-check. + armRunningRecheckTimer(state); try { const dueJobs = await locked(state, async () => { await ensureLoaded(state, { forceReload: true, skipRecompute: true });