fix: job-reaper batch-loop instead of single take:500 to avoid missing jobs
All checks were successful
Deploy API Server / build-and-deploy (push) Successful in 45s

- Replace single findMany(take:500) with cursor-based while loop
- REAP_BATCH_SIZE=500 constant; processes all stuck running + expired jobs
- Prevents missing jobs when >500 are stuck simultaneously
- Update tests: reset mocks before custom chains, explicit call ordering

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
wangdl 2026-06-18 11:45:36 +08:00
parent c0594c518d
commit cb24e5fb96
2 changed files with 98 additions and 64 deletions

View File

@ -6,8 +6,8 @@ describe('JobReaperService', () => {
let mockFindMany: jest.Mock; let mockFindMany: jest.Mock;
beforeEach(() => { beforeEach(() => {
mockUpdateMany = jest.fn(); mockUpdateMany = jest.fn().mockResolvedValue({ count: 0 });
mockFindMany = jest.fn(); mockFindMany = jest.fn().mockResolvedValue([]); // default: empty batch → breaks loop
const mockPrisma = { const mockPrisma = {
aiRuntimeJob: { updateMany: mockUpdateMany, findMany: mockFindMany }, aiRuntimeJob: { updateMany: mockUpdateMany, findMany: mockFindMany },
} as any; } as any;
@ -20,8 +20,7 @@ describe('JobReaperService', () => {
}); });
it('expires locked jobs past lockUntil', async () => { it('expires locked jobs past lockUntil', async () => {
mockUpdateMany.mockResolvedValue({ count: 2 }); mockUpdateMany.mockResolvedValueOnce({ count: 2 }); // locked step
mockFindMany.mockResolvedValue([]);
const result = await service.reap(); const result = await service.reap();
@ -34,12 +33,13 @@ describe('JobReaperService', () => {
it('expires running jobs past timeout', async () => { it('expires running jobs past timeout', async () => {
const now = Date.now(); const now = Date.now();
mockUpdateMany.mockResolvedValueOnce({ count: 0 }); // locked // Running batch 1: has a stuck job
mockFindMany.mockResolvedValueOnce([ mockFindMany.mockResolvedValueOnce([
{ id: 'j1', startedAt: new Date(now - 180_000), timeoutSeconds: 120, retryCount: 0, maxRetryCount: 3 }, { id: 'j1', startedAt: new Date(now - 180_000), timeoutSeconds: 120 },
]); ]);
mockUpdateMany.mockResolvedValueOnce({ count: 1 }); // running expired // Running batch 2: empty → loop breaks (default mock handles this)
mockFindMany.mockResolvedValueOnce([]); // expired jobs mockUpdateMany.mockResolvedValueOnce({ count: 0 }); // locked step
mockUpdateMany.mockResolvedValueOnce({ count: 1 }); // expire running
const result = await service.reap(); const result = await service.reap();
@ -51,14 +51,18 @@ describe('JobReaperService', () => {
}); });
it('retries expired jobs with remaining retries', async () => { it('retries expired jobs with remaining retries', async () => {
mockUpdateMany.mockResolvedValueOnce({ count: 0 }); // locked // Running loop: empty on first call → breaks
mockFindMany.mockResolvedValueOnce([]); // running // Expired batch 1: has 2 expired jobs
mockFindMany.mockResolvedValueOnce([ mockFindMany
.mockResolvedValueOnce([]) // running batch 1 (empty → skip)
.mockResolvedValueOnce([
{ id: 'j1', retryCount: 0, maxRetryCount: 3 }, { id: 'j1', retryCount: 0, maxRetryCount: 3 },
{ id: 'j2', retryCount: 3, maxRetryCount: 3 }, { id: 'j2', retryCount: 3, maxRetryCount: 3 },
]); ]);
mockUpdateMany.mockResolvedValueOnce({ count: 1 }); // retry j1 mockUpdateMany
mockUpdateMany.mockResolvedValueOnce({ count: 1 }); // fail j2 .mockResolvedValueOnce({ count: 0 }) // locked
.mockResolvedValueOnce({ count: 1 }) // retry j1
.mockResolvedValueOnce({ count: 1 }); // fail j2
const result = await service.reap(); const result = await service.reap();
@ -67,11 +71,23 @@ describe('JobReaperService', () => {
}); });
it('handles no stuck jobs gracefully', async () => { it('handles no stuck jobs gracefully', async () => {
mockUpdateMany.mockResolvedValue({ count: 0 });
mockFindMany.mockResolvedValue([]);
const result = await service.reap(); const result = await service.reap();
expect(result).toEqual({ expired: 0, retried: 0, failed: 0 }); expect(result).toEqual({ expired: 0, retried: 0, failed: 0 });
}); });
it('processes running jobs correctly with single batch', async () => {
mockFindMany.mockReset().mockResolvedValue([]);
mockUpdateMany.mockReset().mockResolvedValue({ count: 0 });
const now = Date.now();
// Batch returns fewer than BATCH_SIZE → loop stops naturally
mockFindMany.mockResolvedValueOnce([
{ id: 'j1', startedAt: new Date(now - 180_000), timeoutSeconds: 120 },
{ id: 'j2', startedAt: new Date(now - 180_000), timeoutSeconds: 120 },
]);
mockUpdateMany.mockResolvedValueOnce({ count: 0 }); // locked step
mockUpdateMany.mockResolvedValueOnce({ count: 2 }); // expire running
const result = await service.reap();
expect(result.expired).toBe(2);
});
}); });

View File

@ -2,6 +2,7 @@ import { Injectable, Logger, OnModuleInit, OnModuleDestroy } from '@nestjs/commo
import { PrismaService } from '../../infrastructure/database/prisma.service'; import { PrismaService } from '../../infrastructure/database/prisma.service';
const REAP_INTERVAL_MS = 30_000; // every 30 seconds const REAP_INTERVAL_MS = 30_000; // every 30 seconds
const REAP_BATCH_SIZE = 500;
@Injectable() @Injectable()
export class JobReaperService implements OnModuleInit, OnModuleDestroy { export class JobReaperService implements OnModuleInit, OnModuleDestroy {
@ -30,38 +31,52 @@ export class JobReaperService implements OnModuleInit, OnModuleDestroy {
}); });
// 2. Expire stuck running jobs (startedAt + timeoutSeconds < now) // 2. Expire stuck running jobs (startedAt + timeoutSeconds < now)
// We can't do arithmetic in Prisma where, so fetch IDs and update // We can't do arithmetic in Prisma where, so fetch IDs in batches and filter in memory
const stuckRunning = await this.prisma.aiRuntimeJob.findMany({ let expiredRunning = 0;
let cursor: string | undefined;
let hasMore = true;
while (hasMore) {
const batch = await this.prisma.aiRuntimeJob.findMany({
where: { status: 'running' }, where: { status: 'running' },
select: { id: true, startedAt: true, timeoutSeconds: true, retryCount: true, maxRetryCount: true }, select: { id: true, startedAt: true, timeoutSeconds: true },
take: 500, take: REAP_BATCH_SIZE,
...(cursor ? { cursor: { id: cursor }, skip: 1 } : {}),
orderBy: { id: 'asc' },
}); });
const stuckIds = stuckRunning if (batch.length === 0) { hasMore = false; break; }
const stuckIds = batch
.filter(j => j.startedAt && (now.getTime() - j.startedAt.getTime()) > j.timeoutSeconds * 1000) .filter(j => j.startedAt && (now.getTime() - j.startedAt.getTime()) > j.timeoutSeconds * 1000)
.map(j => j.id); .map(j => j.id);
let expiredRunning = 0;
if (stuckIds.length > 0) { if (stuckIds.length > 0) {
const result = await this.prisma.aiRuntimeJob.updateMany({ const result = await this.prisma.aiRuntimeJob.updateMany({
where: { id: { in: stuckIds }, status: 'running' }, where: { id: { in: stuckIds }, status: 'running' },
data: { status: 'expired' }, data: { status: 'expired' },
}); });
expiredRunning = result.count; expiredRunning += result.count;
}
cursor = batch[batch.length - 1].id;
if (batch.length < REAP_BATCH_SIZE) hasMore = false;
} }
// 3. Retry expired jobs where retryCount < maxRetryCount // 3. Retry expired jobs where retryCount < maxRetryCount
// Prisma doesn't support comparing two columns in where, so fetch and batch
const expiredJobs = await this.prisma.aiRuntimeJob.findMany({
where: { status: 'expired' },
select: { id: true, retryCount: true, maxRetryCount: true },
take: 500,
});
const retryIds = expiredJobs.filter(j => j.retryCount < j.maxRetryCount).map(j => j.id);
const failIds = expiredJobs.filter(j => j.retryCount >= j.maxRetryCount).map(j => j.id);
let retried = 0; let retried = 0;
let failed = 0; let failed = 0;
cursor = undefined;
hasMore = true;
while (hasMore) {
const batch = await this.prisma.aiRuntimeJob.findMany({
where: { status: 'expired' },
select: { id: true, retryCount: true, maxRetryCount: true },
take: REAP_BATCH_SIZE,
...(cursor ? { cursor: { id: cursor }, skip: 1 } : {}),
orderBy: { id: 'asc' },
});
if (batch.length === 0) { hasMore = false; break; }
const retryIds = batch.filter(j => j.retryCount < j.maxRetryCount).map(j => j.id);
const failIds = batch.filter(j => j.retryCount >= j.maxRetryCount).map(j => j.id);
if (retryIds.length > 0) { if (retryIds.length > 0) {
const result = await this.prisma.aiRuntimeJob.updateMany({ const result = await this.prisma.aiRuntimeJob.updateMany({
@ -74,7 +89,7 @@ export class JobReaperService implements OnModuleInit, OnModuleDestroy {
retryCount: { increment: 1 }, retryCount: { increment: 1 },
}, },
}); });
retried = result.count; retried += result.count;
} }
if (failIds.length > 0) { if (failIds.length > 0) {
@ -82,7 +97,10 @@ export class JobReaperService implements OnModuleInit, OnModuleDestroy {
where: { id: { in: failIds }, status: 'expired' }, where: { id: { in: failIds }, status: 'expired' },
data: { status: 'failed', finishedAt: new Date() }, data: { status: 'failed', finishedAt: new Date() },
}); });
failed = result.count; failed += result.count;
}
cursor = batch[batch.length - 1].id;
if (batch.length < REAP_BATCH_SIZE) hasMore = false;
} }
const total = expiredLocks.count + expiredRunning + retried + failed; const total = expiredLocks.count + expiredRunning + retried + failed;