Skip to content

Commit

Permalink
enable task monitor to update fatal errors
Browse files: browse the repository at this point in the history
  • Loading branch information
nicktrn committed Oct 28, 2024
1 parent e0c8fe2 commit e54aa85
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 9 deletions.
10 changes: 6 additions & 4 deletions apps/kubernetes-provider/src/taskMonitor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ export class TaskMonitor {
const containerState = this.#getContainerStateSummary(containerStatus.state);
const exitCode = containerState.exitCode ?? -1;

if (exitCode === EXIT_CODE_ALREADY_HANDLED || exitCode === EXIT_CODE_CHILD_NONZERO) {
if (exitCode === EXIT_CODE_ALREADY_HANDLED) {
this.#logger.debug("Ignoring pod failure, already handled by worker", {
podName,
});
Expand All @@ -160,7 +160,10 @@ export class TaskMonitor {

let reason = rawReason || "Unknown error";
let logs = rawLogs || "";
let overrideCompletion = false;

/** This will only override existing task errors. It will not crash the run. */
let onlyOverrideExistingError = exitCode === EXIT_CODE_CHILD_NONZERO;

let errorCode: TaskRunInternalError["code"] = TaskRunErrorCodes.POD_UNKNOWN_ERROR;

switch (rawReason) {
Expand All @@ -185,7 +188,6 @@ export class TaskMonitor {
}
break;
case "OOMKilled":
overrideCompletion = true;
reason =
"[TaskMonitor] Your task ran out of memory. Try increasing the machine specs. If this doesn't fix it there might be a memory leak.";
errorCode = TaskRunErrorCodes.TASK_PROCESS_OOM_KILLED;
Expand All @@ -198,7 +200,7 @@ export class TaskMonitor {
exitCode,
reason,
logs,
overrideCompletion,
overrideCompletion: onlyOverrideExistingError,
errorCode,
} satisfies FailureDetails;

Expand Down
13 changes: 8 additions & 5 deletions apps/webapp/app/v3/handleSocketIo.server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import { Redis } from "ioredis";
import { createAdapter } from "@socket.io/redis-adapter";
import { CrashTaskRunService } from "./services/crashTaskRun.server";
import { CreateTaskRunAttemptService } from "./services/createTaskRunAttempt.server";
import { UpdateFatalRunErrorService } from "./services/updateFatalRunError.server";

export const socketIo = singleton("socketIo", initalizeIoServer);

Expand Down Expand Up @@ -302,11 +303,13 @@ function createProviderNamespace(io: Server) {
handlers: {
WORKER_CRASHED: async (message) => {
try {
const service = new CrashTaskRunService();

await service.call(message.runId, {
...message,
});
if (message.overrideCompletion) {
const updateErrorService = new UpdateFatalRunErrorService();
await updateErrorService.call(message.runId, { ...message });
} else {
const crashRunService = new CrashTaskRunService();
await crashRunService.call(message.runId, { ...message });
}
} catch (error) {
logger.error("Error while handling crashed worker", { error });
}
Expand Down
5 changes: 5 additions & 0 deletions apps/webapp/app/v3/services/crashTaskRun.server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@ export class CrashTaskRunService extends BaseService {

logger.debug("CrashTaskRunService.call", { runId, opts });

if (options?.overrideCompletion) {
logger.error("CrashTaskRunService.call: overrideCompletion is deprecated", { runId });
return;
}

const taskRun = await this._prisma.taskRun.findFirst({
where: {
id: runId,
Expand Down
1 change: 1 addition & 0 deletions packages/core/src/v3/schemas/messages.ts
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,7 @@ export const ProviderToPlatformMessages = {
exitCode: z.number().optional(),
message: z.string().optional(),
logs: z.string().optional(),
/** When true, only update the error on a run that already has one; the run itself is not crashed. */
overrideCompletion: z.boolean().optional(),
errorCode: TaskRunInternalError.shape.code.optional(),
}),
Expand Down

0 comments on commit e54aa85

Please sign in to comment.