Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

cuda: prevent task lockup on timeout error #2547

Open
wants to merge 2 commits into
base: criu-dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions criu/cr-dump.c
Original file line number Diff line number Diff line change
Expand Up @@ -1963,6 +1963,9 @@ int cr_pre_dump_tasks(pid_t pid)
if (collect_pstree())
goto err;

if (checkpoint_devices())
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand why checkpoint_devices() should be called on pre-dump. How is it supposed to work?

goto err;

if (collect_pstree_ids_predump())
goto err;

Expand Down Expand Up @@ -2192,6 +2195,9 @@ int cr_dump_tasks(pid_t pid)
if (collect_pstree())
goto err;

if (checkpoint_devices())
goto err;

if (collect_pstree_ids())
goto err;

Expand Down
1 change: 1 addition & 0 deletions criu/include/seize.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#define __CR_SEIZE_H__

extern int collect_pstree(void);
extern int checkpoint_devices(void);
struct pstree_item;
extern void pstree_switch_state(struct pstree_item *root_item, int st);
extern const char *get_real_freezer_state(void);
Expand Down
23 changes: 16 additions & 7 deletions criu/seize.c
Original file line number Diff line number Diff line change
Expand Up @@ -1017,7 +1017,6 @@ int collect_pstree(void)
pid_t pid = root_item->pid->real;
int ret, exit_code = -1;
struct proc_status_creds creds;
struct pstree_item *iter;

timing_start(TIME_FREEZING);

Expand Down Expand Up @@ -1078,6 +1077,21 @@ int collect_pstree(void)
goto err;
}

exit_code = 0;
timing_stop(TIME_FREEZING);
timing_start(TIME_FROZEN);

err:
/* Freezing stage finished in time - disable timer. */
alarm(0);
return exit_code;
}

int checkpoint_devices(void)
{
struct pstree_item *iter;
int ret, exit_code = -1;

for_each_pstree_item(iter) {
if (!task_alive(iter))
continue;
Expand All @@ -1087,11 +1101,6 @@ int collect_pstree(void)
}

exit_code = 0;
timing_stop(TIME_FREEZING);
timing_start(TIME_FROZEN);

err:
/* Freezing stage finished in time - disable timer. */
alarm(0);
return exit_code;
}
}
4 changes: 2 additions & 2 deletions plugins/cuda/cuda_plugin.c
Original file line number Diff line number Diff line change
Expand Up @@ -391,14 +391,14 @@ int cuda_plugin_checkpoint_devices(int pid)
if (resume_restore_thread(restore_tid, &save_sigset)) {
return -1;
}

task_info->checkpointed = 1;
status = cuda_process_checkpoint_action(pid, ACTION_CHECKPOINT, 0, msg_buf, sizeof(msg_buf));
if (status) {
pr_err("CHECKPOINT_DEVICES failed with %s\n", msg_buf);
goto interrupt;
}

task_info->checkpointed = 1;

interrupt:
int_ret = interrupt_restore_thread(restore_tid, &save_sigset);

Expand Down
Loading