[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
bug#28211: Stack marking issue in multi-threaded code, 2020 edition
From: |
Ludovic Courtès |
Subject: |
bug#28211: Stack marking issue in multi-threaded code, 2020 edition |
Date: |
Fri, 13 Mar 2020 23:38:01 +0100 |
User-agent: |
Gnus/5.13 (Gnus v5.13) Emacs/26.3 (gnu/linux) |
Hi,
Ludovic Courtès <address@hidden> skribis:
> What if ‘scm_i_vm_mark_stack’ walks the stack right before the ‘vp->fp’
> assignment? It can determine that one of the just-assigned ‘sp[nargs]’
> is a dead slot, and thus set it to SCM_UNSPECIFIED. Later, when we set
> ‘vp->fp’, that stack slot that we just initialized has been overwritten
> by ‘scm_i_vm_mark_stack’. Down the road, we get something like:
>
> Wrong type to apply: #<unspecified>
I believe the patch below solves this problem. Also attached is a small
program that reproduces the bug (without the patch) after a couple of
minutes.
Thoughts?
The full grafting test case exposes another VM stack-corruption-looking
crash that I’m still investigating.
Ludo’.
diff --git a/libguile/vm.c b/libguile/vm.c
index b20c6eb5f..5cc934e99 100644
--- a/libguile/vm.c
+++ b/libguile/vm.c
@@ -1351,7 +1351,8 @@ scm_i_vm_emergency_abort (SCM *tag_and_argv, size_t n)
scm_t_bits *prompt;
scm_t_dynstack_prompt_flags flags;
ptrdiff_t fp_offset, sp_offset;
- union scm_vm_stack_element *fp, *sp;
+ union scm_vm_stack_element *fp;
+ volatile union scm_vm_stack_element *sp;
SCM *argv;
uint32_t *vra;
uint8_t *mra;
@@ -1392,16 +1393,19 @@ scm_i_vm_emergency_abort (SCM *tag_and_argv, size_t n)
to be jumping to an older part of the stack. */
if (sp < vp->sp)
abort ();
- sp[nargs].as_scm = cont;
-
- while (nargs--)
- sp[nargs].as_scm = *argv++;
/* Restore VM regs */
vp->fp = fp;
- vp->sp = sp;
+ vp->sp = (union scm_vm_stack_element *) sp;
vp->ip = vra;
+ /* Restore the arguments on SP. This must be done after 'vp->fp' has
+ been set so that a concurrent 'scm_i_vm_mark_stack' does not
+ overwrite it (see <https://bugs.gnu.org/28211>). */
+ sp[nargs].as_scm = cont;
+ while (nargs--)
+ sp[nargs].as_scm = *argv++;
+
/* Jump! */
vp->mra_after_abort = mra;
longjmp (*registers, 1);
@@ -1417,7 +1421,8 @@ abort_to_prompt (scm_thread *thread, uint8_t *saved_mra)
scm_t_bits *prompt;
scm_t_dynstack_prompt_flags flags;
ptrdiff_t fp_offset, sp_offset;
- union scm_vm_stack_element *fp, *sp;
+ union scm_vm_stack_element *fp, *orig_sp;
+ volatile union scm_vm_stack_element *sp;
uint32_t *vra;
uint8_t *mra;
jmp_buf *registers;
@@ -1452,6 +1457,7 @@ abort_to_prompt (scm_thread *thread, uint8_t *saved_mra)
/* Recompute FP, as scm_dynstack_unwind may have expanded the stack. */
fp = vp->stack_top - fp_offset;
sp = vp->stack_top - sp_offset;
+ orig_sp = vp->sp;
/* Continuation gets nargs+1 values: the one more is for the cont. */
sp = sp - nargs - 1;
@@ -1460,15 +1466,19 @@ abort_to_prompt (scm_thread *thread, uint8_t *saved_mra)
to be jumping to an older part of the stack. */
if (sp < vp->sp)
abort ();
- sp[nargs].as_scm = cont;
- while (nargs--)
- sp[nargs] = vp->sp[nargs];
/* Restore VM regs */
vp->fp = fp;
- vp->sp = sp;
+ vp->sp = (union scm_vm_stack_element *) sp;
vp->ip = vra;
+ /* Restore the arguments on SP. This must be done after 'vp->fp' has
+ been set so that a concurrent 'scm_i_vm_mark_stack' does not
+ overwrite it (see <https://bugs.gnu.org/28211>). */
+ sp[nargs].as_scm = cont;
+ while (nargs--)
+ sp[nargs] = orig_sp[nargs];
+
/* If there are intervening C frames, then jump over them, making a
nonlocal exit. Otherwise fall through and let the VM pick up where
it left off. */
;; https://issues.guix.gnu.org/issue/28211
(use-modules (ice-9 threads))
(define (thunk)
(catch 'foo
(lambda ()
(throw 'foo (iota 10)))
(lambda (key lst)
(unless (equal? lst (iota 10))
(primitive-_exit 42)))))
(n-par-for-each (* 2 (current-processor-count))
(lambda _
(let loop ()
(thunk)
(loop)))
(iota 1000))