ghc-9.2.6 segfaults in initCapability()
Summary
(gdb) bt
#0 0x00007fe955f81bbf in initCapability (cap=0xe569d0, i=2) at rts/Capability.c:275
#1 0x00007fe955f820ac in moreCapabilities (from=1, to=to@entry=8) at rts/Capability.c:444
#2 0x00007fe955f97476 in setNumCapabilities (new_n_capabilities=8) at rts/Schedule.c:2308
#3 0x00007fe959a7f283 in ?? () from /usr/lib64/ghc-9.2.6/bin/../ghc-9.2.6/libHSghc-9.2.6-ghc9.2.6.so
#4 0x00000042104158b0 in ?? ()
#5 0x0000000000000000 in ?? ()
70999283 looks sus, 9.2.5 behaves well. @bgamari
Steps to reproduce
Gentoo's build system does it like that:
/usr/bin/ghc -hide-all-packages -package Cabal -package base -package binary -package bytestring -package containers -package deepseq -package directory -package exceptions -package filepath -package haskeline -package mtl -package parsec -package pretty -package process -package stm -package template-haskell -package terminfo -package text -package time -package transformers -package unix -package xhtml --make /var/tmp/portage/dev-haskell/text-1.2.5.0/work/text-1.2.5.0/Setup.lhs -j9 +RTS -A256M -qb0 -RTS -threaded -dynamic -o setup
Environment
- GHC version used: 9.2.6
Optional:
- Operating System: Gentoo GNU/Linux
- System Architecture: amd64
- Show closed items
Is blocked by
Activity
-
Newest first Oldest first
-
Show all activity Show comments only Show history only
- L29Ah changed the description
Compare with previous version changed the description
- Contributor
FWIW, I tried a similar command line on my OpenBSD system (long shot, I know) and was unable to trigger any misbehavior. Can you figure out the reason for SIGSEGV? Is it clear which pointer is corrupted?
Collapse replies Thread 1 "ghc" received signal SIGSEGV, Segmentation fault. 0x00007ffff14fcbbf in initCapability (cap=0x59e3b0, i=1) at rts/Capability.c:275 275 cap->spark_stats.created = 0; (gdb) p cap->spark_stats.created $1 = 0 (gdb) set cap->spark_stats.created = 1 (gdb) p cap->spark_stats.created $2 = 1
what
- Contributor
It might be more revealing to look at disassembly and the registers.
Thread 1 "ghc" received signal SIGSEGV, Segmentation fault. 0x00007ffff14fcbbf in initCapability (cap=0x5ad370, i=2) at rts/Capability.c:275 275 cap->spark_stats.created = 0; (gdb) disassemble Dump of assembler code for function initCapability: 0x00007ffff14fca50 <+0>: push %rbp 0x00007ffff14fca51 <+1>: xor %edx,%edx 0x00007ffff14fca53 <+3>: mov %rsp,%rbp 0x00007ffff14fca56 <+6>: push %r14 0x00007ffff14fca58 <+8>: lea 0x5260d(%rip),%r14 # 0x7ffff154f06c 0x00007ffff14fca5f <+15>: push %r13 0x00007ffff14fca61 <+17>: mov %esi,%r13d 0x00007ffff14fca64 <+20>: mov %r13d,%eax 0x00007ffff14fca67 <+23>: push %r12 0x00007ffff14fca69 <+25>: push %r10 0x00007ffff14fca6b <+27>: push %rbx 0x00007ffff14fca6c <+28>: mov %rdi,%rbx 0x00007ffff14fca6f <+31>: add $0x4c8,%rdi 0x00007ffff14fca76 <+38>: sub $0x28,%rsp 0x00007ffff14fca7a <+42>: divl 0x7a920(%rip) # 0x7ffff15773a0 <n_numa_nodes> 0x00007ffff14fca80 <+48>: vmovq 0x703d8(%rip),%xmm3 # 0x7ffff156ce60 0x00007ffff14fca88 <+56>: vmovq 0x703a8(%rip),%xmm4 # 0x7ffff156ce38 0x00007ffff14fca90 <+64>: mov %r13d,-0x118(%rdi) 0x00007ffff14fca97 <+71>: movb $0x0,-0x108(%rdi) --Type <RET> for more, q to quit, c to continue without paging--c 0x00007ffff14fca9e <+78>: movl $0x0,-0x104(%rdi) 0x00007ffff14fcaa8 <+88>: movb $0x0,-0x100(%rdi) 0x00007ffff14fcaaf <+95>: movl $0x0,-0xe8(%rdi) 0x00007ffff14fcab9 <+105>: vmovddup 0x701df(%rip),%xmm2 # 0x7ffff156cca0 0x00007ffff14fcac1 <+113>: vpinsrq $0x1,0x7031d(%rip),%xmm3,%xmm0 # 0x7ffff156cde8 0x00007ffff14fcacb <+123>: vpinsrq $0x1,0x70243(%rip),%xmm4,%xmm1 # 0x7ffff156cd18 0x00007ffff14fcad5 <+133>: vmovdqa %xmm2,-0xf8(%rdi) 0x00007ffff14fcadd <+141>: vmovdqa %xmm0,-0x50(%rbp) 0x00007ffff14fcae2 <+146>: vmovdqa %xmm1,-0x40(%rbp) 0x00007ffff14fcae7 <+151>: mov %edx,-0x114(%rdi) 0x00007ffff14fcaed <+157>: call 0x7ffff153e2d0 <initMutex> 0x00007ffff14fcaf2 <+162>: lea 0x754ff(%rip),%rax # 0x7ffff1571ff8 0x00007ffff14fcaf9 <+169>: movq $0x0,0x3b8(%rbx) 0x00007ffff14fcb04 <+180>: movq 
$0x0,0x4b8(%rbx) 0x00007ffff14fcb0f <+191>: movl $0x0,0x4c0(%rbx) 0x00007ffff14fcb19 <+201>: movq $0x0,0x3e8(%rbx) 0x00007ffff14fcb24 <+212>: movl $0x0,0x3f0(%rbx) 0x00007ffff14fcb2e <+222>: movq $0x0,0x4f0(%rbx) 0x00007ffff14fcb39 <+233>: movq $0x0,0x4f8(%rbx) 0x00007ffff14fcb44 <+244>: movl $0x0,0x500(%rbx) 0x00007ffff14fcb4e <+254>: mov %rax,0x508(%rbx) 0x00007ffff14fcb55 <+261>: movq $0x0,0x510(%rbx) 0x00007ffff14fcb60 <+272>: call 0x7ffff1513400 <allocSparkPool> 0x00007ffff14fcb65 <+277>: mov %rax,0x518(%rbx) 0x00007ffff14fcb6c <+284>: lea 0x49fb5(%rip),%rax # 0x7ffff1546b28 <__stg_EAGER_BLACKHOLE_info> 0x00007ffff14fcb73 <+291>: vmovdqa -0x40(%rbp),%xmm1 0x00007ffff14fcb78 <+296>: mov %rax,(%rbx) 0x00007ffff14fcb7b <+299>: lea 0x7b23e(%rip),%r12 # 0x7ffff1577dc0 <RtsFlags> 0x00007ffff14fcb82 <+306>: vmovdqu %xmm1,0x8(%rbx) 0x00007ffff14fcb87 <+311>: mov 0x54(%r12),%edi 0x00007ffff14fcb8c <+316>: movq $0x0,0x540(%rbx) 0x00007ffff14fcb97 <+327>: movq $0x0,0x548(%rbx) 0x00007ffff14fcba2 <+338>: movl $0xffffffff,0x550(%rbx) 0x00007ffff14fcbac <+348>: movq $0x0,0x4b0(%rbx) 0x00007ffff14fcbb7 <+359>: vpxor %xmm2,%xmm2,%xmm2 0x00007ffff14fcbbb <+363>: shl $0x3,%rdi => 0x00007ffff14fcbbf <+367>: vmovdqa %ymm2,0x520(%rbx) 0x00007ffff14fcbc7 <+375>: mov %r14,%rsi 0x00007ffff14fcbca <+378>: vzeroupper 0x00007ffff14fcbcd <+381>: call 0x7ffff150e0e0 <stgMallocBytes> 0x00007ffff14fcbd2 <+386>: mov %rax,0x3f8(%rbx) 0x00007ffff14fcbd9 <+393>: mov 0x54(%r12),%edi 0x00007ffff14fcbde <+398>: mov %r14,%rsi 0x00007ffff14fcbe1 <+401>: shl $0x3,%rdi 0x00007ffff14fcbe5 <+405>: call 0x7ffff150e0e0 <stgMallocBytes> 0x00007ffff14fcbea <+410>: mov 0x54(%r12),%edi 0x00007ffff14fcbef <+415>: xor %edx,%edx 0x00007ffff14fcbf1 <+417>: test %edi,%edi 0x00007ffff14fcbf3 <+419>: mov %rax,0x400(%rbx) 0x00007ffff14fcbfa <+426>: movq $0x0,0x478(%rbx) 0x00007ffff14fcc05 <+437>: movq $0x0,0x408(%rbx) 0x00007ffff14fcc10 <+448>: vmovdqa -0x50(%rbp),%xmm0 0x00007ffff14fcc15 <+453>: je 0x7ffff14fcc3a 
<initCapability+490> 0x00007ffff14fcc17 <+455>: nopw 0x0(%rax,%rax,1) 0x00007ffff14fcc20 <+464>: mov 0x3f8(%rbx),%rcx 0x00007ffff14fcc27 <+471>: mov %edx,%esi 0x00007ffff14fcc29 <+473>: movq $0x0,(%rcx,%rsi,8) 0x00007ffff14fcc31 <+481>: inc %edx 0x00007ffff14fcc33 <+483>: cmp 0x54(%r12),%edx 0x00007ffff14fcc38 <+488>: jb 0x7ffff14fcc20 <initCapability+464> 0x00007ffff14fcc3a <+490>: lea 0x753af(%rip),%rax # 0x7ffff1571ff0 0x00007ffff14fcc41 <+497>: mov 0x7bfa1(%rip),%esi # 0x7ffff1578be8 <TRACE_cap> 0x00007ffff14fcc47 <+503>: vmovdqu %xmm0,0x558(%rbx) 0x00007ffff14fcc4f <+511>: vpxor %xmm0,%xmm0,%xmm0 0x00007ffff14fcc53 <+515>: mov %rax,0x568(%rbx) 0x00007ffff14fcc5a <+522>: movl $0x0,0x570(%rbx) 0x00007ffff14fcc64 <+532>: movq $0x0,0x4a0(%rbx) 0x00007ffff14fcc6f <+543>: movq $0x0,0x4a8(%rbx) 0x00007ffff14fcc7a <+554>: movq $0x0,0x378(%rbx) 0x00007ffff14fcc85 <+565>: movq $0x0,0x380(%rbx) 0x00007ffff14fcc90 <+576>: vmovdqa %ymm0,0x480(%rbx) 0x00007ffff14fcc98 <+584>: test %esi,%esi 0x00007ffff14fcc9a <+586>: jne 0x7ffff14fccc0 <initCapability+624> 0x00007ffff14fcc9c <+588>: vzeroupper 0x00007ffff14fcc9f <+591>: mov 0x7bf4f(%rip),%eax # 0x7ffff1578bf4 <TRACE_spark_sampled> 0x00007ffff14fcca5 <+597>: test %eax,%eax 0x00007ffff14fcca7 <+599>: jne 0x7ffff14fcd0f <initCapability+703> 0x00007ffff14fcca9 <+601>: lea -0x28(%rbp),%rsp 0x00007ffff14fccad <+605>: pop %rbx 0x00007ffff14fccae <+606>: pop %r10 0x00007ffff14fccb0 <+608>: pop %r12 0x00007ffff14fccb2 <+610>: pop %r13 0x00007ffff14fccb4 <+612>: pop %r14 0x00007ffff14fccb6 <+614>: pop %rbp 0x00007ffff14fccb7 <+615>: ret 0x00007ffff14fccb8 <+616>: nopl 0x0(%rax,%rax,1) 0x00007ffff14fccc0 <+624>: mov $0x2d,%esi 0x00007ffff14fccc5 <+629>: mov %rbx,%rdi 0x00007ffff14fccc8 <+632>: vzeroupper 0x00007ffff14fcccb <+635>: call 0x7ffff151a000 <traceCapEvent_> 0x00007ffff14fccd0 <+640>: mov 0x7bf12(%rip),%ecx # 0x7ffff1578be8 <TRACE_cap> 0x00007ffff14fccd6 <+646>: test %ecx,%ecx 0x00007ffff14fccd8 <+648>: je 0x7ffff14fcc9f 
<initCapability+591> 0x00007ffff14fccda <+650>: mov %r13,%rdx 0x00007ffff14fccdd <+653>: xor %esi,%esi 0x00007ffff14fccdf <+655>: mov $0x1b,%edi 0x00007ffff14fcce4 <+660>: call 0x7ffff151a030 <traceCapsetEvent_> 0x00007ffff14fcce9 <+665>: mov 0x7bef9(%rip),%edx # 0x7ffff1578be8 <TRACE_cap> 0x00007ffff14fccef <+671>: test %edx,%edx 0x00007ffff14fccf1 <+673>: je 0x7ffff14fcc9f <initCapability+591> 0x00007ffff14fccf3 <+675>: mov %r13,%rdx 0x00007ffff14fccf6 <+678>: mov $0x1,%esi 0x00007ffff14fccfb <+683>: mov $0x1b,%edi 0x00007ffff14fcd00 <+688>: call 0x7ffff151a030 <traceCapsetEvent_> 0x00007ffff14fcd05 <+693>: mov 0x7bee9(%rip),%eax # 0x7ffff1578bf4 <TRACE_spark_sampled> 0x00007ffff14fcd0b <+699>: test %eax,%eax 0x00007ffff14fcd0d <+701>: je 0x7ffff14fcca9 <initCapability+601> 0x00007ffff14fcd0f <+703>: mov 0x518(%rbx),%rax 0x00007ffff14fcd16 <+710>: mov %rbx,%rdi 0x00007ffff14fcd19 <+713>: mov 0x10(%rax),%rdx 0x00007ffff14fcd1d <+717>: mov 0x18(%rax),%rsi 0x00007ffff14fcd21 <+721>: sub %rdx,%rsi 0x00007ffff14fcd24 <+724>: mov $0x0,%eax 0x00007ffff14fcd29 <+729>: cmovs %rax,%rsi 0x00007ffff14fcd2d <+733>: sub $0x30,%rsp 0x00007ffff14fcd31 <+737>: vmovdqa 0x520(%rbx),%xmm5 0x00007ffff14fcd39 <+745>: vmovdqu %xmm5,(%rsp) 0x00007ffff14fcd3e <+750>: vmovdqa 0x530(%rbx),%xmm6 0x00007ffff14fcd46 <+758>: vmovdqu %xmm6,0x10(%rsp) 0x00007ffff14fcd4c <+764>: vmovdqa 0x540(%rbx),%xmm7 0x00007ffff14fcd54 <+772>: vmovdqu %xmm7,0x20(%rsp) 0x00007ffff14fcd5a <+778>: call 0x7ffff151a180 <traceSparkCounters_> 0x00007ffff14fcd5f <+783>: add $0x30,%rsp 0x00007ffff14fcd63 <+787>: lea -0x28(%rbp),%rsp 0x00007ffff14fcd67 <+791>: pop %rbx 0x00007ffff14fcd68 <+792>: pop %r10 0x00007ffff14fcd6a <+794>: pop %r12 0x00007ffff14fcd6c <+796>: pop %r13 0x00007ffff14fcd6e <+798>: pop %r14 0x00007ffff14fcd70 <+800>: pop %rbp 0x00007ffff14fcd71 <+801>: ret End of assembler dump. 
(gdb) info registers rax 0x7ffff1546b28 140737242229544 rbx 0x5ad370 5952368 rcx 0x7ffff03f6aa0 140737224075936 rdx 0x0 0 rsi 0x10000 65536 rdi 0x10 16 rbp 0x7fffffff97a0 0x7fffffff97a0 rsp 0x7fffffff9750 0x7fffffff9750 r8 0x306e1 198369 r9 0x0 0 r10 0x0 0 r11 0x5b5000 5984256 r12 0x7ffff1577dc0 140737242430912 r13 0x2 2 r14 0x7ffff154f06c 140737242263660 r15 0x1 1 rip 0x7ffff14fcbbf 0x7ffff14fcbbf <initCapability+367> eflags 0x10202 [ IF RF ] cs 0x33 51 ss 0x2b 43 ds 0x0 0 es 0x0 0 fs 0x0 0 gs 0x0 0 (gdb) p $ymm2 $1 = {v16_bfloat16 = {0 <repeats 16 times>}, v16_half = {0 <repeats 16 times>}, v8_float = {0, 0, 0, 0, 0, 0, 0, 0}, v4_double = {0, 0, 0, 0}, v32_int8 = {0 <repeats 32 times>}, v16_int16 = {0 <repeats 16 times>}, v8_int32 = {0, 0, 0, 0, 0, 0, 0, 0}, v4_int64 = {0, 0, 0, 0}, v2_int128 = {0, 0}} (gdb) p ((int *)$rbx)[0x520] $2 = 0
I can't seem to reproduce this, using the GHC 9.2.6 binary from https://downloads.haskell.org/ghc/9.2.6/ghc-9.2.6-x86_64-fedora27-linux.tar.xz:
I used the following command in a loop:
ghc-9.2.6 -hide-all-packages -package Cabal -package base -package binary -package bytestring -package containers -package deepseq -package directory -package exceptions -package filepath -package haskeline -package mtl -package parsec -package pretty -package process -package stm -package template-haskell -package terminfo -package text -package time -package transformers -package unix -package xhtml Setup.lhs -j9 +RTS -A256M -qb0 -RTS -threaded -dynamic -o setup -fforce-recomp
Edited by Zubin Collapse replies
- Maintainer
This is concerning as it suggests that something is amiss in my recent capabilities changes. However, I have never observed anything along these lines. I will try to reproduce.
- Ben Gamari assigned to @bgamari
assigned to @bgamari
- Maintainer
@L29Ah it would be great if you could provide the output from gdb's
disassemble
and info regs
commands on the crashed execution state. Edited by Ben Gamari Collapse replies
- Maintainer
Alright, I suspect that the problem here is to do with alignment. Specifically, the C compiler emitted a
movdqa
, which expects an aligned address: 0x00007ffff14fcbbb <+363>: shl $0x3,%rdi => 0x00007ffff14fcbbf <+367>: vmovdqa %ymm2,0x520(%rbx) 0x00007ffff14fcbc7 <+375>: mov %r14,%rsi
This suggests that the issue is in fact due to e653408c, which adds a field to the
Capability
structure. This interacts badly with the aligned
attribute of Capability
as we allocate an array of these structures. This is likely observed only in Gentoo as Gentoo tends to use much more aggressive optimization flags than other distributions. We should likely fix this CI blindspot.
Collapse replies - Maintainer
Indeed that likely explains it. This bug was ultimately due to an unstated and untested invariant which I have fixed in !9969 (closed). I'll open an MR fixing the issue on the 9.2 branch.
- Maintainer
Unfortunately, I am unable to reproduce this using gcc 11.3. Specifically, gcc seems to aggressively pad
struct Capability
to a multiple of the alignment size regardless of whether the new field is present or not. - Maintainer
Nor can I reproduce this with gcc 12.2:
$ cat hi.c #include "Rts.h" #include "Capability.h" int main() { printf("%d\n", sizeof(struct Capability_)); return 0; } $ nix shell nixpkgs#gcc12 -c gcc hi.c -I../includes -I. -I../_build/stage1/rts/build/ -malign-data=cacheline -O2 -DTHREADED_RTS $ ./a.out 1408
- Maintainer
Very interesting since that is a 64-byte-aligned size.
Looking at the original assembler, 0x520 is the offset of the
spark_stat
field of Capability_
. This makes sense since the zero-initialization of spark_stat
in initCapability
is an ideal place for the compiler to use an instruction like VMOVDQA
. The VEX.256 form of VMOVDQA
requires 32-byte alignment of its memory operand and the 0x520 offset of spark_stat
is 32-byte aligned. The base address of the
Capability_
is %rbx = 0x5ad370
. This address is only 16-byte aligned. This is very strange given that sizeof(Capability*)
is a multiple of 64. IIRC we allocate the capabilities
array with malloc
; perhaps we neglect to inform it of our alignment constraints. However, this would mean that this is a very old bug and unrelated to the (apparently benign) bug I noted above. @L29Ah, has this build configuration ever worked?
Edited by Ben Gamari
- Ben Gamari mentioned in commit 01661e4e
mentioned in commit 01661e4e
- Ben Gamari mentioned in merge request !9969 (closed)
mentioned in merge request !9969 (closed)
- Ben Gamari mentioned in commit 777aa4f4
mentioned in commit 777aa4f4
- Ben Gamari mentioned in commit a4228b6e
mentioned in commit a4228b6e
- Ben Gamari mentioned in merge request !9971
mentioned in merge request !9971
- Ben Gamari mentioned in commit d05dcb71
mentioned in commit d05dcb71
- Ben Gamari mentioned in commit d5f765ce
mentioned in commit d5f765ce
- Ben Gamari mentioned in commit 8e6ac03c
mentioned in commit 8e6ac03c
- Ben Gamari mentioned in commit 4af27fea
mentioned in commit 4af27fea
- Ben Gamari mentioned in commit b585225b
mentioned in commit b585225b
- Ben Gamari mentioned in commit 489215f0
mentioned in commit 489215f0
- Ben Gamari mentioned in commit 1741fd25
mentioned in commit 1741fd25
- Ben Gamari mentioned in issue #22975 (closed)
mentioned in issue #22975 (closed)
- Ben Gamari marked this issue as blocked by #22975 (closed)
marked this issue as blocked by #22975 (closed)
- Maintainer
To summarize, the bug here appears to be the long-standing fact that when allocating
Capability
s we don't maintain the alignment that we claim to, due to #22975 (closed). Unfortunately, I have no explanation for why this only started failing now. My previous hypothesis that the addition of the
current_segments
field to Capability
in 71adc788 caused an inconsistency between the size of the structure and its claimed alignment does not hold water. Specifically, sizeof
reflects the padding added by the compiler to achieve the desired alignment. This I verified with: #include <stdio.h> struct s { int a; } __attribute__((aligned(256))); int main() { printf("%d\n", sizeof(struct s)); return 0; }
which should (and does) print
256
. Edited by Ben Gamari - Ben Gamari mentioned in commit 1aaefa1e
mentioned in commit 1aaefa1e
- Ben Gamari changed milestone to %9.2.6
changed milestone to %9.2.6
- Matthew Pickering changed milestone to %9.2.7
changed milestone to %9.2.7
- Ben Gamari mentioned in commit c9f2a568
mentioned in commit c9f2a568
- Ben Gamari mentioned in commit a2f2d102
mentioned in commit a2f2d102
- Ben Gamari mentioned in commit 2f90abd4
mentioned in commit 2f90abd4
- Ben Gamari mentioned in commit 956b2c9f
mentioned in commit 956b2c9f
- Ben Gamari mentioned in commit f225ba8d
mentioned in commit f225ba8d
- Ben Gamari mentioned in commit 6c037b1d
mentioned in commit 6c037b1d
- Ben Gamari mentioned in commit 485ccdda
mentioned in commit 485ccdda
- Ben Gamari mentioned in commit 2cca72cd
mentioned in commit 2cca72cd
- Ben Gamari mentioned in commit 1de404a6
mentioned in commit 1de404a6
- Ben Gamari mentioned in commit cdb39b95
mentioned in commit cdb39b95
- Ben Gamari mentioned in commit 0978122d
mentioned in commit 0978122d
- Ben Gamari mentioned in commit 09fd3535
mentioned in commit 09fd3535
- Ben Gamari mentioned in commit 0a0e22f5
mentioned in commit 0a0e22f5
- Ben Gamari mentioned in commit 262762f3
mentioned in commit 262762f3
- Ben Gamari mentioned in commit ac7bbf64
mentioned in commit ac7bbf64
- Ben Gamari mentioned in commit 90f0d5b6
mentioned in commit 90f0d5b6
- Ben Gamari mentioned in commit 965beadf
mentioned in commit 965beadf
- Ben Gamari mentioned in commit 7730a0d6
mentioned in commit 7730a0d6
- Ben Gamari mentioned in commit 8244522e
mentioned in commit 8244522e
- Ben Gamari mentioned in commit db83f8bb
mentioned in commit db83f8bb
- Ben Gamari closed with commit 8a6f745d
closed with commit 8a6f745d