random is up to 2x slower in GHC 9.2
As I mentioned in #19727 (closed), certain random
routines became 2x slower between GHC 9.0 and 9.2. Here is a reproducer:
git clone https://github.com/Bodigrim/random
cd random
git checkout ghc92
cabal build -w ghc-9.2.0.20210422 --allow-newer='splitmix:base'
This emits the following Core:
-- RHS size: {terms: 2, types: 0, coercions: 0, joins: 0/0}
uniformR_Int2 :: Word8
uniformR_Int2 = W8# 0##8
-- RHS size: {terms: 23, types: 4, coercions: 0, joins: 0/1}
uniformR_Int8_mask :: Word8
uniformR_Int8_mask
= let {
i#_s3ga :: Int#
i#_s3ga = word2Int# (clz8# 255##) } in
case >=# i#_s3ga 0# of {
__DEFAULT -> overflowError;
1# ->
case >=# i#_s3ga 64# of {
__DEFAULT -> W8# (wordToWord8# (uncheckedShiftRL# 255## i#_s3ga));
1# -> uniformR_Int2
}
}
I would expect GHC to perform constant folding here: clz8# 255##
is 0, so all this thunk should boil down to uniformR_Int8_mask = W8# 255##8
. FWIW GHC 9.0 was not able to optimise this piece either, but it would be nice to do so.
-- RHS size: {terms: 104, types: 31, coercions: 4, joins: 1/9}
$wgo :: Word# -> Word# -> (# Word8, StdGen #)
$wgo
= \ (ww_s3gz :: Word#) (ww1_s3gA :: Word#) ->
case uniformR_Int8_mask of { W8# y#_i3bT ->
let {
seed'_s3g8 :: Word#
seed'_s3g8 = plusWord# ww_s3gz ww1_s3gA } in
let {
x#_s3g7 :: Word#
x#_s3g7
= timesWord#
(xor# seed'_s3g8 (uncheckedShiftRL# seed'_s3g8 33#))
18397679294719823053## } in
let {
x#1_s3g6 :: Word#
x#1_s3g6
= timesWord#
(xor# x#_s3g7 (uncheckedShiftRL# x#_s3g7 33#))
14181476777654086739## } in
let {
x_i3fk :: Word8#
x_i3fk
= wordToWord8#
(and#
(and# (xor# x#1_s3g6 (uncheckedShiftRL# x#1_s3g6 33#)) 255##)
(word8ToWord# y#_i3bT)) } in
This seems to be a consequence of sized primitive types: we widen y#_i3bT
only to conjunct it with already 8-bit long value of and# ... 255##
and immediately narrow the result back. I would expect GHC to emit x_i3fk = and# (xor# ...) y#_i3bT
.
case gtWord# (word8ToWord# x_i3fk) 255## of {
__DEFAULT ->
(# W8# x_i3fk, (SMGen seed'_s3g8 ww1_s3gA) `cast` <Co:2> #);
1# ->
joinrec {
$wgo1_X1 :: Word# -> Word# -> (# Word8, StdGen #)
$wgo1_X1 (ww2_X2 :: Word#) (ww3_X3 :: Word#)
= let {
seed'1_X5 :: Word#
seed'1_X5 = plusWord# ww2_X2 ww3_X3 } in
let {
x#2_X6 :: Word#
x#2_X6
= timesWord#
(xor# seed'1_X5 (uncheckedShiftRL# seed'1_X5 33#))
18397679294719823053## } in
let {
x#3_X7 :: Word#
x#3_X7
= timesWord#
(xor# x#2_X6 (uncheckedShiftRL# x#2_X6 33#))
14181476777654086739## } in
let {
x1_X8 :: Word8#
x1_X8
= wordToWord8#
(and#
(and# (xor# x#3_X7 (uncheckedShiftRL# x#3_X7 33#)) 255##)
(word8ToWord# y#_i3bT)) } in
Redundant widening and narrowing again.
case gtWord# (word8ToWord# x1_X8) 255## of {
__DEFAULT ->
(# W8# x1_X8, (SMGen seed'1_X5 ww3_X3) `cast` <Co:2> #);
1# -> jump $wgo1_X1 seed'1_X5 ww3_X3
}; } in
jump $wgo1_X1 seed'_s3g8 ww1_s3gA
}
}
-- RHS size: {terms: 46, types: 23, coercions: 10, joins: 0/0}
uniformR_Int1 :: Identity (Int8, StdGen)
uniformR_Int1
= case >=# (word2Int# (popCnt64# 16374150590275443167##)) 24# of {
Missed constant folding opportunity, similar to clz8#
above.
__DEFAULT ->
case $wgo 9297814886316923340## 1692676764259697727## of
{ (# ww1_s3gG, ww2_s3gH #) ->
(case ww1_s3gG of { W8# x#1_i3br ->
I8#
(intToInt8#
(+#
-128#
(int8ToInt# (intToInt8# (word2Int# (word8ToWord# x#1_i3br))))))
Redundant widening and narrowing, but even worse than before. There are five casts, where one would do.
},
ww2_s3gH)
`cast` <Co:5>
};
1# ->
case $wgo 9297814886316923340## 13679457532755275413## of
{ (# ww1_s3gG, ww2_s3gH #) ->
(case ww1_s3gG of { W8# x#1_i3br ->
I8#
(intToInt8#
(+#
-128#
(int8ToInt# (intToInt8# (word2Int# (word8ToWord# x#1_i3br))))))
Same as above.
},
ww2_s3gH)
`cast` <Co:5>
}
}
-- RHS size: {terms: 1, types: 0, coercions: 4, joins: 0/0}
uniformR_Int8 :: (Int8, StdGen)
uniformR_Int8 = uniformR_Int1 `cast` <Co:4>