random is up to 2x slower in GHC 9.2

As I mentioned in #19727 (closed), certain random routines became 2x slower between GHC 9.0 and 9.2. Here is a reproducer:

git clone https://github.com/Bodigrim/random
cd random
git checkout ghc92 
cabal build -w ghc-9.2.0.20210422 --allow-newer='splitmix:base'

This emits the following Core:

-- RHS size: {terms: 2, types: 0, coercions: 0, joins: 0/0}
uniformR_Int2 :: Word8
uniformR_Int2 = W8# 0##8

-- RHS size: {terms: 23, types: 4, coercions: 0, joins: 0/1}
uniformR_Int8_mask :: Word8
uniformR_Int8_mask
  = let {
      i#_s3ga :: Int#
      i#_s3ga = word2Int# (clz8# 255##) } in
    case >=# i#_s3ga 0# of {
      __DEFAULT -> overflowError;
      1# ->
        case >=# i#_s3ga 64# of {
          __DEFAULT -> W8# (wordToWord8# (uncheckedShiftRL# 255## i#_s3ga));
          1# -> uniformR_Int2
        }
    }

I would expect GHC to perform constant folding here: clz8# 255## is 0, so all this thunk should boil down to uniformR_Int8_mask = W8# 255##8. FWIW GHC 9.0 was not able to optimise this piece either, but it would be nice to do so.

-- RHS size: {terms: 104, types: 31, coercions: 4, joins: 1/9}
$wgo :: Word# -> Word# -> (# Word8, StdGen #)
$wgo
  = \ (ww_s3gz :: Word#) (ww1_s3gA :: Word#) ->
      case uniformR_Int8_mask of { W8# y#_i3bT ->
      let {
        seed'_s3g8 :: Word#
        seed'_s3g8 = plusWord# ww_s3gz ww1_s3gA } in
      let {
        x#_s3g7 :: Word#
        x#_s3g7
          = timesWord#
              (xor# seed'_s3g8 (uncheckedShiftRL# seed'_s3g8 33#))
              18397679294719823053## } in
      let {
        x#1_s3g6 :: Word#
        x#1_s3g6
          = timesWord#
              (xor# x#_s3g7 (uncheckedShiftRL# x#_s3g7 33#))
              14181476777654086739## } in
      let {
        x_i3fk :: Word8#
        x_i3fk
          = wordToWord8#
              (and#
                 (and# (xor# x#1_s3g6 (uncheckedShiftRL# x#1_s3g6 33#)) 255##)
                 (word8ToWord# y#_i3bT)) } in

This seems to be a consequence of sized primitive types: we widen y#_i3bT only to conjunct it with already 8-bit long value of and# ... 255## and immediately narrow the result back. I would expect GHC to emit x_i3fk = and# (xor# ...) y#_i3bT.

      case gtWord# (word8ToWord# x_i3fk) 255## of {
        __DEFAULT ->
          (# W8# x_i3fk, (SMGen seed'_s3g8 ww1_s3gA) `cast` <Co:2> #);
        1# ->
          joinrec {
            $wgo1_X1 :: Word# -> Word# -> (# Word8, StdGen #)
            $wgo1_X1 (ww2_X2 :: Word#) (ww3_X3 :: Word#)
              = let {
                  seed'1_X5 :: Word#
                  seed'1_X5 = plusWord# ww2_X2 ww3_X3 } in
                let {
                  x#2_X6 :: Word#
                  x#2_X6
                    = timesWord#
                        (xor# seed'1_X5 (uncheckedShiftRL# seed'1_X5 33#))
                        18397679294719823053## } in
                let {
                  x#3_X7 :: Word#
                  x#3_X7
                    = timesWord#
                        (xor# x#2_X6 (uncheckedShiftRL# x#2_X6 33#))
                        14181476777654086739## } in
                let {
                  x1_X8 :: Word8#
                  x1_X8
                    = wordToWord8#
                        (and#
                           (and# (xor# x#3_X7 (uncheckedShiftRL# x#3_X7 33#)) 255##)
                           (word8ToWord# y#_i3bT)) } in

Redundant widening and narrowing again.

                case gtWord# (word8ToWord# x1_X8) 255## of {
                  __DEFAULT ->
                    (# W8# x1_X8, (SMGen seed'1_X5 ww3_X3) `cast` <Co:2> #);
                  1# -> jump $wgo1_X1 seed'1_X5 ww3_X3
                }; } in
          jump $wgo1_X1 seed'_s3g8 ww1_s3gA
      }
      }

-- RHS size: {terms: 46, types: 23, coercions: 10, joins: 0/0}
uniformR_Int1 :: Identity (Int8, StdGen)
uniformR_Int1
  = case >=# (word2Int# (popCnt64# 16374150590275443167##)) 24# of {

Missed constant folding opportunity, similar to clz8# above.

      __DEFAULT ->
        case $wgo 9297814886316923340## 1692676764259697727## of
        { (# ww1_s3gG, ww2_s3gH #) ->
        (case ww1_s3gG of { W8# x#1_i3br ->
         I8#
           (intToInt8#
              (+#
                 -128#
                 (int8ToInt# (intToInt8# (word2Int# (word8ToWord# x#1_i3br))))))

Redundant widening and narrowing, but even worse than before. There are five casts, where one would do.

         },
         ww2_s3gH)
        `cast` <Co:5>
        };
      1# ->
        case $wgo 9297814886316923340## 13679457532755275413## of
        { (# ww1_s3gG, ww2_s3gH #) ->
        (case ww1_s3gG of { W8# x#1_i3br ->
         I8#
           (intToInt8#
              (+#
                 -128#
                 (int8ToInt# (intToInt8# (word2Int# (word8ToWord# x#1_i3br))))))

Same as above.

         },
         ww2_s3gH)
        `cast` <Co:5>
        }
    }

-- RHS size: {terms: 1, types: 0, coercions: 4, joins: 0/0}
uniformR_Int8 :: (Int8, StdGen)
uniformR_Int8 = uniformR_Int1 `cast` <Co:4>

CC @lehins @Ericson2314

To upload designs, you'll need to enable LFS and have an admin enable hashed storage. More information