Miscompilation & Cmm Lint failure with SIMD vectors

The following program fails Cmm Lint:

{-# language MagicHash, UnboxedTuples, UnboxedSums #-}

module Main ( main ) where

import GHC.Exts
import GHC.Int

foo :: Int32X4# -> Integer
foo i32x4 =
  case unpackInt32X4# i32x4 of
    (# i1, i2, i3, i4 #) ->
      let
        s = sum $ map fromIntegral
             [ I32# i1, I32# i2, I32# i3, I32# i4 ]
      in s

main :: IO ()
main = do
  print ( foo ( broadcastInt32X4# ( intToInt32# 1# ) ) )

Cmm lint error:
  in basic block c1EB
    in MachOp application:
      %MO_UU_Conv_W64_W32(1 :: W32)
    op is expecting:  [W64]
    arguments provide:  [I32]
Program was:
  {offset
    c1Fg: // global
        _s1Ew::P64 = R1;
        if ((old + 0) - <highSp> < SpLim) (likely: False) goto c1Fh; else goto c1Fi;
    c1Fi: // global
        Hp = Hp + 48;
        if (Hp > HpLim) (likely: False) goto c1Fk; else goto c1Fj;
    c1Fk: // global
        HpAlloc = 48;
        goto c1Fh;
    c1Fh: // global
        R1 = _s1Ew::P64;
        call (stg_gc_enter_1)(R1) args: 8, res: 0, upd: 8;
    c1Fj: // global
        (_c1EA::I64) = call "ccall" arg hints:  [PtrHint,
                                                 PtrHint]  result hints:  [PtrHint] newCAF(BaseReg, _s1Ew::P64);
        if (_c1EA::I64 == 0) goto c1EC; else goto c1EB;
    c1EC: // global
        call (I64[_s1Ew::P64])() args: 8, res: 0, upd: 8;
    c1EB: // global
        I64[(old + 24)] = stg_bh_upd_frame_info;
        I64[(old + 16)] = _c1EA::I64;
        _c1EE::Ix4V128 = <0 :: W32, 0 :: W32, 0 :: W32, 0 :: W32>;
        _c1EF::Ix4V128 = %MO_V_Insert_4_W32(_c1EE::Ix4V128,
                                            %MO_UU_Conv_W64_W32(1 :: W32), 0 :: W32);
        _c1EG::Ix4V128 = %MO_V_Insert_4_W32(_c1EF::Ix4V128,
                                            %MO_UU_Conv_W64_W32(1 :: W32), 1 :: W32);
        _c1EH::Ix4V128 = %MO_V_Insert_4_W32(_c1EG::Ix4V128,
                                            %MO_UU_Conv_W64_W32(1 :: W32), 2 :: W32);
        _c1EI::Ix4V128 = %MO_V_Insert_4_W32(_c1EH::Ix4V128,
                                            %MO_UU_Conv_W64_W32(1 :: W32), 3 :: W32);
        _c1ED::Ix4V128 = _c1EI::Ix4V128;
        _s1Ef::Ix4V128 = _c1ED::Ix4V128;
        _s1Eh::I32 = %MO_SS_Conv_W32_W64(%MO_V_Extract_4_W32(_s1Ef::Ix4V128,
                                                             0 :: W32));
        _s1Ei::I32 = %MO_SS_Conv_W32_W64(%MO_V_Extract_4_W32(_s1Ef::Ix4V128,
                                                             1 :: W32));
        _s1Ej::I32 = %MO_SS_Conv_W32_W64(%MO_V_Extract_4_W32(_s1Ef::Ix4V128,
                                                             2 :: W32));
        _s1Ek::I32 = %MO_SS_Conv_W32_W64(%MO_V_Extract_4_W32(_s1Ef::Ix4V128,
                                                             3 :: W32));
        _s1Ek::I32 = _s1Ek::I32;
        _s1Ej::I32 = _s1Ej::I32;
        _s1Ei::I32 = _s1Ei::I32;
        _s1Eh::I32 = _s1Eh::I32;
        I64[Hp - 40] = sat_s1Ev_info;
        I32[Hp - 24] = _s1Eh::I32;
        I32[Hp - 20] = _s1Ei::I32;
        I32[Hp - 16] = _s1Ej::I32;
        I32[Hp - 12] = _s1Ek::I32;
        _c1EL::P64 = Hp - 40;
        I64[Hp - 8] = sat_s1El_info;
        _c1F9::P64 = Hp - 8;
        R3 = _c1EL::P64;
        R2 = _c1F9::P64;
        R1 = $_closure;
        call stg_ap_pp_fast(R3, R2, R1) args: 24, res: 0, upd: 24;
  }

It seems that the vector broadcast and unpack instructions end up emitting conversion instructions at incorrect types?

Turning off Cmm Lint, I get the following error from LLVM:

opt: /run/user/1000/ghc230199_0/ghc_1.ll:494:18: error: invalid cast opcode for cast from 'i32' to 'i32'
  %ln1K6 = trunc i32 1 to i32
                 ^

Edited Oct 14, 2022 by sheaf

To upload designs, you'll need to enable LFS and have an admin enable hashed storage. More information