x86_64 NCG: Another spill issue with scalars and SIMD vectors mixed
Summary
In the NCG backend, there seems to be an issue with spilling if vectors and scalars are mixed. The x86_64 NCG backend exhibits the problem, but the problem also occurs with the work-in-progress SIMD support in the AArch64 NCG (!15016).
Steps to reproduce
{-# LANGUAGE MagicHash #-}
{-# LANGUAGE UnboxedTuples #-}
import GHC.Exts
import Data.List (intercalate)
import Data.Array.Base
import Data.Array.ST
import GHC.ST (ST (..))
-- Eight doubles held in an unboxed tuple that interleaves three 2-lane
-- vectors (DoubleX2#) with two scalars (Double#): 2 + 1 + 2 + 1 + 2 = 8.
-- The program runs fine if the type is defined as:
-- type D8 = (# DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2# #)
type D8 = (# DoubleX2#, Double#, DoubleX2#, Double#, DoubleX2# #)
-- unD# :: Double -> Double#
-- unD# (D# x) = x
-- | Load a 'D8' from an unboxed array.
--
-- The base position is @i@ minus the array's lower bound @l@. Relative to
-- that position the three vector components are read at offsets 0, 3 and 6
-- and the two scalar components at offsets 2 and 5 (presumably each vector
-- read covers two consecutive doubles, so offsets 0..7 are each read once).
-- No bounds check is performed here.
indexD8 :: UArray Int Double -> Int -> D8
indexD8 (UArray l _ _ ba) i = case i - l of
-- Unbox the zero-based position so it can be passed to the primops.
I# i# -> (# indexDoubleArrayAsDoubleX2# ba i#
, indexDoubleArray# ba (i# +# 2#)
, indexDoubleArrayAsDoubleX2# ba (i# +# 3#)
, indexDoubleArray# ba (i# +# 5#)
, indexDoubleArrayAsDoubleX2# ba (i# +# 6#)
#)
-- | Build a 'D8' by materialising the list @[x,x+1..x+7]@ as an array with
-- bounds @(0, 7)@ and reading it back with 'indexD8' at index 0.
mkD8 :: Double -> D8
-- mkD8 x = (# packDoubleX2# (# unD# x, unD# (x + 1) #), unD# (x + 2), packDoubleX2# (# unD# (x + 3), unD# (x + 4) #), unD# (x + 5), packDoubleX2# (# unD# (x + 6), unD# (x + 7) #) #)
-- By using an array, it becomes clear that this problem is not a bug with packDoubleX2#
mkD8 x = indexD8 (listArray (0, 7) [x,x+1..x+7]) 0
-- Ask GHC not to inline this function, so each use remains a real call.
{-# NOINLINE mkD8 #-}
-- | Component-wise sum of two 'D8' values: the vector components are added
-- with 'plusDoubleX2#' and the scalar components with '+##'.
addD8 :: D8 -> D8 -> D8
addD8 (# x0, x1, x2, x3, x4 #) (# y0, y1, y2, y3, y4 #) = (# plusDoubleX2# x0 y0, x1 +## y1, plusDoubleX2# x2 y2, x3 +## y3, plusDoubleX2# x4 y4 #)
-- Ask GHC not to inline this function, so each use remains a real call.
{-# NOINLINE addD8 #-}
-- | Store the five components of a 'D8' into a mutable unboxed array.
--
-- The base position is @i@ minus the array's lower bound @l@. The vector
-- components go to offsets 0, 3 and 6 and the scalar components to offsets
-- 2 and 5, the same offsets 'indexD8' reads from. No bounds check is
-- performed here. The action returns @()@.
writeD8 :: D8 -> STUArray s Int Double -> Int -> ST s ()
writeD8 (# v0, x2, v1, x5, v2 #) (STUArray l _ _ mba) i = ST $ \s ->
case i - l of
I# i# ->
-- Each write consumes the state token produced by the previous one
-- (s -> s1 -> s2 -> s3 -> s4), which fixes the order of the five stores.
case writeDoubleArrayAsDoubleX2# mba i# v0 s of
s1 -> case writeDoubleArray# mba (i# +# 2#) x2 s1 of
s2 -> case writeDoubleArrayAsDoubleX2# mba (i# +# 3#) v1 s2 of
s3 -> case writeDoubleArray# mba (i# +# 5#) x5 s3 of
s4 -> (# writeDoubleArrayAsDoubleX2# mba (i# +# 6#) v2 s4, () #)
-- | Convert a 'D8' to a list of doubles by writing it into a fresh array
-- with bounds @(0, 7)@ (initialised to 0.0) using 'writeD8' at index 0, and
-- returning the array's elements.
toListD8 :: D8 -> [Double]
-- Alternative implementation via unpackDoubleX2#, kept for reference:
{-
toListD8 (# v0, x2, v1, x5, v2 #) = case unpackDoubleX2# v0 of
(# x0, x1 #) -> case unpackDoubleX2# v1 of
(# x3, x4 #) -> case unpackDoubleX2# v2 of
(# x6, x7 #) -> [D# x0, D# x1, D# x2, D# x3, D# x4, D# x5, D# x6, D# x7]
-}
-- By using arrays, it becomes clear that this problem is not a bug with unpackDoubleX2#
toListD8 x0 = elems $ runSTUArray $ do
ma <- newArray (0, 7) 0.0
writeD8 x0 ma 0
pure ma
-- Ask GHC not to inline this function, so each use remains a real call.
{-# NOINLINE toListD8 #-}
-- Eight 'D8' values nested in one unboxed tuple, i.e. 8 * 8 = 64 doubles.
type D64 = (# D8, D8, D8, D8, D8, D8, D8, D8 #)
-- | Build a 'D64' from eight 'mkD8' calls whose arguments start at @x@ and
-- advance in steps of 8 (@x@, @x + 8@, ..., @x + 56@).
mkD64 :: Double -> D64
mkD64 x = (# mkD8 x, mkD8 (x + 8), mkD8 (x + 16), mkD8 (x + 24), mkD8 (x + 32), mkD8 (x + 40), mkD8 (x + 48), mkD8 (x + 56) #)
-- Ask GHC not to inline this function, so each use remains a real call.
{-# NOINLINE mkD64 #-}
-- | Sum of two 'D64' values, computed by applying 'addD8' to each of the
-- eight pairs of corresponding 'D8' components.
addD64 :: D64 -> D64 -> D64
addD64 (# x0, x1, x2, x3, x4, x5, x6, x7 #) (# y0, y1, y2, y3, y4, y5, y6, y7 #) = (# addD8 x0 y0, addD8 x1 y1, addD8 x2 y2, addD8 x3 y3, addD8 x4 y4, addD8 x5 y5, addD8 x6 y6, addD8 x7 y7 #)
-- Ask GHC not to inline this function, so each use remains a real call.
{-# NOINLINE addD64 #-}
-- | Convert a 'D64' to a list by concatenating, in component order, the
-- lists 'toListD8' produces for each of its eight 'D8' components.
toListD64 :: D64 -> [Double]
toListD64 (# x0, x1, x2, x3, x4, x5, x6, x7 #) = concat [toListD8 x0, toListD8 x1, toListD8 x2, toListD8 x3, toListD8 x4, toListD8 x5, toListD8 x6, toListD8 x7]
-- Ask GHC not to inline this function, so each use remains a real call.
{-# NOINLINE toListD64 #-}
-- | Render the actual values @xs@ next to the expected values @ys@.
--
-- The result is a bracketed, comma-separated list of the elements of @xs@
-- (paired positionally with @ys@, so it is as long as the shorter list).
-- An element that differs from its expected counterpart is wrapped in
-- @<<<@ and @>>>@. The list is followed by @": OK"@ when the two lists are
-- equal as a whole and by @": Wrong"@ otherwise.
compareList :: [Double] -> [Double] -> String
compareList xs ys = "[" ++ intercalate "," cells ++ "]" ++ verdict
  where
    -- One rendered cell per (actual, expected) pair.
    cells = zipWith render xs ys

    render actual expected
      | actual == expected = show actual
      | otherwise          = "<<<" ++ show actual ++ ">>>"

    -- Whole-list comparison, so a length mismatch also counts as wrong.
    verdict
      | xs == ys  = ": OK"
      | otherwise = ": Wrong"
-- | Reproducer entry point: builds two 'D64' values and their sum, and
-- prints each one compared against the expected list via 'compareList'.
main :: IO ()
main = do
-- Element count; also the starting value of the second block (used as a Double).
let n = 64
-- Bang patterns: both unboxed tuples are bound strictly, before any output.
let !x = mkD64 0
!y = mkD64 n
-- x is expected to hold 0 .. n-1.
putStrLn $ toListD64 x `compareList` [0..n-1]
-- y is expected to hold n .. 2n-1.
putStrLn $ toListD64 y `compareList` [n..2*n-1]
-- The sum is expected to equal the element-wise sum of the two ranges.
putStrLn $ toListD64 (addD64 x y) `compareList` zipWith (+) [0..n-1] [n..2*n-1]
$ ghc -fforce-recomp -ddump-to-file -dppr-debug -ddump-asm-regalloc -ddump-asm-liveness -g -Wall HeteroSpill.hs
$ ./HeteroSpill
# On x86_64
[0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,<<<13.0>>>,13.0,14.0,<<<16.0>>>,16.0,17.0,18.0,19.0,<<<21.0>>>,21.0,22.0,<<<24.0>>>,24.0,25.0,26.0,27.0,<<<29.0>>>,29.0,30.0,<<<32.0>>>,32.0,33.0,34.0,35.0,<<<37.0>>>,37.0,38.0,<<<24.0>>>,40.0,41.0,42.0,43.0,<<<45.0>>>,45.0,46.0,<<<32.0>>>,48.0,49.0,50.0,51.0,52.0,53.0,54.0,55.0,56.0,57.0,58.0,59.0,60.0,61.0,62.0,63.0]: Wrong
[64.0,65.0,66.0,67.0,68.0,69.0,70.0,71.0,72.0,73.0,74.0,75.0,76.0,77.0,78.0,<<<0.0>>>,80.0,81.0,82.0,83.0,84.0,85.0,86.0,87.0,88.0,89.0,90.0,91.0,<<<93.0>>>,93.0,94.0,<<<96.0>>>,96.0,97.0,98.0,99.0,<<<101.0>>>,101.0,102.0,<<<88.0>>>,104.0,105.0,106.0,107.0,<<<109.0>>>,109.0,110.0,<<<96.0>>>,112.0,113.0,114.0,115.0,116.0,117.0,118.0,119.0,120.0,121.0,122.0,123.0,124.0,125.0,126.0,127.0]: Wrong
[64.0,66.0,68.0,70.0,72.0,74.0,76.0,78.0,80.0,82.0,84.0,86.0,<<<89.0>>>,90.0,92.0,<<<8.0>>>,96.0,98.0,100.0,102.0,<<<105.0>>>,106.0,108.0,<<<111.0>>>,112.0,114.0,116.0,118.0,<<<122.0>>>,122.0,124.0,<<<128.0>>>,128.0,130.0,132.0,134.0,<<<138.0>>>,138.0,140.0,<<<182.0>>>,144.0,146.0,148.0,150.0,<<<154.0>>>,154.0,156.0,<<<128.0>>>,160.0,162.0,164.0,166.0,168.0,170.0,172.0,174.0,176.0,<<<172.0>>>,180.0,182.0,184.0,186.0,188.0,190.0]: Wrong
# On AArch64
[0.0,<<<2.0>>>,2.0,3.0,4.0,5.0,6.0,7.0,8.0,<<<10.0>>>,10.0,11.0,<<<13.0>>>,13.0,14.0,15.0,16.0,<<<18.0>>>,18.0,19.0,<<<21.0>>>,21.0,22.0,23.0,24.0,<<<26.0>>>,26.0,27.0,<<<3.2860115435e-314>>>,29.0,30.0,31.0,32.0,33.0,34.0,35.0,36.0,37.0,38.0,39.0,40.0,41.0,42.0,43.0,44.0,45.0,46.0,47.0,48.0,49.0,50.0,51.0,52.0,53.0,54.0,55.0,56.0,57.0,58.0,59.0,60.0,61.0,62.0,63.0]: Wrong
[64.0,65.0,66.0,67.0,68.0,69.0,70.0,71.0,72.0,73.0,74.0,75.0,76.0,77.0,78.0,79.0,80.0,81.0,82.0,83.0,84.0,85.0,86.0,87.0,88.0,89.0,90.0,91.0,92.0,93.0,94.0,95.0,96.0,97.0,98.0,99.0,100.0,101.0,102.0,103.0,104.0,<<<106.0>>>,106.0,107.0,108.0,109.0,110.0,111.0,112.0,<<<114.0>>>,114.0,115.0,116.0,117.0,118.0,119.0,120.0,<<<122.0>>>,122.0,123.0,124.0,125.0,126.0,127.0]: Wrong
[64.0,<<<67.0>>>,68.0,70.0,72.0,74.0,76.0,78.0,80.0,<<<83.0>>>,84.0,86.0,<<<89.0>>>,90.0,92.0,94.0,96.0,<<<99.0>>>,100.0,102.0,<<<105.0>>>,106.0,108.0,110.0,112.0,<<<115.0>>>,116.0,118.0,<<<92.0>>>,122.0,124.0,126.0,128.0,130.0,132.0,134.0,136.0,138.0,140.0,142.0,144.0,<<<147.0>>>,148.0,150.0,152.0,154.0,156.0,158.0,160.0,<<<163.0>>>,164.0,166.0,168.0,170.0,172.0,174.0,176.0,<<<179.0>>>,180.0,182.0,184.0,186.0,188.0,190.0]: Wrong
Expected behavior
The program runs fine if compiled with the LLVM backend:
$ ghc -fforce-recomp -fllvm HeteroSpill.hs
$ ./HeteroSpill
[0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,14.0,15.0,16.0,17.0,18.0,19.0,20.0,21.0,22.0,23.0,24.0,25.0,26.0,27.0,28.0,29.0,30.0,31.0,32.0,33.0,34.0,35.0,36.0,37.0,38.0,39.0,40.0,41.0,42.0,43.0,44.0,45.0,46.0,47.0,48.0,49.0,50.0,51.0,52.0,53.0,54.0,55.0,56.0,57.0,58.0,59.0,60.0,61.0,62.0,63.0]: OK
[64.0,65.0,66.0,67.0,68.0,69.0,70.0,71.0,72.0,73.0,74.0,75.0,76.0,77.0,78.0,79.0,80.0,81.0,82.0,83.0,84.0,85.0,86.0,87.0,88.0,89.0,90.0,91.0,92.0,93.0,94.0,95.0,96.0,97.0,98.0,99.0,100.0,101.0,102.0,103.0,104.0,105.0,106.0,107.0,108.0,109.0,110.0,111.0,112.0,113.0,114.0,115.0,116.0,117.0,118.0,119.0,120.0,121.0,122.0,123.0,124.0,125.0,126.0,127.0]: OK
[64.0,66.0,68.0,70.0,72.0,74.0,76.0,78.0,80.0,82.0,84.0,86.0,88.0,90.0,92.0,94.0,96.0,98.0,100.0,102.0,104.0,106.0,108.0,110.0,112.0,114.0,116.0,118.0,120.0,122.0,124.0,126.0,128.0,130.0,132.0,134.0,136.0,138.0,140.0,142.0,144.0,146.0,148.0,150.0,152.0,154.0,156.0,158.0,160.0,162.0,164.0,166.0,168.0,170.0,172.0,174.0,176.0,178.0,180.0,182.0,184.0,186.0,188.0,190.0]: OK
Environment
- GHC version used: 9.15.20251030 (86c82745)
- Operating System: Ubuntu, macOS
- System Architecture: x86_64, AArch64