From 015886ec78e598f850c4202efdee239bac63b8c7 Mon Sep 17 00:00:00 2001
From: ARATA Mizuki <minorinoki@gmail.com>
Date: Tue, 16 May 2023 21:06:31 +0900
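Subject: [PATCH] Support 128-bit SIMD on AArch64 via LLVM backend

Define register numbers for q0-q31 in CodeGen.Platform.h, map
REG_XMM1/REG_XMM2 to q4/q5 in MachRegs.h (marked caller-saved), and
let realXmmRegNos return the XMM register list when the platform
architecture is ArchAArch64.

checkVecCompatibility now dispatches on the architecture: x86 and
x86_64 keep the existing SSE/AVX flag checks, AArch64 rejects 256-bit
and 512-bit vectors, and every other architecture is rejected.

Enable T22296 and T22187_run on aarch64.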
---
 compiler/CodeGen.Platform.h              | 33 ++++++++++++++++++++++++
 compiler/GHC/Cmm/CallConv.hs             |  6 +++--
 compiler/GHC/StgToCmm/Prim.hs            | 31 +++++++++++++++-------
 rts/include/stg/MachRegs.h               |  6 +++++
 testsuite/tests/codeGen/should_run/all.T |  2 +-
 testsuite/tests/unboxedsums/all.T        |  2 +-
 6 files changed, 66 insertions(+), 14 deletions(-)

diff --git a/compiler/CodeGen.Platform.h b/compiler/CodeGen.Platform.h
index fb006c9f1a5d..1c3bee7eb3d3 100644
--- a/compiler/CodeGen.Platform.h
+++ b/compiler/CodeGen.Platform.h
@@ -203,6 +203,39 @@ import GHC.Platform.Reg
 # define d29 61
 # define d30 62
 # define d31 63
+
+# define q0 32
+# define q1 33
+# define q2 34
+# define q3 35
+# define q4 36
+# define q5 37
+# define q6 38
+# define q7 39
+# define q8 40
+# define q9 41
+# define q10 42
+# define q11 43
+# define q12 44
+# define q13 45
+# define q14 46
+# define q15 47
+# define q16 48
+# define q17 49
+# define q18 50
+# define q19 51
+# define q20 52
+# define q21 53
+# define q22 54
+# define q23 55
+# define q24 56
+# define q25 57
+# define q26 58
+# define q27 59
+# define q28 60
+# define q29 61
+# define q30 62
+# define q31 63
 #endif
 
 # if defined(MACHREGS_darwin)
diff --git a/compiler/GHC/Cmm/CallConv.hs b/compiler/GHC/Cmm/CallConv.hs
index 14f3672a4b32..ddc38e2116dd 100644
--- a/compiler/GHC/Cmm/CallConv.hs
+++ b/compiler/GHC/Cmm/CallConv.hs
@@ -193,8 +193,10 @@ realLongRegs    platform = map LongReg    $ regList (pc_MAX_Real_Long_REG    (pl
 
 realXmmRegNos :: Platform -> [Int]
 realXmmRegNos platform
-    | isSse2Enabled platform = regList (pc_MAX_Real_XMM_REG (platformConstants platform))
-    | otherwise              = []
+    | vecRegsUsable = regList (pc_MAX_Real_XMM_REG (platformConstants platform))
+    | otherwise     = []
+  where -- usable when SSE2 is enabled, or unconditionally on AArch64
+    vecRegsUsable = isSse2Enabled platform || platformArch platform == ArchAArch64
 
 regList :: Int -> [Int]
 regList n = [1 .. n]
diff --git a/compiler/GHC/StgToCmm/Prim.hs b/compiler/GHC/StgToCmm/Prim.hs
index 1c8663ebf9f1..6f038d0dc47d 100644
--- a/compiler/GHC/StgToCmm/Prim.hs
+++ b/compiler/GHC/StgToCmm/Prim.hs
@@ -2303,7 +2303,7 @@ vecCmmCat FloatVec = cmmFloat
 -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 -- Check to make sure that we can generate code for the specified vector type
 -- given the current set of dynamic flags.
--- Currently these checks are specific to x86 and x86_64 architecture.
+-- Currently these checks are specific to x86, x86_64 and AArch64 architectures.
 -- This should be fixed!
 -- In particular,
 -- 1) Add better support for other architectures! (this may require a redesign)
@@ -2334,27 +2334,38 @@ vecCmmCat FloatVec = cmmFloat
 checkVecCompatibility :: StgToCmmConfig -> PrimOpVecCat -> Length -> Width -> FCode ()
 checkVecCompatibility cfg vcat l w =
   case stgToCmmVecInstrsErr cfg of
-    Nothing  -> check vecWidth vcat l w  -- We are in a compatible backend
-    Just err -> sorry err                -- incompatible backend, do panic
+    Nothing | isX86 -> checkX86 vecWidth vcat l w
+            | platformArch platform == ArchAArch64 -> checkAArch64 vecWidth
+            | otherwise -> sorry "SIMD vector instructions are not supported on this architecture."
+    Just err -> sorry err  -- incompatible backend, do panic
   where
     platform = stgToCmmPlatform cfg
-    check :: Width -> PrimOpVecCat -> Length -> Width -> FCode ()
-    check W128 FloatVec 4 W32 | not (isSseEnabled platform) =
+    isX86 = case platformArch platform of  -- True selects checkX86
+      ArchX86    -> True
+      ArchX86_64 -> True
+      _          -> False
+    checkX86 :: Width -> PrimOpVecCat -> Length -> Width -> FCode ()
+    checkX86 W128 FloatVec 4 W32 | not (isSseEnabled platform) =
         sorry $ "128-bit wide single-precision floating point " ++
                 "SIMD vector instructions require at least -msse."
-    check W128 _ _ _ | not (isSse2Enabled platform) =
+    checkX86 W128 _ _ _ | not (isSse2Enabled platform) =
         sorry $ "128-bit wide integer and double precision " ++
                 "SIMD vector instructions require at least -msse2."
-    check W256 FloatVec _ _ | not (stgToCmmAvx cfg) =
+    checkX86 W256 FloatVec _ _ | not (stgToCmmAvx cfg) =
         sorry $ "256-bit wide floating point " ++
                 "SIMD vector instructions require at least -mavx."
-    check W256 _ _ _ | not (stgToCmmAvx2 cfg) =
+    checkX86 W256 _ _ _ | not (stgToCmmAvx2 cfg) =
         sorry $ "256-bit wide integer " ++
                 "SIMD vector instructions require at least -mavx2."
-    check W512 _ _ _ | not (stgToCmmAvx512f cfg) =
+    checkX86 W512 _ _ _ | not (stgToCmmAvx512f cfg) =
         sorry $ "512-bit wide " ++
                 "SIMD vector instructions require -mavx512f."
-    check _ _ _ _ = return ()
+    checkX86 _ _ _ _ = return ()
+
+    checkAArch64 :: Width -> FCode ()
+    checkAArch64 W512 = sorry "512-bit wide SIMD vector instructions are not supported."
+    checkAArch64 W256 = sorry "256-bit wide SIMD vector instructions are not supported."
+    checkAArch64 _    = return ()  -- any other vector width is accepted
 
     vecWidth = typeWidth (vecVmmType vcat l w)
 
diff --git a/rts/include/stg/MachRegs.h b/rts/include/stg/MachRegs.h
index f0253865ea3b..239f505c7965 100644
--- a/rts/include/stg/MachRegs.h
+++ b/rts/include/stg/MachRegs.h
@@ -457,6 +457,12 @@ the stack. See Note [Overlapping global registers] for implications.
 #define REG_D3          d14
 #define REG_D4          d15
 
+#define REG_XMM1        q4
+#define REG_XMM2        q5
+
+#define CALLER_SAVES_XMM1
+#define CALLER_SAVES_XMM2
+
 /* -----------------------------------------------------------------------------
    The s390x register mapping
 
diff --git a/testsuite/tests/codeGen/should_run/all.T b/testsuite/tests/codeGen/should_run/all.T
index bf1110ac7e91..512a9850b1cd 100644
--- a/testsuite/tests/codeGen/should_run/all.T
+++ b/testsuite/tests/codeGen/should_run/all.T
@@ -222,7 +222,7 @@ test('T21186', normal, compile_and_run, [''])
 test('T20640a', normal, compile_and_run, [''])
 test('T20640b', normal, compile_and_run, [''])
 test('T22296',[only_ways(llvm_ways)
-              ,unless(arch('x86_64'), skip)],compile_and_run,[''])
+              ,unless(arch('x86_64') or arch('aarch64'), skip)],compile_and_run,[''])
 test('T22798', normal, compile_and_run, ['-fregs-graph'])
 test('CheckBoundsOK', normal, compile_and_run, ['-fcheck-prim-bounds'])
 test('OrigThunkInfo', normal, compile_and_run, ['-forig-thunk-info'])
diff --git a/testsuite/tests/unboxedsums/all.T b/testsuite/tests/unboxedsums/all.T
index 300d7892e13c..dc2898702194 100644
--- a/testsuite/tests/unboxedsums/all.T
+++ b/testsuite/tests/unboxedsums/all.T
@@ -38,7 +38,7 @@ test('T20859', normal, compile, [''])
 
 test('T22187',[only_ways(llvm_ways)],compile,[''])
 test('T22187_run',[only_ways(llvm_ways)
-                  ,unless(arch('x86_64'), skip)],compile_and_run,[''])
+                  ,unless(arch('x86_64') or arch('aarch64'), skip)],compile_and_run,[''])
 
 test('unpack_sums_1', normal, compile_and_run, ['-O'])
 test('unpack_sums_2', normal, compile, ['-O'])
-- 
GitLab