Compare revisions

19b84b8f · 0a64be45 · d17ec357 · 6bf3c5bc · ede58bf3 · 154219d3
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -536,6 +536,20 @@ nightly-aarch64-linux-deb10:
  variables:
    TEST_TYPE: slowtest

+.build-aarch64-linux-deb10-llvm:
+  extends: .build-aarch64-linux-deb10
+  stage: full-build
+  variables:
+    BUILD_FLAVOUR: perf-llvm
+  tags:
+    - aarch64-linux
+
+validate-aarch64-linux-deb10-llvm:
+  extends: .build-aarch64-linux-deb10-llvm
+  artifacts:
+    when: always
+    expire_in: 2 week
+
 #################################
 # armv7-linux-deb10
 #################################

--- a/aclocal.m4
+++ b/aclocal.m4
@@ -118,7 +118,7 @@ AC_DEFUN([FPTOOLS_SET_PLATFORM_VARS],
        GHC_CONVERT_OS([$target_os], [$TargetArch], [TargetOS])
    fi

-    GHC_LLVM_TARGET([$target_cpu],[$target_vendor],[$target_os],[LlvmTarget])
+    GHC_LLVM_TARGET([$target],[$target_cpu],[$target_vendor],[$target_os],[LlvmTarget])

    GHC_SELECT_FILE_EXTENSIONS([$host], [exeext_host], [soext_host])
    GHC_SELECT_FILE_EXTENSIONS([$target], [exeext_target], [soext_target])
@@ -218,7 +218,7 @@ AC_DEFUN([FPTOOLS_SET_HASKELL_PLATFORM_VARS],
            test -z "[$]2" || eval "[$]2=\"ArchARM \$ARM_ISA \$ARM_ISA_EXT \$ARM_ABI\""
            ;;
        aarch64)
-            test -z "[$]2" || eval "[$]2=ArchARM64"
+            test -z "[$]2" || eval "[$]2=ArchAArch64"
            ;;
        alpha)
            test -z "[$]2" || eval "[$]2=ArchAlpha"
@@ -327,9 +327,14 @@ AC_DEFUN([FPTOOLS_SET_HASKELL_PLATFORM_VARS],
    AC_LINK_IFELSE(
        [AC_LANG_PROGRAM([], [__asm__ (".subsections_via_symbols");])],
        [AC_MSG_RESULT(yes)
-         TargetHasSubsectionsViaSymbols=YES
-         AC_DEFINE([HAVE_SUBSECTIONS_VIA_SYMBOLS],[1],
+         if test x"$TargetArch" = xaarch64; then
+            dnl subsections via symbols is busted on arm64
+            TargetHasSubsectionsViaSymbols=NO
+         else
+            TargetHasSubsectionsViaSymbols=YES
+            AC_DEFINE([HAVE_SUBSECTIONS_VIA_SYMBOLS],[1],
                   [Define to 1 if Apple-style dead-stripping is supported.])
+         fi
        ],
        [TargetHasSubsectionsViaSymbols=NO
         AC_MSG_RESULT(no)])
@@ -1976,7 +1981,7 @@ AC_MSG_CHECKING(for path to top of build tree)
 # `libraries/base/System/Info.hs`'s documentation.
 AC_DEFUN([GHC_CONVERT_CPU],[
 case "$1" in
-  aarch64*)
+  aarch64*|arm64*)
    $2="aarch64"
    ;;
  alpha*)
@@ -2058,18 +2063,19 @@ case "$1" in
  esac
 ])

-# GHC_LLVM_TARGET(target_cpu, target_vendor, target_os, llvm_target_var)
+# GHC_LLVM_TARGET(target, target_cpu, target_vendor, target_os, llvm_target_var)
 # --------------------------------
 # converts the canonicalized target into something llvm can understand
 AC_DEFUN([GHC_LLVM_TARGET], [
-  case "$2-$3" in
+  llvm_target_cpu=$2
+  case "$1" in
    *-freebsd*-gnueabihf)
      llvm_target_vendor="unknown"
      llvm_target_os="freebsd-gnueabihf"
      ;;
-    hardfloat-*eabi)
+    *-hardfloat-*eabi)
      llvm_target_vendor="unknown"
-      llvm_target_os="$3""hf"
+      llvm_target_os="$4""hf"
      ;;
    *-mingw32|*-mingw64|*-msys)
      llvm_target_vendor="unknown"
@@ -2080,15 +2086,25 @@ AC_DEFUN([GHC_LLVM_TARGET], [
    # turned into just `-linux` and fail to be found
    # in the `llvm-targets` file.
    *-android*|*-gnueabi*|*-musleabi*)
-      GHC_CONVERT_VENDOR([$2],[llvm_target_vendor])
-      llvm_target_os="$3"
+      GHC_CONVERT_VENDOR([$3],[llvm_target_vendor])
+      llvm_target_os="$4"
+      ;;
+    # apple is a bit about their naming scheme for
+    # aarch64; and clang on macOS doesn't know that
+    # aarch64 would be arm64. So for LLVM we'll need
+    # to call it arm64; while we'll refer to it internally
+    # as aarch64 for consistency and sanity.
+    aarch64-apple-*|arm64-apple-*)
+      llvm_target_cpu="arm64"
+      GHC_CONVERT_VENDOR([$3],[llvm_target_vendor])
+      GHC_CONVERT_OS([$4],[$2],[llvm_target_os])
      ;;
    *)
-      GHC_CONVERT_VENDOR([$2],[llvm_target_vendor])
-      GHC_CONVERT_OS([$3],[$1],[llvm_target_os])
+      GHC_CONVERT_VENDOR([$3],[llvm_target_vendor])
+      GHC_CONVERT_OS([$4],[$2],[llvm_target_os])
      ;;
  esac
-  $4="$1-$llvm_target_vendor-$llvm_target_os"
+  $5="$llvm_target_cpu-$llvm_target_vendor-$llvm_target_os"
 ])



--- a/compiler/GHC/Builtin/Names.hs
+++ b/compiler/GHC/Builtin/Names.hs
@@ -336,7 +336,7 @@ basicKnownKeyNames
        -- FFI primitive types that are not wired-in.
        stablePtrTyConName, ptrTyConName, funPtrTyConName,
        int8TyConName, int16TyConName, int32TyConName, int64TyConName,
-        word16TyConName, word32TyConName, word64TyConName,
+        word8TyConName, word16TyConName, word32TyConName, word64TyConName,

        -- Others
        otherwiseIdName, inlineIdName,
@@ -1463,7 +1463,8 @@ int32TyConName    = tcQual gHC_INT  (fsLit "Int32") int32TyConKey
 int64TyConName    = tcQual gHC_INT  (fsLit "Int64") int64TyConKey

 -- Word module
-word16TyConName, word32TyConName, word64TyConName :: Name
+word8TyConName, word16TyConName, word32TyConName, word64TyConName :: Name
+word8TyConName    = tcQual  gHC_WORD (fsLit "Word8")  word8TyConKey
 word16TyConName   = tcQual  gHC_WORD (fsLit "Word16") word16TyConKey
 word32TyConName   = tcQual  gHC_WORD (fsLit "Word32") word32TyConKey
 word64TyConName   = tcQual  gHC_WORD (fsLit "Word64") word64TyConKey

--- a/compiler/GHC/Builtin/Types.hs
+++ b/compiler/GHC/Builtin/Types.hs
@@ -55,7 +55,7 @@ module GHC.Builtin.Types (
        wordTyCon, wordDataCon, wordTyConName, wordTy,

        -- * Word8
-        word8TyCon, word8DataCon, word8TyConName, word8Ty,
+        word8TyCon, word8DataCon, word8Ty,

        -- * List
        listTyCon, listTyCon_RDR, listTyConName, listTyConKey,
@@ -251,7 +251,6 @@ wiredInTyCons = [ -- Units are not treated like other tuples, because they
                , floatTyCon
                , intTyCon
                , wordTyCon
-                , word8TyCon
                , listTyCon
                , orderingTyCon
                , maybeTyCon
@@ -354,10 +353,9 @@ nothingDataConName = mkWiredInDataConName UserSyntax gHC_MAYBE (fsLit "Nothing")
 justDataConName    = mkWiredInDataConName UserSyntax gHC_MAYBE (fsLit "Just")
                                          justDataConKey justDataCon

-wordTyConName, wordDataConName, word8TyConName, word8DataConName :: Name
+wordTyConName, wordDataConName, word8DataConName :: Name
 wordTyConName      = mkWiredInTyConName   UserSyntax gHC_TYPES (fsLit "Word")   wordTyConKey     wordTyCon
 wordDataConName    = mkWiredInDataConName UserSyntax gHC_TYPES (fsLit "W#")     wordDataConKey   wordDataCon
-word8TyConName     = mkWiredInTyConName   UserSyntax gHC_WORD  (fsLit "Word8")  word8TyConKey    word8TyCon
 word8DataConName   = mkWiredInDataConName UserSyntax gHC_WORD  (fsLit "W8#")    word8DataConKey  word8DataCon

 floatTyConName, floatDataConName, doubleTyConName, doubleDataConName :: Name
@@ -1641,7 +1639,7 @@ word8TyCon = pcTyCon word8TyConName
                            (NoSourceText, fsLit "HsWord8"))) []
                     [word8DataCon]
 word8DataCon :: DataCon
-word8DataCon = pcDataCon word8DataConName [] [wordPrimTy] word8TyCon
+word8DataCon = pcDataCon word8DataConName [] [word8PrimTy] word8TyCon

 floatTy :: Type
 floatTy = mkTyConTy floatTyCon

--- a/compiler/GHC/Builtin/primops.txt.pp
+++ b/compiler/GHC/Builtin/primops.txt.pp
@@ -291,8 +291,8 @@ section "Int8#"

 primtype Int8#

-primop Int8Extend "extendInt8#" GenPrimOp Int8# -> Int#
-primop Int8Narrow "narrowInt8#" GenPrimOp Int# -> Int8#
+primop Int8ExtendOp "extendInt8#" GenPrimOp Int8# -> Int#
+primop Int8NarrowOp "narrowInt8#" GenPrimOp Int# -> Int8#

 primop Int8NegOp "negateInt8#" GenPrimOp Int8# -> Int8#

@@ -332,8 +332,8 @@ section "Word8#"

 primtype Word8#

-primop Word8Extend "extendWord8#" GenPrimOp Word8# -> Word#
-primop Word8Narrow "narrowWord8#" GenPrimOp Word# -> Word8#
+primop Word8ExtendOp "extendWord8#" GenPrimOp Word8# -> Word#
+primop Word8NarrowOp "narrowWord8#" GenPrimOp Word# -> Word8#

 primop Word8NotOp "notWord8#" GenPrimOp Word8# -> Word8#

@@ -373,8 +373,8 @@ section "Int16#"

 primtype Int16#

-primop Int16Extend "extendInt16#" GenPrimOp Int16# -> Int#
-primop Int16Narrow "narrowInt16#" GenPrimOp Int# -> Int16#
+primop Int16ExtendOp "extendInt16#" GenPrimOp Int16# -> Int#
+primop Int16NarrowOp "narrowInt16#" GenPrimOp Int# -> Int16#

 primop Int16NegOp "negateInt16#" GenPrimOp Int16# -> Int16#

@@ -414,8 +414,8 @@ section "Word16#"

 primtype Word16#

-primop Word16Extend "extendWord16#" GenPrimOp Word16# -> Word#
-primop Word16Narrow "narrowWord16#" GenPrimOp Word# -> Word16#
+primop Word16ExtendOp "extendWord16#" GenPrimOp Word16# -> Word#
+primop Word16NarrowOp "narrowWord16#" GenPrimOp Word# -> Word16#

 primop Word16NotOp "notWord16#" GenPrimOp Word16# -> Word16#

@@ -448,6 +448,26 @@ primop Word16LeOp "leWord16#" Compare Word16# -> Word16# -> Int#
 primop Word16LtOp "ltWord16#" Compare Word16# -> Word16# -> Int#
 primop Word16NeOp "neWord16#" Compare Word16# -> Word16# -> Int#

+------------------------------------------------------------------------
+section "Int32#"
+        {Operations on 32-bit integers.}
+------------------------------------------------------------------------
+
+primtype Int32#
+
+primop Int32ExtendOp "extendInt32#" GenPrimOp Int32# -> Int#
+primop Int32NarrowOp "narrowInt32#" GenPrimOp Int# -> Int32#
+
+------------------------------------------------------------------------
+section "Word32#"
+        {Operations on 32-bit unsigned integers.}
+------------------------------------------------------------------------
+
+primtype Word32#
+
+primop Word32ExtendOp "extendWord32#" GenPrimOp Word32# -> Word#
+primop Word32NarrowOp "narrowWord32#" GenPrimOp Word# -> Word32#
+
 #if WORD_SIZE_IN_BITS < 64
 ------------------------------------------------------------------------
 section "Int64#"

--- a/compiler/GHC/ByteCode/Asm.hs
+++ b/compiler/GHC/ByteCode/Asm.hs
@@ -464,6 +464,12 @@ assembleI platform i = case i of
    literal (LitNumber nt i) = case nt of
      LitNumInt     -> int (fromIntegral i)
      LitNumWord    -> int (fromIntegral i)
+      LitNumInt8    -> int8 (fromIntegral i)
+      LitNumWord8   -> int8 (fromIntegral i)
+      LitNumInt16   -> int16 (fromIntegral i)
+      LitNumWord16  -> int16 (fromIntegral i)
+      LitNumInt32   -> int32 (fromIntegral i)
+      LitNumWord32  -> int32 (fromIntegral i)
      LitNumInt64   -> int64 (fromIntegral i)
      LitNumWord64  -> int64 (fromIntegral i)
      LitNumInteger -> panic "GHC.ByteCode.Asm.literal: LitNumInteger"
@@ -478,6 +484,9 @@ assembleI platform i = case i of
    float = words . mkLitF
    double = words . mkLitD platform
    int = words . mkLitI
+    int8 = words . mkLitI64 platform
+    int16 = words . mkLitI64 platform
+    int32 = words . mkLitI64 platform
    int64 = words . mkLitI64 platform
    words ws = lit (map BCONPtrWord ws)
    word w = words [w]

--- a/compiler/GHC/Cmm/CLabel.hs
+++ b/compiler/GHC/Cmm/CLabel.hs
@@ -141,6 +141,8 @@ import GHC.Utils.Outputable
 import GHC.Utils.Panic
 import GHC.Data.FastString
 import GHC.Driver.Session
+import GHC.Driver.Backend
+import GHC.Driver.Ppr
 import GHC.Platform
 import GHC.Types.Unique.Set
 import GHC.Utils.Misc
@@ -280,6 +282,12 @@ data CLabel

  deriving Eq

+instance Show CLabel where
+  show = showPprUnsafe . pprDebugCLabel genericPlatform
+
+instance Outputable CLabel where
+  ppr = text . show
+
 isIdLabel :: CLabel -> Bool
 isIdLabel IdLabel{} = True
 isIdLabel _ = False
@@ -404,7 +412,6 @@ data ForeignLabelSource

   deriving (Eq, Ord)

-
 -- | For debugging problems with the CLabel representation.
 --      We can't make a Show instance for CLabel because lots of its components don't have instances.
 --      The regular Outputable instance only shows the label name, and not its other info.
@@ -1401,14 +1408,19 @@ pprDynamicLinkerAsmLabel platform dllInfo ppLbl =
      OSDarwin
        | platformArch platform == ArchX86_64 ->
          case dllInfo of
-            CodeStub        -> char 'L' <> ppLbl <> text "$stub"
-            SymbolPtr       -> char 'L' <> ppLbl <> text "$non_lazy_ptr"
+            CodeStub        -> text "L" <> ppLbl <> text "$stub"
+            SymbolPtr       -> text "L" <> ppLbl <> text "$non_lazy_ptr"
            GotSymbolPtr    -> ppLbl <> text "@GOTPCREL"
            GotSymbolOffset -> ppLbl
+        | platformArch platform == ArchAArch64 -> ppLbl
+        --   case dllInfo of
+        --     CodeStub        -> text "L" <> ppLbl <> text "$stub"
+        --     SymbolPtr       -> text "L" <> ppLbl <> text "$non_lazy_ptr"
+        --     _               -> ppLbl
        | otherwise ->
          case dllInfo of
-            CodeStub  -> char 'L' <> ppLbl <> text "$stub"
-            SymbolPtr -> char 'L' <> ppLbl <> text "$non_lazy_ptr"
+            CodeStub  -> text "L" <> ppLbl <> text "$stub"
+            SymbolPtr -> text "L" <> ppLbl <> text "$non_lazy_ptr"
            _         -> panic "pprDynamicLinkerAsmLabel"

      OSAIX ->
@@ -1433,6 +1445,10 @@ pprDynamicLinkerAsmLabel platform dllInfo ppLbl =
          SymbolPtr -> text ".LC_" <> ppLbl
          _         -> panic "pprDynamicLinkerAsmLabel"

+      | platformArch platform == ArchAArch64
+      = ppLbl
+
+
      | platformArch platform == ArchX86_64
      = case dllInfo of
          CodeStub        -> ppLbl <> text "@plt"

--- a/compiler/GHC/Cmm/Expr.hs
+++ b/compiler/GHC/Cmm/Expr.hs
@@ -64,6 +64,7 @@ data CmmExpr
        --        ** is shorthand only, meaning **
        -- CmmMachOp (MO_Add rep) [x, CmmLit (CmmInt (fromIntegral i) rep)]
        --      where rep = typeWidth (cmmRegType reg)
+  deriving Show

 instance Eq CmmExpr where       -- Equality ignores the types
  CmmLit l1          == CmmLit l2          = l1==l2
@@ -77,7 +78,7 @@ instance Eq CmmExpr where       -- Equality ignores the types
 data CmmReg
  = CmmLocal  {-# UNPACK #-} !LocalReg
  | CmmGlobal GlobalReg
-  deriving( Eq, Ord )
+  deriving( Eq, Ord, Show )

 -- | A stack area is either the stack slot where a variable is spilled
 -- or the stack space where function arguments and results are passed.
@@ -85,7 +86,7 @@ data Area
  = Old            -- See Note [Old Area]
  | Young {-# UNPACK #-} !BlockId  -- Invariant: must be a continuation BlockId
                   -- See Note [Continuation BlockId] in GHC.Cmm.Node.
-  deriving (Eq, Ord)
+  deriving (Eq, Ord, Show)

 {- Note [Old Area]
 ~~~~~~~~~~~~~~~~~~
@@ -208,7 +209,7 @@ data CmmLit
                     -- During the stack-layout pass, CmmHighStackMark
                     -- is replaced by a CmmInt for the actual number
                     -- of bytes used
-  deriving Eq
+  deriving (Eq, Show)

 cmmExprType :: Platform -> CmmExpr -> CmmType
 cmmExprType platform = \case
@@ -268,6 +269,7 @@ data LocalReg
    -- ^ Parameters:
    --   1. Identifier
    --   2. Type
+  deriving Show

 instance Eq LocalReg where
  (LocalReg u1 _) == (LocalReg u2 _) = u1 == u2

--- a/compiler/GHC/Cmm/Parser.y
+++ b/compiler/GHC/Cmm/Parser.y
@@ -1020,6 +1020,45 @@ machOps = listToUFM $
 callishMachOps :: Platform -> UniqFM FastString ([CmmExpr] -> (CallishMachOp, [CmmExpr]))
 callishMachOps platform = listToUFM $
        map (\(x, y) -> (mkFastString x, y)) [
+
+        ( "pow64f", (MO_F64_Pwr,) ),
+        ( "sin64f", (MO_F64_Sin,) ),
+        ( "cos64f", (MO_F64_Cos,) ),
+        ( "tan64f", (MO_F64_Tan,) ),
+        ( "sinh64f", (MO_F64_Sinh,) ),
+        ( "cosh64f", (MO_F64_Cosh,) ),
+        ( "tanh64f", (MO_F64_Tanh,) ),
+        ( "asin64f", (MO_F64_Asin,) ),
+        ( "acos64f", (MO_F64_Acos,) ),
+        ( "atan64f", (MO_F64_Atan,) ),
+        ( "asinh64f", (MO_F64_Asinh,) ),
+        ( "acosh64f", (MO_F64_Acosh,) ),
+        ( "log64f", (MO_F64_Log,) ),
+        ( "log1p64f", (MO_F64_Log1P,) ),
+        ( "exp64f", (MO_F64_Exp,) ),
+        ( "expM164f", (MO_F64_ExpM1,) ),
+        ( "fabs64f", (MO_F64_Fabs,) ),
+        ( "sqrt64f", (MO_F64_Sqrt,) ),
+
+        ( "pow32f", (MO_F32_Pwr,) ),
+        ( "sin32f", (MO_F32_Sin,) ),
+        ( "cos32f", (MO_F32_Cos,) ),
+        ( "tan32f", (MO_F32_Tan,) ),
+        ( "sinh32f", (MO_F32_Sinh,) ),
+        ( "cosh32f", (MO_F32_Cosh,) ),
+        ( "tanh32f", (MO_F32_Tanh,) ),
+        ( "asin32f", (MO_F32_Asin,) ),
+        ( "acos32f", (MO_F32_Acos,) ),
+        ( "atan32f", (MO_F32_Atan,) ),
+        ( "asinh32f", (MO_F32_Asinh,) ),
+        ( "acosh32f", (MO_F32_Acosh,) ),
+        ( "log32f", (MO_F32_Log,) ),
+        ( "log1p32f", (MO_F32_Log1P,) ),
+        ( "exp32f", (MO_F32_Exp,) ),
+        ( "expM132f", (MO_F32_ExpM1,) ),
+        ( "fabs32f", (MO_F32_Fabs,) ),
+        ( "sqrt32f", (MO_F32_Sqrt,) ),
+
        ( "read_barrier", (MO_ReadBarrier,)),
        ( "write_barrier", (MO_WriteBarrier,)),
        ( "memcpy", memcpyLikeTweakArgs MO_Memcpy ),
@@ -1057,9 +1096,7 @@ callishMachOps platform = listToUFM $
        ( "xchg32", (MO_Xchg W32,)),
        ( "xchg64", (MO_Xchg W64,))

-        -- ToDo: the rest, maybe
-        -- edit: which rest?
-        -- also: how do we tell CMM Lint how to type check callish macops?
+        -- ToDo: how do we tell CMM Lint how to type check callish macops?
    ]
  where
    memcpyLikeTweakArgs :: (Int -> CallishMachOp) -> [CmmExpr] -> (CallishMachOp, [CmmExpr])

--- a/compiler/GHC/Cmm/Type.hs
+++ b/compiler/GHC/Cmm/Type.hs
@@ -51,13 +51,14 @@ import Data.Int

 data CmmType    -- The important one!
  = CmmType CmmCat Width
+  deriving Show

 data CmmCat                -- "Category" (not exported)
   = GcPtrCat              -- GC pointer
   | BitsCat               -- Non-pointer
   | FloatCat              -- Float
   | VecCat Length CmmCat  -- Vector
-   deriving( Eq )
+   deriving( Eq, Show )
        -- See Note [Signed vs unsigned] at the end

 instance Outputable CmmType where
@@ -429,4 +430,3 @@ C calling convention rather early on in the compiler).  However, given
 this, the cons outweigh the pros.

 -}
-
--- a/compiler/GHC/Cmm/Utils.hs
+++ b/compiler/GHC/Cmm/Utils.hs
@@ -14,7 +14,7 @@

 module GHC.Cmm.Utils(
        -- CmmType
-        primRepCmmType, slotCmmType, slotForeignHint,
+        primRepCmmType, slotCmmType,
        typeCmmType, typeForeignHint, primRepForeignHint,

        -- CmmLit
@@ -159,13 +159,6 @@ primRepForeignHint FloatRep     = NoHint
 primRepForeignHint DoubleRep    = NoHint
 primRepForeignHint (VecRep {})  = NoHint

-slotForeignHint :: SlotTy -> ForeignHint
-slotForeignHint PtrSlot       = AddrHint
-slotForeignHint WordSlot      = NoHint
-slotForeignHint Word64Slot    = NoHint
-slotForeignHint FloatSlot     = NoHint
-slotForeignHint DoubleSlot    = NoHint
-
 typeForeignHint :: UnaryType -> ForeignHint
 typeForeignHint = primRepForeignHint . typePrimRep1


--- a/compiler/GHC/CmmToAsm.hs
+++ b/compiler/GHC/CmmToAsm.hs
@@ -87,6 +87,7 @@ import GHC.Prelude
 import qualified GHC.CmmToAsm.X86   as X86
 import qualified GHC.CmmToAsm.PPC   as PPC
 import qualified GHC.CmmToAsm.SPARC as SPARC
+import qualified GHC.CmmToAsm.AArch64 as AArch64

 import GHC.CmmToAsm.Reg.Liveness
 import qualified GHC.CmmToAsm.Reg.Linear                as Linear
@@ -166,14 +167,13 @@ nativeCodeGen dflags this_mod modLoc h us cmms
      ArchSPARC64   -> panic "nativeCodeGen: No NCG for SPARC64"
      ArchS390X     -> panic "nativeCodeGen: No NCG for S390X"
      ArchARM {}    -> panic "nativeCodeGen: No NCG for ARM"
-      ArchARM64     -> panic "nativeCodeGen: No NCG for ARM64"
+      ArchAArch64   -> nCG' (AArch64.ncgAArch64 config)
      ArchAlpha     -> panic "nativeCodeGen: No NCG for Alpha"
      ArchMipseb    -> panic "nativeCodeGen: No NCG for mipseb"
      ArchMipsel    -> panic "nativeCodeGen: No NCG for mipsel"
      ArchUnknown   -> panic "nativeCodeGen: No NCG for unknown arch"
      ArchJavaScript-> panic "nativeCodeGen: No NCG for JavaScript"

-
 -- | Data accumulated during code generation. Mostly about statistics,
 -- but also collects debug data for DWARF generation.
 data NativeGenAcc statics instr
@@ -1188,8 +1188,7 @@ initNCGConfig dflags = NCGConfig
            ArchX86    -> v
            _          -> Nothing

-   , ncgDwarfEnabled        = debugLevel dflags > 0
-   , ncgDwarfUnwindings     = debugLevel dflags >= 1
-   , ncgDwarfStripBlockInfo = debugLevel dflags < 2 -- We strip out block information when running with -g0 or -g1.
+   , ncgDwarfEnabled        = osElfTarget (platformOS (targetPlatform dflags)) && debugLevel dflags > 0
+   , ncgDwarfUnwindings     = osElfTarget (platformOS (targetPlatform dflags)) && debugLevel dflags >= 1
+   , ncgDwarfStripBlockInfo = osElfTarget (platformOS (targetPlatform dflags)) && debugLevel dflags < 2 -- We strip out block information when running with -g0 or -g1.
   }
-
--- a/compiler/GHC/CmmToAsm/AArch64.hs
+++ b/compiler/GHC/CmmToAsm/AArch64.hs
+{-# OPTIONS_GHC -fno-warn-orphans #-}
+
+-- | Native code generator for x86 and x86-64 architectures
+module GHC.CmmToAsm.AArch64
+   ( ncgAArch64 )
+where
+
+import GHC.Prelude
+
+import GHC.CmmToAsm.Instr
+import GHC.CmmToAsm.Monad
+import GHC.CmmToAsm.Config
+import GHC.CmmToAsm.Types
+
+import qualified GHC.CmmToAsm.AArch64.Instr   as AArch64
+import qualified GHC.CmmToAsm.AArch64.Ppr     as AArch64
+import qualified GHC.CmmToAsm.AArch64.CodeGen as AArch64
+import qualified GHC.CmmToAsm.AArch64.Regs    as AArch64
+import qualified GHC.CmmToAsm.AArch64.RegInfo as AArch64
+
+ncgAArch64 :: NCGConfig -> NcgImpl RawCmmStatics AArch64.Instr AArch64.JumpDest
+ncgAArch64 config
+ = NcgImpl {
+        ncgConfig                 = config
+       ,cmmTopCodeGen             = AArch64.cmmTopCodeGen
+       ,generateJumpTableForInstr = AArch64.generateJumpTableForInstr config
+       ,getJumpDestBlockId        = AArch64.getJumpDestBlockId
+       ,canShortcut               = AArch64.canShortcut
+       ,shortcutStatics           = AArch64.shortcutStatics
+       ,shortcutJump              = AArch64.shortcutJump
+       ,pprNatCmmDecl             = AArch64.pprNatCmmDecl config
+       ,maxSpillSlots             = AArch64.maxSpillSlots config
+       ,allocatableRegs           = AArch64.allocatableRegs platform
+       ,ncgAllocMoreStack         = AArch64.allocMoreStack platform
+       ,ncgExpandTop              = id
+       ,ncgMakeFarBranches        = const id
+       ,extractUnwindPoints       = const []
+       ,invertCondBranches        = \_ _ -> id
+  }
+    where
+      platform = ncgPlatform config
+
+-- | Instruction instance for aarch64
+instance Instruction AArch64.Instr where
+        regUsageOfInstr         = AArch64.regUsageOfInstr
+        patchRegsOfInstr        = AArch64.patchRegsOfInstr
+        isJumpishInstr          = AArch64.isJumpishInstr
+        jumpDestsOfInstr        = AArch64.jumpDestsOfInstr
+        patchJumpInstr          = AArch64.patchJumpInstr
+        mkSpillInstr            = AArch64.mkSpillInstr
+        mkLoadInstr             = AArch64.mkLoadInstr
+        takeDeltaInstr          = AArch64.takeDeltaInstr
+        isMetaInstr             = AArch64.isMetaInstr
+        mkRegRegMoveInstr _     = AArch64.mkRegRegMoveInstr
+        takeRegRegMoveInstr     = AArch64.takeRegRegMoveInstr
+        mkJumpInstr             = AArch64.mkJumpInstr
+        mkStackAllocInstr       = AArch64.mkStackAllocInstr
+        mkStackDeallocInstr     = AArch64.mkStackDeallocInstr
+        mkComment               = pure . AArch64.COMMENT
+        pprInstr                = AArch64.pprInstr
--- a/compiler/GHC/CmmToAsm/AArch64/CodeGen.hs
+++ b/compiler/GHC/CmmToAsm/AArch64/CodeGen.hs
--- a/compiler/GHC/CmmToAsm/AArch64/Cond.hs
+++ b/compiler/GHC/CmmToAsm/AArch64/Cond.hs
+module GHC.CmmToAsm.AArch64.Cond  where
+
+import GHC.Prelude
+
+-- https://developer.arm.com/documentation/den0024/a/the-a64-instruction-set/data-processing-instructions/conditional-instructions
+
+-- XXX: This appears to go a bit overboard? Maybe we should stick with what LLVM
+-- settled on for fcmp?
+-- false: always yields false, regardless of operands.
+-- oeq: yields true if both operands are not a QNAN and op1 is equal to op2.
+-- ogt: yields true if both operands are not a QNAN and op1 is greater than op2.
+-- oge: yields true if both operands are not a QNAN and op1 is greater than or equal to op2.
+-- olt: yields true if both operands are not a QNAN and op1 is less than op2.
+-- ole: yields true if both operands are not a QNAN and op1 is less than or equal to op2.
+-- one: yields true if both operands are not a QNAN and op1 is not equal to op2.
+-- ord: yields true if both operands are not a QNAN.
+-- ueq: yields true if either operand is a QNAN or op1 is equal to op2.
+-- ugt: yields true if either operand is a QNAN or op1 is greater than op2.
+-- uge: yields true if either operand is a QNAN or op1 is greater than or equal to op2.
+-- ult: yields true if either operand is a QNAN or op1 is less than op2.
+-- ule: yields true if either operand is a QNAN or op1 is less than or equal to op2.
+-- une: yields true if either operand is a QNAN or op1 is not equal to op2.
+-- uno: yields true if either operand is a QNAN.
+-- true: always yields true, regardless of operands.
+--
+-- LLVMs icmp knows about:
+-- eq: yields true if the operands are equal, false otherwise. No sign interpretation is necessary or performed.
+-- ne: yields true if the operands are unequal, false otherwise. No sign interpretation is necessary or performed.
+-- ugt: interprets the operands as unsigned values and yields true if op1 is greater than op2.
+-- uge: interprets the operands as unsigned values and yields true if op1 is greater than or equal to op2.
+-- ult: interprets the operands as unsigned values and yields true if op1 is less than op2.
+-- ule: interprets the operands as unsigned values and yields true if op1 is less than or equal to op2.
+-- sgt: interprets the operands as signed values and yields true if op1 is greater than op2.
+-- sge: interprets the operands as signed values and yields true if op1 is greater than or equal to op2.
+-- slt: interprets the operands as signed values and yields true if op1 is less than op2.
+-- sle: interprets the operands as signed values and yields true if op1 is less than or equal to op2.
+
+data Cond
+    = ALWAYS -- b.al
+    | EQ     -- b.eq
+    | NE     -- b.ne
+    -- signed
+    | SLT    -- b.lt
+    | SLE    -- b.le
+    | SGE    -- b.ge
+    | SGT    -- b.gt
+    -- unsigned
+    | ULT    -- b.lo
+    | ULE    -- b.ls
+    | UGE    -- b.hs
+    | UGT    -- b.hi
+    -- ordered
+    | OLT    -- b.mi
+    | OLE    -- b.ls
+    | OGE    -- b.ge
+    | OGT    -- b.gt
+    -- unordered
+    | UOLT   -- b.lt
+    | UOLE   -- b.le
+    | UOGE   -- b.pl
+    | UOGT   -- b.hi
+    -- others
+    | NEVER  -- b.nv
+    | VS     -- oVerflow set
+    | VC     -- oVerflow clear
+    deriving Eq
--- a/compiler/GHC/CmmToAsm/AArch64/Instr.hs
+++ b/compiler/GHC/CmmToAsm/AArch64/Instr.hs
+{-# OPTIONS_GHC -fno-warn-orphans #-}
+{-# language CPP, BangPatterns #-}
+
+module GHC.CmmToAsm.AArch64.Instr
+
+where
+
+#include "HsVersions.h"
+
+import GHC.Prelude
+
+import GHC.CmmToAsm.AArch64.Cond
+import GHC.CmmToAsm.AArch64.Regs
+
+import GHC.CmmToAsm.Instr (RegUsage(..))
+import GHC.CmmToAsm.Format
+import GHC.CmmToAsm.Types
+import GHC.CmmToAsm.Utils
+-- import GHC.CmmToAsm.Reg.Target
+import GHC.CmmToAsm.Config
+-- import GHC.Platform.Reg.Class
+import GHC.Platform.Reg
+
+import GHC.Platform.Regs
+import GHC.Cmm.BlockId
+import GHC.Cmm.Dataflow.Collections
+import GHC.Cmm.Dataflow.Label
+import GHC.Cmm
+-- import GHC.Cmm.Info
+-- import GHC.Data.FastString
+import GHC.Cmm.CLabel
+import GHC.Utils.Outputable
+import GHC.Platform
+-- import GHC.Types.Unique.FM (listToUFM, lookupUFM)
+import GHC.Types.Unique.Supply
+
+import GHC.Utils.Panic
+
+import Control.Monad (replicateM)
+import Data.Maybe (fromMaybe)
+
+-- import Debug.Trace
+import GHC.Stack
+
+import Data.Bits ((.&.), complement)
+
+-- | XXX: verify this!
+stackFrameHeaderSize :: Platform -> Int
+stackFrameHeaderSize _ = 64
+
+-- | All registers are 8 byte wide.
+spillSlotSize :: Int
+spillSlotSize = 8
+
+-- | The number of bytes that the stack pointer should be aligned
+-- to.
+stackAlign :: Int
+stackAlign = 16
+
+-- | The number of spill slots available without allocating more.
+maxSpillSlots :: NCGConfig -> Int
+maxSpillSlots config
+--  = 0 -- set to zero, to see when allocMoreStack has to fire.
+    = let platform = ncgPlatform config
+      in ((ncgSpillPreallocSize config - stackFrameHeaderSize platform)
+         `div` spillSlotSize) - 1
+
+-- | Convert a spill slot number to a *byte* offset, with no sign.
+spillSlotToOffset :: NCGConfig -> Int -> Int
+spillSlotToOffset config slot
+   = stackFrameHeaderSize (ncgPlatform config) + spillSlotSize * slot
+
+-- | Get the registers that are being used by this instruction.
+-- regUsage doesn't need to do any trickery for jumps and such.
+-- Just state precisely the regs read and written by that insn.
+-- The consequences of control flow transfers, as far as register
+-- allocation goes, are taken care of by the register allocator.
+--
+-- RegUsage = RU [<read regs>] [<write regs>]
+
+instance Outputable RegUsage where
+    ppr (RU reads writes) = text "RegUsage(reads:" <+> ppr reads <> comma <+> text "writes:" <+> ppr writes <> char ')'
+
+regUsageOfInstr :: Platform -> Instr -> RegUsage
+regUsageOfInstr platform instr = case instr of
+  ANN _ i                  -> regUsageOfInstr platform i
+  -- 1. Arithmetic Instructions ------------------------------------------------
+  ADD dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
+  CMN l r                  -> usage (regOp l ++ regOp r, [])
+  CMP l r                  -> usage (regOp l ++ regOp r, [])
+  MSUB dst src1 src2 src3  -> usage (regOp src1 ++ regOp src2 ++ regOp src3, regOp dst)
+  MUL dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
+  NEG dst src              -> usage (regOp src, regOp dst)
+  SDIV dst src1 src2       -> usage (regOp src1 ++ regOp src2, regOp dst)
+  SUB dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
+  UDIV dst src1 src2       -> usage (regOp src1 ++ regOp src2, regOp dst)
+
+  -- 2. Bit Manipulation Instructions ------------------------------------------
+  SBFM dst src _ _         -> usage (regOp src, regOp dst)
+  UBFM dst src _ _         -> usage (regOp src, regOp dst)
+
+  -- 3. Logical and Move Instructions ------------------------------------------
+  AND dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
+  ASR dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
+  BIC dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
+  BICS dst src1 src2       -> usage (regOp src1 ++ regOp src2, regOp dst)
+  EON dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
+  EOR dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
+  LSL dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
+  LSR dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
+  MOV dst src              -> usage (regOp src, regOp dst)
+  MOVK dst src             -> usage (regOp src, regOp dst)
+  MVN dst src              -> usage (regOp src, regOp dst)
+  ORR dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
+  ROR dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
+  TST src1 src2            -> usage (regOp src1 ++ regOp src2, [])
+  -- 4. Branch Instructions ----------------------------------------------------
+  J t                      -> usage (regTarget t, [])
+  B t                      -> usage (regTarget t, [])
+  BCOND _ t                -> usage (regTarget t, [])
+  BL t ps _rs              -> usage (regTarget t ++ ps, callerSavedRegisters)
+
+  -- 5. Atomic Instructions ----------------------------------------------------
+  -- 6. Conditional Instructions -----------------------------------------------
+  CSET dst _               -> usage ([], regOp dst)
+  CBZ src _                -> usage (regOp src, [])
+  CBNZ src _               -> usage (regOp src, [])
+  -- 7. Load and Store Instructions --------------------------------------------
+  STR _ src dst            -> usage (regOp src ++ regOp dst, [])
+  LDR _ dst src            -> usage (regOp src, regOp dst)
+  -- XXX is this right? see STR, which I'm only partial about being right?
+  STP _ src1 src2 dst      -> usage (regOp src1 ++ regOp src2 ++ regOp dst, [])
+  LDP _ dst1 dst2 src      -> usage (regOp src, regOp dst1 ++ regOp dst2)
+
+  -- 8. Synchronization Instructions -------------------------------------------
+  DMBSY                    -> usage ([], [])
+
+  -- 9. Floating Point Instructions --------------------------------------------
+  FCVT dst src             -> usage (regOp src, regOp dst)
+  SCVTF dst src            -> usage (regOp src, regOp dst)
+  FCVTZS dst src           -> usage (regOp src, regOp dst)
+
+  _ -> panic "regUsageOfInstr"
+
+  where
+        -- filtering the usage is necessary, otherwise the register
+        -- allocator will try to allocate pre-defined fixed stg
+        -- registers as well, as they show up.
+        usage (src, dst) = RU (filter (interesting platform) src)
+                              (filter (interesting platform) dst)
+
+        regAddr :: AddrMode -> [Reg]
+        regAddr (AddrRegReg r1 r2) = [r1, r2]
+        regAddr (AddrRegImm r1 _)  = [r1]
+        regAddr (AddrReg r1)       = [r1]
+        regOp :: Operand -> [Reg]
+        regOp (OpReg _ r1) = [r1]
+        regOp (OpRegExt _ r1 _ _) = [r1]
+        regOp (OpRegShift _ r1 _ _) = [r1]
+        regOp (OpAddr a) = regAddr a
+        regOp (OpImm _) = []
+        regOp (OpImmShift _ _ _) = []
+        regTarget :: Target -> [Reg]
+        regTarget (TBlock _) = []
+        regTarget (TLabel _) = []
+        regTarget (TReg r1)  = [r1]
+
+        -- Is this register interesting for the register allocator?
+        interesting :: Platform -> Reg -> Bool
+        interesting _        (RegVirtual _)                 = True
+        interesting _        (RegReal (RealRegSingle (-1))) = False
+        interesting platform (RegReal (RealRegSingle i))    = freeReg platform i
+        interesting _        (RegReal (RealRegPair{}))
+            = panic "AArch64.Instr.interesting: no reg pairs on this arch"
+
+-- Save caller save registers
+-- This is x0-x18
+--
+-- For SIMD/FP Registers:
+-- Registers v8-v15 must be preserved by a callee across subroutine calls;
+-- the remaining registers (v0-v7, v16-v31) do not need to be preserved (or
+-- should be preserved by the caller). Additionally, only the bottom 64 bits
+-- of each value stored in v8-v15 need to be preserved [7]; it is the
+-- responsibility of the caller to preserve larger values.
+--
+-- .---------------------------------------------------------------------------------------------------------------------------------------------------------------.
+-- |  0 |  1 |  2 |  3 |  4 |  5 |  6 |  7 |  8 |  9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 |
+-- | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 42 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 |
+-- |== General Purpose registers ==================================================================================================================================|
+-- | <---- argument passing -------------> | IR | <------- tmp registers --------> | IP0| IP1| PL | <------------------- callee saved ------------> | FP | LR | SP |
+-- | <------ free registers --------------------------------------------------------------------> | BR | Sp | Hp | R1 | R2 | R3 | R4 | R5 | R6 | SL | -- | -- | -- |
+-- |== SIMD/FP Registers ==========================================================================================================================================|
+-- | <---- argument passing -------------> | <-- callee saved (lower 64 bits) ---> | <--------------------------------------- caller saved ----------------------> |
+-- | <------ free registers -------------> | F1 | F2 | F3 | F4 | D1 | D2 | D3 | D4 | <------ free registers -----------------------------------------------------> |
+-- '---------------------------------------------------------------------------------------------------------------------------------------------------------------'
+-- IR: Indirect result location register, IP: Intra-procedure register, PL: Platform register, FP: Frame pointer, LR: Link register, SP: Stack pointer
+-- BR: Base, SL: SpLim
+callerSavedRegisters :: [Reg]
+callerSavedRegisters
+    = map regSingle [0..18]
+    ++ map regSingle [32..39]
+    ++ map regSingle [48..63]
+
+-- | Apply a given mapping to all the register references in this
+-- instruction.
+patchRegsOfInstr :: Instr -> (Reg -> Reg) -> Instr
+patchRegsOfInstr instr env = case instr of
+    -- 0. Meta Instructions
+    ANN d i        -> ANN d (patchRegsOfInstr i env)
+    -- 1. Arithmetic Instructions ----------------------------------------------
+    ADD o1 o2 o3   -> ADD (patchOp o1) (patchOp o2) (patchOp o3)
+    CMN o1 o2      -> CMN (patchOp o1) (patchOp o2)
+    CMP o1 o2      -> CMP (patchOp o1) (patchOp o2)
+    MSUB o1 o2 o3 o4 -> MSUB (patchOp o1) (patchOp o2) (patchOp o3) (patchOp o4)
+    MUL o1 o2 o3   -> MUL (patchOp o1) (patchOp o2) (patchOp o3)
+    NEG o1 o2      -> NEG (patchOp o1) (patchOp o2)
+    SDIV o1 o2 o3  -> SDIV (patchOp o1) (patchOp o2) (patchOp o3)
+    SUB o1 o2 o3   -> SUB  (patchOp o1) (patchOp o2) (patchOp o3)
+    UDIV o1 o2 o3  -> UDIV (patchOp o1) (patchOp o2) (patchOp o3)
+
+    -- 2. Bit Manipulation Instructions ----------------------------------------
+    SBFM o1 o2 o3 o4 -> SBFM (patchOp o1) (patchOp o2) (patchOp o3) (patchOp o4)
+    UBFM o1 o2 o3 o4 -> UBFM (patchOp o1) (patchOp o2) (patchOp o3) (patchOp o4)
+
+    -- 3. Logical and Move Instructions ----------------------------------------
+    AND o1 o2 o3   -> AND  (patchOp o1) (patchOp o2) (patchOp o3)
+    ANDS o1 o2 o3  -> ANDS (patchOp o1) (patchOp o2) (patchOp o3)
+    ASR o1 o2 o3   -> ASR  (patchOp o1) (patchOp o2) (patchOp o3)
+    BIC o1 o2 o3   -> BIC  (patchOp o1) (patchOp o2) (patchOp o3)
+    BICS o1 o2 o3  -> BICS (patchOp o1) (patchOp o2) (patchOp o3)
+    EON o1 o2 o3   -> EON  (patchOp o1) (patchOp o2) (patchOp o3)
+    EOR o1 o2 o3   -> EOR  (patchOp o1) (patchOp o2) (patchOp o3)
+    LSL o1 o2 o3   -> LSL  (patchOp o1) (patchOp o2) (patchOp o3)
+    LSR o1 o2 o3   -> LSR  (patchOp o1) (patchOp o2) (patchOp o3)
+    MOV o1 o2      -> MOV  (patchOp o1) (patchOp o2)
+    MOVK o1 o2     -> MOVK (patchOp o1) (patchOp o2)
+    MVN o1 o2      -> MVN  (patchOp o1) (patchOp o2)
+    ORR o1 o2 o3   -> ORR  (patchOp o1) (patchOp o2) (patchOp o3)
+    ROR o1 o2 o3   -> ROR  (patchOp o1) (patchOp o2) (patchOp o3)
+    TST o1 o2      -> TST  (patchOp o1) (patchOp o2)
+
+    -- 4. Branch Instructions --------------------------------------------------
+    J t            -> J (patchTarget t)
+    B t            -> B (patchTarget t)
+    BL t rs ts     -> BL (patchTarget t) rs ts
+    BCOND c t      -> BCOND c (patchTarget t)
+
+    -- 5. Atomic Instructions --------------------------------------------------
+    -- 6. Conditional Instructions ---------------------------------------------
+    CSET o c       -> CSET (patchOp o) c
+    CBZ o l        -> CBZ (patchOp o) l
+    CBNZ o l       -> CBNZ (patchOp o) l
+    -- 7. Load and Store Instructions ------------------------------------------
+    STR f o1 o2    -> STR f (patchOp o1) (patchOp o2)
+    LDR f o1 o2    -> LDR f (patchOp o1) (patchOp o2)
+    STP f o1 o2 o3 -> STP f (patchOp o1) (patchOp o2) (patchOp o3)
+    LDP f o1 o2 o3 -> LDP f (patchOp o1) (patchOp o2) (patchOp o3)
+
+    -- 8. Synchronization Instructions -----------------------------------------
+    DMBSY          -> DMBSY
+
+    -- 9. Floating Point Instructions ------------------------------------------
+    FCVT o1 o2     -> FCVT (patchOp o1) (patchOp o2)
+    SCVTF o1 o2    -> SCVTF (patchOp o1) (patchOp o2)
+    FCVTZS o1 o2   -> FCVTZS (patchOp o1) (patchOp o2)
+
+    _ -> pprPanic "patchRegsOfInstr" (text $ show instr)
+    where
+        patchOp :: Operand -> Operand
+        patchOp (OpReg w r) = OpReg w (env r)
+        patchOp (OpRegExt w r x s) = OpRegExt w (env r) x s
+        patchOp (OpRegShift w r m s) = OpRegShift w (env r) m s
+        patchOp (OpAddr a) = OpAddr (patchAddr a)
+        patchOp op = op
+        patchTarget :: Target -> Target
+        patchTarget (TReg r) = TReg (env r)
+        patchTarget t = t
+        patchAddr :: AddrMode -> AddrMode
+        patchAddr (AddrRegReg r1 r2) = AddrRegReg (env r1) (env r2)
+        patchAddr (AddrRegImm r1 i)  = AddrRegImm (env r1) i
+        patchAddr (AddrReg r) = AddrReg (env r)
+--------------------------------------------------------------------------------
+-- | Checks whether this instruction is a jump/branch instruction.
+-- One that can change the flow of control in a way that the
+-- register allocator needs to worry about.
+isJumpishInstr :: Instr -> Bool
+isJumpishInstr instr = case instr of
+    ANN _ i -> isJumpishInstr i
+    CBZ{} -> True
+    CBNZ{} -> True
+    J{} -> True
+    B{} -> True
+    BL{} -> True
+    BCOND{} -> True
+    _ -> False
+
+-- | Checks whether this instruction is a jump/branch instruction.
+-- One that can change the flow of control in a way that the
+-- register allocator needs to worry about.
+jumpDestsOfInstr :: Instr -> [BlockId]
+jumpDestsOfInstr (ANN _ i) = jumpDestsOfInstr i
+jumpDestsOfInstr (CBZ _ t) = [ id | TBlock id <- [t]]
+jumpDestsOfInstr (CBNZ _ t) = [ id | TBlock id <- [t]]
+jumpDestsOfInstr (J t) = [id | TBlock id <- [t]]
+jumpDestsOfInstr (B t) = [id | TBlock id <- [t]]
+jumpDestsOfInstr (BL t _ _) = [ id | TBlock id <- [t]]
+jumpDestsOfInstr (BCOND _ t) = [ id | TBlock id <- [t]]
+jumpDestsOfInstr _ = []
+
+-- | Change the destination of this jump instruction.
+-- Used in the linear allocator when adding fixup blocks for join
+-- points.
+patchJumpInstr :: Instr -> (BlockId -> BlockId) -> Instr
+patchJumpInstr instr patchF
+    = case instr of
+        ANN d i -> ANN d (patchJumpInstr i patchF)
+        CBZ r (TBlock bid) -> CBZ r (TBlock (patchF bid))
+        CBNZ r (TBlock bid) -> CBNZ r (TBlock (patchF bid))
+        J (TBlock bid) -> J (TBlock (patchF bid))
+        B (TBlock bid) -> B (TBlock (patchF bid))
+        BL (TBlock bid) ps rs -> BL (TBlock (patchF bid)) ps rs
+        BCOND c (TBlock bid) -> BCOND c (TBlock (patchF bid))
+        _ -> pprPanic "patchJumpInstr" (text $ show instr)
+
+-- -----------------------------------------------------------------------------
+-- Note [Spills and Reloads]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~
+-- We reserve @RESERVED_C_STACK_BYTES@ on the C stack for spilling and reloading
+-- registers.  AArch64s maximum displacement for SP relative spills and reloads
+-- is essentially [-256,255], or [0, 0xFFF]*8 = [0, 32760] for 64bits.
+--
+-- The @RESERVED_C_STACK_BYTES@ is 16k, so we can't address any location in a
+-- single instruction.  The idea is to use the Inter Procedure 0 (ip0) register
+-- to perform the computations for larger offsets.
+--
+-- Using sp to compute the offset will violate assumptions about the stack pointer
+-- pointing to the top of the stack during signal handling.  As we can't force
+-- every signal to use its own stack, we have to ensure that the stack poitner
+-- always poitns to the top of the stack, and we can't use it for computation.
+--
+-- | An instruction to spill a register into a spill slot.
+mkSpillInstr
+   :: HasCallStack
+   => NCGConfig
+   -> Reg       -- register to spill
+   -> Int       -- current stack delta
+   -> Int       -- spill slot to use
+   -> [Instr]
+
+mkSpillInstr config reg delta slot =
+  case (spillSlotToOffset config slot) - delta of
+    imm | -256 <= imm && imm <= 255                               -> [ mkStrSp imm ]
+    imm | imm > 0 && imm .&. 0x7 == 0x0 && imm <= 0xfff           -> [ mkStrSp imm ]
+    imm | imm > 0xfff && imm <= 0xffffff && imm .&. 0x7 == 0x0    -> [ mkIp0SpillAddr (imm .&~. 0xfff)
+                                                                     , mkStrIp0 (imm .&.  0xfff)
+                                                                     ]
+    imm -> pprPanic "mkSpillInstr" (text "Unable to spill into" <+> int imm)
+    where
+        a .&~. b = a .&. (complement b)
+
+        fmt = case reg of
+            RegReal (RealRegSingle n) | n < 32 -> II64
+            _                                  -> FF64
+        mkIp0SpillAddr imm = ANN (text "Spill: IP0 <- SP + " <> int imm) $ ADD ip0 sp (OpImm (ImmInt imm))
+        mkStrSp imm = ANN (text "Spill@" <> int (off - delta)) $ STR fmt (OpReg W64 reg) (OpAddr (AddrRegImm (regSingle 31) (ImmInt imm)))
+        mkStrIp0 imm = ANN (text "Spill@" <> int (off - delta)) $ STR fmt (OpReg W64 reg) (OpAddr (AddrRegImm (regSingle 16) (ImmInt imm)))
+
+        off = spillSlotToOffset config slot
+
+mkLoadInstr
+   :: NCGConfig
+   -> Reg       -- register to load
+   -> Int       -- current stack delta
+   -> Int       -- spill slot to use
+   -> [Instr]
+
+mkLoadInstr config reg delta slot =
+  case (spillSlotToOffset config slot) - delta of
+    imm | -256 <= imm && imm <= 255                               -> [ mkLdrSp imm ]
+    imm | imm > 0 && imm .&. 0x7 == 0x0 && imm <= 0xfff           -> [ mkLdrSp imm ]
+    imm | imm > 0xfff && imm <= 0xffffff && imm .&. 0x7 == 0x0    -> [ mkIp0SpillAddr (imm .&~. 0xfff)
+                                                                     , mkLdrIp0 (imm .&.  0xfff)
+                                                                     ]
+    imm -> pprPanic "mkSpillInstr" (text "Unable to spill into" <+> int imm)
+    where
+        a .&~. b = a .&. (complement b)
+
+        fmt = case reg of
+            RegReal (RealRegSingle n) | n < 32 -> II64
+            _                                  -> FF64
+
+        mkIp0SpillAddr imm = ANN (text "Reload: IP0 <- SP + " <> int imm) $ ADD ip0 sp (OpImm (ImmInt imm))
+        mkLdrSp imm = ANN (text "Reload@" <> int (off - delta)) $ LDR fmt (OpReg W64 reg) (OpAddr (AddrRegImm (regSingle 31) (ImmInt imm)))
+        mkLdrIp0 imm = ANN (text "Reload@" <> int (off - delta)) $ LDR fmt (OpReg W64 reg) (OpAddr (AddrRegImm (regSingle 16) (ImmInt imm)))
+
+        off = spillSlotToOffset config slot
+
+--------------------------------------------------------------------------------
+-- | See if this instruction is telling us the current C stack delta
+takeDeltaInstr :: Instr -> Maybe Int
+takeDeltaInstr (ANN _ i) = takeDeltaInstr i
+takeDeltaInstr (DELTA i) = Just i
+takeDeltaInstr _         = Nothing
+
+-- Not real instructions.  Just meta data
+isMetaInstr :: Instr -> Bool
+isMetaInstr instr
+ = case instr of
+    ANN _ i     -> isMetaInstr i
+    COMMENT{}   -> True
+    MULTILINE_COMMENT{} -> True
+    LOCATION{}  -> True
+    LDATA{}     -> True
+    NEWBLOCK{}  -> True
+    DELTA{}     -> True
+    PUSH_STACK_FRAME -> True
+    POP_STACK_FRAME -> True
+    _           -> False
+
+-- | Copy the value in a register to another one.
+-- Must work for all register classes.
+mkRegRegMoveInstr :: Reg -> Reg -> Instr
+mkRegRegMoveInstr src dst = ANN (text $ "Reg->Reg Move: " ++ show src ++ " -> " ++ show dst) $ MOV (OpReg W64 dst) (OpReg W64 src)
+
+-- | Take the source and destination from this reg -> reg move instruction
+-- or Nothing if it's not one
+takeRegRegMoveInstr :: Instr -> Maybe (Reg,Reg)
+--takeRegRegMoveInstr (MOV (OpReg fmt dst) (OpReg fmt' src)) | fmt == fmt' = Just (src, dst)
+takeRegRegMoveInstr _ = Nothing
+
+-- | Make an unconditional jump instruction.
+mkJumpInstr :: BlockId -> [Instr]
+mkJumpInstr id = [B (TBlock id)]
+
+mkStackAllocInstr :: Platform -> Int -> [Instr]
+mkStackAllocInstr platform n
+    | n == 0 = []
+    | n > 0 && n < 4096 = [ ANN (text "Alloc More Stack") $ SUB sp sp (OpImm (ImmInt n)) ]
+    | n > 0 =  ANN (text "Alloc More Stack") (SUB sp sp (OpImm (ImmInt 4095))) : mkStackAllocInstr platform (n - 4095)
+mkStackAllocInstr _platform n = pprPanic "mkStackAllocInstr" (int n)
+
+mkStackDeallocInstr :: Platform -> Int -> [Instr]
+mkStackDeallocInstr platform n
+    | n == 0 = []
+    | n > 0 && n < 4096 = [ ANN (text "Dealloc More Stack") $ ADD sp sp (OpImm (ImmInt n)) ]
+    | n > 0 =  ANN (text "Dealloc More Stack") (ADD sp sp (OpImm (ImmInt 4095))) : mkStackDeallocInstr platform (n - 4095)
+mkStackDeallocInstr _platform n = pprPanic "mkStackDeallocInstr" (int n)
+
+--
+-- See note [extra spill slots] in X86/Instr.hs
+--
+allocMoreStack
+  :: Platform
+  -> Int
+  -> NatCmmDecl statics GHC.CmmToAsm.AArch64.Instr.Instr
+  -> UniqSM (NatCmmDecl statics GHC.CmmToAsm.AArch64.Instr.Instr, [(BlockId,BlockId)])
+
+allocMoreStack _ _ top@(CmmData _ _) = return (top,[])
+allocMoreStack platform slots proc@(CmmProc info lbl live (ListGraph code)) = do
+    let entries = entryBlocks proc
+
+    uniqs <- replicateM (length entries) getUniqueM
+
+    let
+      delta = ((x + stackAlign - 1) `quot` stackAlign) * stackAlign -- round up
+        where x = slots * spillSlotSize -- sp delta
+
+      alloc   = mkStackAllocInstr   platform delta
+      dealloc = mkStackDeallocInstr platform delta
+
+      retargetList = (zip entries (map mkBlockId uniqs))
+
+      new_blockmap :: LabelMap BlockId
+      new_blockmap = mapFromList retargetList
+
+      insert_stack_insn (BasicBlock id insns)
+        | Just new_blockid <- mapLookup id new_blockmap
+        = [ BasicBlock id $ alloc ++ [ B (TBlock new_blockid) ]
+          , BasicBlock new_blockid block' ]
+        | otherwise
+        = [ BasicBlock id block' ]
+        where
+          block' = foldr insert_dealloc [] insns
+
+      insert_dealloc insn r = case insn of
+        J _ -> dealloc ++ (insn : r)
+        ANN _ (J _) -> dealloc ++ (insn : r)
+        _other | jumpDestsOfInstr insn /= []
+            -> patchJumpInstr insn retarget : r
+        _other -> insn : r
+
+        where retarget b = fromMaybe b (mapLookup b new_blockmap)
+
+      new_code = concatMap insert_stack_insn code
+    -- in
+    return (CmmProc info lbl live (ListGraph new_code), retargetList)
+-- -----------------------------------------------------------------------------
+-- Machine's assembly language
+
+-- We have a few common "instructions" (nearly all the pseudo-ops) but
+-- mostly all of 'Instr' is machine-specific.
+
+data Instr
+    -- comment pseudo-op
+    = COMMENT SDoc
+    | MULTILINE_COMMENT SDoc
+
+    -- Annotated instruction. Should print <instr> # <doc>
+    | ANN SDoc Instr
+
+    -- location pseudo-op (file, line, col, name)
+    | LOCATION Int Int Int String
+
+    -- some static data spat out during code
+    -- generation.  Will be extracted before
+    -- pretty-printing.
+    | LDATA   Section RawCmmStatics
+
+    -- start a new basic block.  Useful during
+    -- codegen, removed later.  Preceding
+    -- instruction should be a jump, as per the
+    -- invariants for a BasicBlock (see Cmm).
+    | NEWBLOCK BlockId
+
+    -- specify current stack offset for
+    -- benefit of subsequent passes
+    | DELTA   Int
+
+    -- 0. Pseudo Instructions --------------------------------------------------
+    -- These are instructions not contained or only partially contained in the
+    -- official ISA, but make reading clearer. Some of them might even be
+    -- implemented in the assembler, but are not guaranteed to be portable.
+    -- | SXTB Operand Operand
+    -- | SXTH Operand Operand
+    -- | SXTW Operand Operand
+    -- | SXTX Operand Operand
+    | PUSH_STACK_FRAME
+    | POP_STACK_FRAME
+    -- 1. Arithmetic Instructions ----------------------------------------------
+    -- | ADC Operand Operand Operang -- rd = rn + rm + C
+    -- | ADCS ...
+    | ADD Operand Operand Operand -- rd = rn + rm
+    -- | ADDS Operand Operand Operand -- rd = rn + rm
+    -- | ADR ...
+    -- | ADRP ...
+    | CMN Operand Operand -- rd + op2
+    | CMP Operand Operand -- rd − op2
+    -- | MADD ...
+    -- | MNEG ...
+    | MSUB Operand Operand Operand Operand -- rd = ra − rn × rm
+    | MUL Operand Operand Operand -- rd = rn × rm
+    | NEG Operand Operand -- rd = −op2
+    -- | NEGS ...
+    -- | NGC ...
+    -- | NGCS ...
+    -- | SBC ...
+    -- | SBCS ...
+    | SDIV Operand Operand Operand -- rd = rn ÷ rm
+    -- | SMADDL ...
+    -- | SMNEGL ...
+    -- | SMSUBL ...
+    -- | SMULH ...
+    -- | SMULL ...
+    | SUB Operand Operand Operand -- rd = rn - op2
+    -- | SUBS ...
+    | UDIV Operand Operand Operand -- rd = rn ÷ rm
+    -- | UMADDL ...  -- Xd = Xa + Wn × Wm
+    -- | UMNEGL ... -- Xd = − Wn × Wm
+    -- | UMSUBL ... -- Xd = Xa − Wn × Wm
+    -- | UMULH ... -- Xd = (Xn × Xm)_127:64
+    -- | UMULL ... -- Xd = Wn × Wm
+
+    -- 2. Bit Manipulation Instructions ----------------------------------------
+    | SBFM Operand Operand Operand Operand -- rd = rn[i,j]
+    -- SXTB = SBFM <Wd>, <Wn>, #0, #7
+    -- SXTH = SBFM <Wd>, <Wn>, #0, #15
+    -- SXTW = SBFM <Wd>, <Wn>, #0, #31
+    | UBFM Operand Operand Operand Operand -- rd = rn[i,j]
+    -- UXTB = UBFM <Wd>, <Wn>, #0, #7
+    -- UXTH = UBFM <Wd>, <Wn>, #0, #15
+
+    -- XXX
+    -- 3. Logical and Move Instructions ----------------------------------------
+    | AND Operand Operand Operand -- rd = rn & op2
+    | ANDS Operand Operand Operand -- rd = rn & op2
+    | ASR Operand Operand Operand -- rd = rn ≫ rm  or  rd = rn ≫ #i, i is 6 bits
+    | BIC Operand Operand Operand -- rd = rn & ∼op2
+    | BICS Operand Operand Operand -- rd = rn & ∼op2
+    | EON Operand Operand Operand -- rd = rn ⊕ ∼op2
+    | EOR Operand Operand Operand -- rd = rn ⊕ op2
+    | LSL Operand Operand Operand -- rd = rn ≪ rm  or rd = rn ≪ #i, i is 6 bits
+    | LSR Operand Operand Operand -- rd = rn ≫ rm  or rd = rn ≫ #i, i is 6 bits
+    | MOV Operand Operand -- rd = rn  or  rd = #i
+    | MOVK Operand Operand
+    -- | MOVN Operand Operand
+    -- | MOVZ Operand Operand
+    | MVN Operand Operand -- rd = ~rn
+    | ORN Operand Operand Operand -- rd = rn | ∼op2
+    | ORR Operand Operand Operand -- rd = rn | op2
+    | ROR Operand Operand Operand -- rd = rn ≫ rm  or  rd = rn ≫ #i, i is 6 bits
+    | TST Operand Operand -- rn & op2
+    -- Load and stores.
+    -- XXX STR/LDR might want to change to STP/LDP with XZR for the second register.
+    | STR Format Operand Operand -- str Xn, address-mode // Xn -> *addr
+    | LDR Format Operand Operand -- ldr Xn, address-mode // Xn <- *addr
+    | STP Format Operand Operand Operand -- stp Xn, Xm, address-mode // Xn -> *addr, Xm -> *(addr + 8)
+    | LDP Format Operand Operand Operand -- stp Xn, Xm, address-mode // Xn <- *addr, Xm <- *(addr + 8)
+
+    -- Conditional instructions
+    | CSET Operand Cond   -- if(cond) op <- 1 else op <- 0
+
+    | CBZ Operand Target  -- if op == 0, then branch.
+    | CBNZ Operand Target -- if op /= 0, then branch.
+    -- Branching.
+    | J Target            -- like B, but only generated from genJump. Used to distinguish genJumps from others.
+    | B Target            -- unconditional branching b/br. (To a blockid, label or register)
+    | BL Target [Reg] [Reg] -- branch and link (e.g. set x30 to next pc, and branch)
+    | BCOND Cond Target   -- branch with condition. b.<cond>
+
+    -- 8. Synchronization Instructions -----------------------------------------
+    | DMBSY
+    -- 9. Floating Point Instructions
+    -- Float ConVerT
+    | FCVT Operand Operand
+    -- Signed ConVerT Float
+    | SCVTF Operand Operand
+    -- Float ConVerT to Zero Signed
+    | FCVTZS Operand Operand
+
+instance Show Instr where
+    show (LDR _f o1 o2) = "LDR " ++ show o1 ++ ", " ++ show o2
+    show (MOV o1 o2) = "MOV " ++ show o1 ++ ", " ++ show o2
+    show _ = "missing"
+
+data Target
+    = TBlock BlockId
+    | TLabel CLabel
+    | TReg   Reg
+
+
+-- Extension
+-- {Unsigned|Signed}XT{Byte|Half|Word|Doube}
+data ExtMode
+    = EUXTB | EUXTH | EUXTW | EUXTX
+    | ESXTB | ESXTH | ESXTW | ESXTX
+    deriving (Eq, Show)
+
+data ShiftMode
+    = SLSL | SLSR | SASR | SROR
+    deriving (Eq, Show)
+
+
+-- We can also add ExtShift to Extension.
+-- However at most 3bits.
+type ExtShift = Int
+-- at most 6bits
+type RegShift = Int
+
+data Operand
+        = OpReg Width Reg            -- register
+        | OpRegExt Width Reg ExtMode ExtShift -- rm, <ext>[, <shift left>]
+        | OpRegShift Width Reg ShiftMode RegShift     -- rm, <shift>, <0-64>
+        | OpImm Imm            -- immediate value
+        | OpImmShift Imm ShiftMode RegShift
+        | OpAddr AddrMode       -- memory reference
+        deriving (Eq, Show)
+
+-- Smart constructors
+opReg :: Width -> Reg -> Operand
+opReg = OpReg
+
+xzr, wzr, sp, ip0 :: Operand
+xzr = OpReg W64 (RegReal (RealRegSingle (-1)))
+wzr = OpReg W32 (RegReal (RealRegSingle (-1)))
+sp  = OpReg W64 (RegReal (RealRegSingle 31))
+ip0 = OpReg W64 (RegReal (RealRegSingle 16))
+
+_x :: Int -> Operand
+_x i = OpReg W64 (RegReal (RealRegSingle i))
+x0,  x1,  x2,  x3,  x4,  x5,  x6,  x7  :: Operand
+x8,  x9,  x10, x11, x12, x13, x14, x15 :: Operand
+x16, x17, x18, x19, x20, x21, x22, x23 :: Operand
+x24, x25, x26, x27, x28, x29, x30, x31 :: Operand
+x0  = OpReg W64 (RegReal (RealRegSingle  0))
+x1  = OpReg W64 (RegReal (RealRegSingle  1))
+x2  = OpReg W64 (RegReal (RealRegSingle  2))
+x3  = OpReg W64 (RegReal (RealRegSingle  3))
+x4  = OpReg W64 (RegReal (RealRegSingle  4))
+x5  = OpReg W64 (RegReal (RealRegSingle  5))
+x6  = OpReg W64 (RegReal (RealRegSingle  6))
+x7  = OpReg W64 (RegReal (RealRegSingle  7))
+x8  = OpReg W64 (RegReal (RealRegSingle  8))
+x9  = OpReg W64 (RegReal (RealRegSingle  9))
+x10 = OpReg W64 (RegReal (RealRegSingle 10))
+x11 = OpReg W64 (RegReal (RealRegSingle 11))
+x12 = OpReg W64 (RegReal (RealRegSingle 12))
+x13 = OpReg W64 (RegReal (RealRegSingle 13))
+x14 = OpReg W64 (RegReal (RealRegSingle 14))
+x15 = OpReg W64 (RegReal (RealRegSingle 15))
+x16 = OpReg W64 (RegReal (RealRegSingle 16))
+x17 = OpReg W64 (RegReal (RealRegSingle 17))
+x18 = OpReg W64 (RegReal (RealRegSingle 18))
+x19 = OpReg W64 (RegReal (RealRegSingle 19))
+x20 = OpReg W64 (RegReal (RealRegSingle 20))
+x21 = OpReg W64 (RegReal (RealRegSingle 21))
+x22 = OpReg W64 (RegReal (RealRegSingle 22))
+x23 = OpReg W64 (RegReal (RealRegSingle 23))
+x24 = OpReg W64 (RegReal (RealRegSingle 24))
+x25 = OpReg W64 (RegReal (RealRegSingle 25))
+x26 = OpReg W64 (RegReal (RealRegSingle 26))
+x27 = OpReg W64 (RegReal (RealRegSingle 27))
+x28 = OpReg W64 (RegReal (RealRegSingle 28))
+x29 = OpReg W64 (RegReal (RealRegSingle 29))
+x30 = OpReg W64 (RegReal (RealRegSingle 30))
+x31 = OpReg W64 (RegReal (RealRegSingle 31))
+
+_d :: Int -> Operand
+_d = OpReg W64 . RegReal . RealRegSingle
+d0,  d1,  d2,  d3,  d4,  d5,  d6,  d7  :: Operand
+d8,  d9,  d10, d11, d12, d13, d14, d15 :: Operand
+d16, d17, d18, d19, d20, d21, d22, d23 :: Operand
+d24, d25, d26, d27, d28, d29, d30, d31 :: Operand
+d0  = OpReg W64 (RegReal (RealRegSingle 32))
+d1  = OpReg W64 (RegReal (RealRegSingle 33))
+d2  = OpReg W64 (RegReal (RealRegSingle 34))
+d3  = OpReg W64 (RegReal (RealRegSingle 35))
+d4  = OpReg W64 (RegReal (RealRegSingle 36))
+d5  = OpReg W64 (RegReal (RealRegSingle 37))
+d6  = OpReg W64 (RegReal (RealRegSingle 38))
+d7  = OpReg W64 (RegReal (RealRegSingle 39))
+d8  = OpReg W64 (RegReal (RealRegSingle 40))
+d9  = OpReg W64 (RegReal (RealRegSingle 41))
+d10 = OpReg W64 (RegReal (RealRegSingle 42))
+d11 = OpReg W64 (RegReal (RealRegSingle 43))
+d12 = OpReg W64 (RegReal (RealRegSingle 44))
+d13 = OpReg W64 (RegReal (RealRegSingle 45))
+d14 = OpReg W64 (RegReal (RealRegSingle 46))
+d15 = OpReg W64 (RegReal (RealRegSingle 47))
+d16 = OpReg W64 (RegReal (RealRegSingle 48))
+d17 = OpReg W64 (RegReal (RealRegSingle 49))
+d18 = OpReg W64 (RegReal (RealRegSingle 50))
+d19 = OpReg W64 (RegReal (RealRegSingle 51))
+d20 = OpReg W64 (RegReal (RealRegSingle 52))
+d21 = OpReg W64 (RegReal (RealRegSingle 53))
+d22 = OpReg W64 (RegReal (RealRegSingle 54))
+d23 = OpReg W64 (RegReal (RealRegSingle 55))
+d24 = OpReg W64 (RegReal (RealRegSingle 56))
+d25 = OpReg W64 (RegReal (RealRegSingle 57))
+d26 = OpReg W64 (RegReal (RealRegSingle 58))
+d27 = OpReg W64 (RegReal (RealRegSingle 59))
+d28 = OpReg W64 (RegReal (RealRegSingle 60))
+d29 = OpReg W64 (RegReal (RealRegSingle 61))
+d30 = OpReg W64 (RegReal (RealRegSingle 62))
+d31 = OpReg W64 (RegReal (RealRegSingle 63))
+
+opRegUExt :: Width -> Reg -> Operand
+opRegUExt W64 r = OpRegExt W64 r EUXTX 0
+opRegUExt W32 r = OpRegExt W32 r EUXTW 0
+opRegUExt W16 r = OpRegExt W16 r EUXTH 0
+opRegUExt W8  r = OpRegExt W8  r EUXTB 0
+opRegUExt w  _r = pprPanic "opRegUExt" (text $ show w)
+
+opRegSExt :: Width -> Reg -> Operand
+opRegSExt W64 r = OpRegExt W64 r ESXTX 0
+opRegSExt W32 r = OpRegExt W32 r ESXTW 0
+opRegSExt W16 r = OpRegExt W16 r ESXTH 0
+opRegSExt W8  r = OpRegExt W8  r ESXTB 0
+opRegSExt w  _r = pprPanic "opRegSExt" (text $ show w)
--- a/compiler/GHC/CmmToAsm/AArch64/LLVM Test Results.md
+++ b/compiler/GHC/CmmToAsm/AArch64/LLVM Test Results.md
--- a/compiler/GHC/CmmToAsm/AArch64/NCG Test Results.md
+++ b/compiler/GHC/CmmToAsm/AArch64/NCG Test Results.md
--- a/compiler/GHC/CmmToAsm/AArch64/Ppr.hs
+++ b/compiler/GHC/CmmToAsm/AArch64/Ppr.hs
+{-# OPTIONS_GHC -fno-warn-orphans #-}
+{-# LANGUAGE CPP #-}
+
+module GHC.CmmToAsm.AArch64.Ppr (pprNatCmmDecl, pprInstr) where
+
+import GHC.Prelude hiding (EQ)
+
+import Data.Word
+import qualified Data.Array.Unsafe as U ( castSTUArray )
+import Data.Array.ST
+import Control.Monad.ST
+
+import GHC.CmmToAsm.AArch64.Instr
+import GHC.CmmToAsm.AArch64.Regs
+import GHC.CmmToAsm.AArch64.Cond
+import GHC.CmmToAsm.Ppr
+import GHC.CmmToAsm.Format
+import GHC.Platform.Reg
+import GHC.CmmToAsm.Config
+import GHC.CmmToAsm.Types
+import GHC.CmmToAsm.Utils
+
+import GHC.Cmm hiding (topInfoTable)
+import GHC.Cmm.Dataflow.Collections
+import GHC.Cmm.Dataflow.Label
+import GHC.Types.Basic (Alignment, mkAlignment, alignmentBytes)
+
+import GHC.Cmm.BlockId
+import GHC.Cmm.CLabel
+import GHC.Cmm.Ppr.Expr () -- For Outputable instances
+
+import GHC.Types.Unique ( pprUniqueAlways, getUnique )
+import GHC.Platform
+import GHC.Data.FastString
+import GHC.Utils.Outputable
+import GHC.Driver.Session (targetPlatform)
+
+import GHC.Utils.Panic
+
+pprProcAlignment :: NCGConfig -> SDoc
+pprProcAlignment config = maybe empty (pprAlign platform . mkAlignment) (ncgProcAlignment config)
+   where
+      platform = ncgPlatform config
+
+pprNatCmmDecl :: NCGConfig -> NatCmmDecl RawCmmStatics Instr -> SDoc
+pprNatCmmDecl config (CmmData section dats) =
+  pprSectionAlign config section $$ pprDatas config dats
+
+pprNatCmmDecl config proc@(CmmProc top_info lbl _ (ListGraph blocks)) =
+  let platform = ncgPlatform config in
+  pprProcAlignment config $$
+  case topInfoTable proc of
+    Nothing ->
+        -- special case for code without info table:
+        pprSectionAlign config (Section Text lbl) $$
+        -- do not
+        -- pprProcAlignment config $$
+        pprLabel platform lbl $$ -- blocks guaranteed not null, so label needed
+        vcat (map (pprBasicBlock config top_info) blocks) $$
+        (if ncgDwarfEnabled config
+         then ppr (mkAsmTempEndLabel lbl) <> char ':' else empty) $$
+        pprSizeDecl platform lbl
+
+    Just (CmmStaticsRaw info_lbl _) ->
+      pprSectionAlign config (Section Text info_lbl) $$
+      -- pprProcAlignment config $$
+      (if platformHasSubsectionsViaSymbols platform
+          then ppr (mkDeadStripPreventer info_lbl) <> char ':'
+          else empty) $$
+      vcat (map (pprBasicBlock config top_info) blocks) $$
+      -- above: Even the first block gets a label, because with branch-chain
+      -- elimination, it might be the target of a goto.
+      (if platformHasSubsectionsViaSymbols platform
+       then -- See Note [Subsections Via Symbols]
+                text "\t.long "
+            <+> ppr info_lbl
+            <+> char '-'
+            <+> ppr (mkDeadStripPreventer info_lbl)
+       else empty) $$
+      pprSizeDecl platform info_lbl
+
+pprLabel :: Platform -> CLabel -> SDoc
+pprLabel platform lbl =
+   pprGloblDecl platform lbl
+   $$ pprTypeDecl platform lbl
+   $$ (pdoc platform lbl <> char ':')
+
+pprAlign :: Platform -> Alignment -> SDoc
+pprAlign _platform alignment
+        = text "\t.balign " <> int (alignmentBytes alignment)
+
+-- | Print appropriate alignment for the given section type.
+pprAlignForSection :: Platform -> SectionType -> SDoc
+pprAlignForSection _platform _seg
+    -- .balign is stable, whereas .align is platform dependent.
+    = text "\t.balign 8" --  always 8
+
+instance Outputable Instr where
+    ppr = pprInstr genericPlatform
+
+-- | Print section header and appropriate alignment for that section.
+--
+-- This one will emit the header:
+--
+--     .section .text
+--     .balign 8
+--
+pprSectionAlign :: NCGConfig -> Section -> SDoc
+pprSectionAlign _config (Section (OtherSection _) _) =
+     panic "AArch64.Ppr.pprSectionAlign: unknown section"
+pprSectionAlign config sec@(Section seg _) =
+    pprSectionHeader config sec
+    $$ pprAlignForSection (ncgPlatform config) seg
+
+-- | Output the ELF .size directive.
+pprSizeDecl :: Platform -> CLabel -> SDoc
+pprSizeDecl platform lbl
+ = if osElfTarget (platformOS platform)
+   then text "\t.size" <+> pdoc platform lbl <> ptext (sLit ", .-") <> pdoc platform lbl
+   else empty
+
+pprBasicBlock :: NCGConfig -> LabelMap RawCmmStatics -> NatBasicBlock Instr
+              -> SDoc
+pprBasicBlock config info_env (BasicBlock blockid instrs)
+  = maybe_infotable $
+    pprLabel platform asmLbl $$
+    vcat (map (pprInstr platform) (id {-detectTrivialDeadlock-} optInstrs)) $$
+    (if  ncgDwarfEnabled config
+      then ppr (mkAsmTempEndLabel asmLbl) <> char ':'
+      else empty
+    )
+  where
+    -- Filter out identity moves. E.g. mov x18, x18 will be dropped.
+    optInstrs = filter f instrs
+      where f (MOV o1 o2) | o1 == o2 = False
+            f _ = True
+
+    -- XXX: put deadlock detection behind a flag. This will need to pass over
+    -- each emitted instruction and can thus cause a slowdown in the number of
+    -- instructions we generate.
+    --
+    -- detect the trivial cases where we would need -fno-omit-yields
+    -- those are deadlocks where we have only an unconditional branch
+    -- instruction back to the block head, with no escape inbetween.
+    -- See https://gitlab.haskell.org/ghc/ghc/-/issues/367
+    -- This only intends to catch the very trivial case, not the more
+    -- compilicated cases.
+    {-
+    detectTrivialDeadlock :: [Instr] -> [Instr]
+    detectTrivialDeadlock instrs = case (findIndex isSelfBranch instrs) of
+      Just n | all (not . aarch64_isJumpishInstr) (take n instrs) ->
+        pprPanic "AArch64 NCG"
+                $  text "Deadlock detected! Re compile with -fno-omit-yields."
+                $$ text ""
+                $$ pprLabel platform asmLbl
+                $$ vcat (map (pprInstr platform) (take (n + 1) instrs))
+                $$ text ""
+                $$ text "See https://gitlab.haskell.org/ghc/ghc/-/issues/367"
+      -- Nothing, or there are jumpishInstructions before the self branch,
+      -- probably not a deadlock.
+      _ -> instrs
+
+      where isSelfBranch (B (TBlock blockid')) = blockid' == blockid
+            isSelfBranch _ = False
+    -}
+
+    asmLbl = blockLbl blockid
+    platform = ncgPlatform config
+    maybe_infotable c = case mapLookup blockid info_env of
+       Nothing   -> c
+       Just (CmmStaticsRaw info_lbl info) ->
+          --  pprAlignForSection platform Text $$
+           infoTableLoc $$
+           vcat (map (pprData config) info) $$
+           pprLabel platform info_lbl $$
+           c $$
+           (if ncgDwarfEnabled config
+             then ppr (mkAsmTempEndLabel info_lbl) <> char ':'
+             else empty)
+    -- Make sure the info table has the right .loc for the block
+    -- coming right after it. See [Note: Info Offset]
+    infoTableLoc = case instrs of
+      (l@LOCATION{} : _) -> pprInstr platform l
+      _other             -> empty
+
+pprDatas :: NCGConfig -> RawCmmStatics -> SDoc
+-- See note [emit-time elimination of static indirections] in "GHC.Cmm.CLabel".
+pprDatas config (CmmStaticsRaw alias [CmmStaticLit (CmmLabel lbl), CmmStaticLit ind, _, _])
+  | lbl == mkIndStaticInfoLabel
+  , let labelInd (CmmLabelOff l _) = Just l
+        labelInd (CmmLabel l) = Just l
+        labelInd _ = Nothing
+  , Just ind' <- labelInd ind
+  , alias `mayRedirectTo` ind'
+  = pprGloblDecl (ncgPlatform config) alias
+    $$ text ".equiv" <+> pdoc (ncgPlatform config) alias <> comma <> pdoc (ncgPlatform config) (CmmLabel ind')
+
+pprDatas config (CmmStaticsRaw lbl dats)
+  = vcat (pprLabel platform lbl : map (pprData config) dats)
+   where
+      platform = ncgPlatform config
+
+pprData :: NCGConfig -> CmmStatic -> SDoc
+pprData _config (CmmString str) = pprString str
+pprData _config (CmmFileEmbed path) = pprFileEmbed path
+
+pprData config (CmmUninitialised bytes)
+ = let platform = ncgPlatform config
+   in if platformOS platform == OSDarwin
+         then text ".space " <> int bytes
+         else text ".skip "  <> int bytes
+
+pprData config (CmmStaticLit lit) = pprDataItem config lit
+
+pprGloblDecl :: Platform -> CLabel -> SDoc
+pprGloblDecl platform lbl
+  | not (externallyVisibleCLabel lbl) = empty
+  | otherwise = text "\t.globl " <> pdoc platform lbl
+
+-- See discussion in X86.Ppr
+-- for why this is necessary.  Essentially we need to ensure that we never
+-- pass function symbols when we migth want to lookup the info table.  If we
+-- did, we could end up with procedure linking tables (PLT)s, and thus the
+-- lookup wouldn't pooint to the function, but into the jump table.
+pprLabelType' :: Platform -> CLabel -> SDoc
+pprLabelType' platform lbl =
+  if isCFunctionLabel lbl || functionOkInfoTable then
+    text "@function"
+  else
+    text "@object"
+  where
+    functionOkInfoTable = platformTablesNextToCode platform &&
+      isInfoTableLabel lbl && not (isConInfoTableLabel lbl)
+
+-- this is called pprTypeAndSizeDecl in PPC.Ppr
+pprTypeDecl :: Platform -> CLabel -> SDoc
+pprTypeDecl platform lbl
+    = if osElfTarget (platformOS platform) && externallyVisibleCLabel lbl
+      then text ".type " <> pdoc platform lbl <> ptext (sLit  ", ") <> pprLabelType' platform lbl
+      else empty
+
+pprDataItem :: NCGConfig -> CmmLit -> SDoc
+pprDataItem config lit
+  = vcat (ppr_item (cmmTypeFormat $ cmmLitType platform lit) lit)
+    where
+        platform = ncgPlatform config
+
+        imm = litToImm lit
+
+        ppr_item II8  _ = [text "\t.byte\t"  <> pprImm platform imm]
+        ppr_item II16 _ = [text "\t.short\t" <> pprImm platform imm]
+        ppr_item II32 _ = [text "\t.long\t"  <> pprImm platform imm]
+        ppr_item II64 _ = [text "\t.quad\t"  <> pprImm platform imm]
+
+        ppr_item FF32  (CmmFloat r _)
+           = let bs = floatToBytes (fromRational r)
+             in  map (\b -> text "\t.byte\t" <> pprImm platform (ImmInt b)) bs
+
+        ppr_item FF64 (CmmFloat r _)
+           = let bs = doubleToBytes (fromRational r)
+             in  map (\b -> text "\t.byte\t" <> pprImm platform (ImmInt b)) bs
+
+        ppr_item _ _ = pprPanic "pprDataItem:ppr_item" (text $ show lit)
+
+floatToBytes :: Float -> [Int]
+floatToBytes f
+   = runST (do
+        arr <- newArray_ ((0::Int),3)
+        writeArray arr 0 f
+        arr <- castFloatToWord8Array arr
+        i0 <- readArray arr 0
+        i1 <- readArray arr 1
+        i2 <- readArray arr 2
+        i3 <- readArray arr 3
+        return (map fromIntegral [i0,i1,i2,i3])
+     )
+
+castFloatToWord8Array :: STUArray s Int Float -> ST s (STUArray s Int Word8)
+castFloatToWord8Array = U.castSTUArray
+
+pprImm :: Platform -> Imm -> SDoc
+pprImm _ (ImmInt i)     = int i
+pprImm _ (ImmInteger i) = integer i
+pprImm p (ImmCLbl l)    = pdoc p l
+pprImm p (ImmIndex l i) = pdoc p l <> char '+' <> int i
+pprImm _ (ImmLit s)     = s
+
+-- XXX: See pprIm below for why this is a bad idea!
+pprImm _ (ImmFloat f)
+  | f == 0 = text "wzr"
+  | otherwise = float (fromRational f)
+pprImm _ (ImmDouble d)
+  | d == 0 = text "xzr"
+  | otherwise = double (fromRational d)
+
+pprImm p (ImmConstantSum a b) = pprImm p a <> char '+' <> pprImm p b
+pprImm p (ImmConstantDiff a b) = pprImm p a <> char '-'
+                   <> lparen <> pprImm p b <> rparen
+
+
+-- aarch64 GNU as uses // for comments.
+asmComment :: SDoc -> SDoc
+asmComment c = whenPprDebug $ text "#" <+> c
+
+asmDoubleslashComment :: SDoc -> SDoc
+asmDoubleslashComment c = whenPprDebug $ text "//" <+> c
+
+asmMultilineComment :: SDoc -> SDoc
+asmMultilineComment c = whenPprDebug $ text "/*" $+$ c $+$ text "*/"
+
+pprIm :: Platform -> Imm -> SDoc
+pprIm platform im = case im of
+  ImmInt i     -> char '#' <> int i
+  ImmInteger i -> char '#' <> integer i
+
+  -- XXX: This will only work for
+  -- The floating point value must be expressable as ±n ÷ 16 × 2^r,
+  -- where n and r are integers such that 16 ≤ n ≤ 31 and −3 ≤ r ≤ 4.
+  -- and 0 needs to be encoded as wzr/xzr.
+  --
+  -- Except for 0, we might want to either split it up into enough
+  -- ADD operations into an Integer register and then just bit copy it into
+  -- the double register? See the toBytes + fromRational above for data items.
+  -- This is something the x86 backend does.
+  --
+  -- We could also just turn them into statics :-/ Which is what the
+  -- PowerPC backend odes.
+  ImmFloat f | f == 0 -> text "wzr"
+  ImmFloat f -> char '#' <> float (fromRational f)
+  ImmDouble d | d == 0 -> text "xzr"
+  ImmDouble d -> char '#' <> double (fromRational d)
+  -- =<lbl> pseudo instruction!
+  ImmCLbl l    -> char '=' <> pdoc platform l
+  ImmIndex l o -> text "[=" <> pdoc platform l <> comma <+> char '#' <> int o <> char ']'
+  _            -> panic "AArch64.pprIm"
+
+pprExt :: ExtMode -> SDoc
+pprExt EUXTB = text "uxtb"
+pprExt EUXTH = text "uxth"
+pprExt EUXTW = text "uxtw"
+pprExt EUXTX = text "uxtx"
+pprExt ESXTB = text "sxtb"
+pprExt ESXTH = text "sxth"
+pprExt ESXTW = text "sxtw"
+pprExt ESXTX = text "sxtx"
+
+pprShift :: ShiftMode -> SDoc
+pprShift SLSL = text "lsl"
+pprShift SLSR = text "lsr"
+pprShift SASR = text "asr"
+pprShift SROR = text "ror"
+
+pprOp :: Platform -> Operand -> SDoc
+pprOp plat op = case op of
+  OpReg w r           -> pprReg w r
+  OpRegExt w r x 0 -> pprReg w r <> comma <+> pprExt x
+  OpRegExt w r x i -> pprReg w r <> comma <+> pprExt x <> comma <+> char '#' <> int i
+  OpRegShift w r s i -> pprReg w r <> comma <+> pprShift s <> comma <+> char '#' <> int i
+  OpImm im          -> pprIm plat im
+  OpImmShift im s i -> pprIm plat im <> comma <+> pprShift s <+> char '#' <> int i
+  -- XXX: Address compuation always use registers as 64bit -- is this correct?
+  OpAddr (AddrRegReg r1 r2) -> char '[' <+> pprReg W64 r1 <> comma <+> pprReg W64 r2 <+> char ']'
+  OpAddr (AddrRegImm r1 im) -> char '[' <+> pprReg W64 r1 <> comma <+> pprImm plat im <+> char ']'
+  OpAddr (AddrReg r1)       -> char '[' <+> pprReg W64 r1 <+> char ']'
+
+pprReg :: Width -> Reg -> SDoc
+pprReg w r = case r of
+  RegReal    (RealRegSingle i) -> ppr_reg_no w i
+  RegReal    (RealRegPair{})   -> panic "AArch64.pprReg: no reg pairs on this arch!"
+  -- virtual regs should not show up, but this is helpful for debugging.
+  RegVirtual (VirtualRegI u)   -> text "%vI_" <> pprUniqueAlways u
+  RegVirtual (VirtualRegF u)   -> text "%vF_" <> pprUniqueAlways u
+  RegVirtual (VirtualRegD u)   -> text "%vD_" <> pprUniqueAlways u
+  _                            -> pprPanic "AArch64.pprReg" (text $ show r)
+
+  where
+    ppr_reg_no :: Width -> Int -> SDoc
+    ppr_reg_no w 31
+         | w == W64 = text "sp"
+         | w == W32 = text "wsp"
+
+    ppr_reg_no w i
+         | i < 0, w == W32 = text "wzr"
+         | i < 0, w == W64 = text "xzr"
+         | i < 0 = pprPanic "Invalid Zero Reg" (ppr w <+> int i)
+         -- General Purpose Registers
+         | i <= 31, w == W8  = text "w" <> int i      -- there are no byte or half
+         | i <= 31, w == W16 = text "w" <> int i      -- words... word will do.
+         | i <= 31, w == W32 = text "w" <> int i
+         | i <= 31, w == W64 = text "x" <> int i
+         | i <= 31 = pprPanic "Invalid Reg" (ppr w <+> int i)
+         -- Floating Point Registers
+         | i <= 63, w == W8  = text "b" <> int (i-32)
+         | i <= 63, w == W16 = text "h" <> int (i-32)
+         | i <= 63, w == W32 = text "s" <> int (i-32)
+         | i <= 63, w == W64 = text "d" <> int (i-32)
+         -- no support for 'q'uad in GHC's NCG yet.
+         | otherwise = text "very naughty powerpc register"
+
+isFloatOp :: Operand -> Bool
+isFloatOp (OpReg _ (RegReal (RealRegSingle i))) | i > 31 = True
+isFloatOp (OpReg _ (RegVirtual (VirtualRegF _))) = True
+isFloatOp (OpReg _ (RegVirtual (VirtualRegD _))) = True
+isFloatOp _ = False
+
+pprInstr :: Platform -> Instr -> SDoc
+pprInstr platform instr = case instr of
+  -- Meta Instructions ---------------------------------------------------------
+  COMMENT s  -> asmComment s
+  MULTILINE_COMMENT s -> asmMultilineComment s
+  ANN d i -> pprInstr platform i <+> asmDoubleslashComment d
+  LOCATION file line col _name
+    -> text "\t.loc" <+> ppr file <+> ppr line <+> ppr col
+  DELTA d    -> asmComment $ text ("\tdelta = " ++ show d)
+  NEWBLOCK _ -> panic "PprInstr: NEWBLOCK"
+  LDATA _ _  -> panic "pprInstr: LDATA"
+
+  -- Pseudo Instructions -------------------------------------------------------
+
+  PUSH_STACK_FRAME -> text "\tstp x29, x30, [sp, #-16]!"
+                   $$ text "\tmov x29, sp"
+
+  POP_STACK_FRAME -> text "\tldp x29, x30, [sp], #16"
+  -- ===========================================================================
+  -- AArch64 Instruction Set
+  -- 1. Arithmetic Instructions ------------------------------------------------
+  ADD  o1 o2 o3
+    | isFloatOp o1 && isFloatOp o2 && isFloatOp o3 -> text "\tfadd"  <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+    | otherwise -> text "\tadd"  <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  CMN  o1 o2    -> text "\tcmn"  <+> pprOp platform o1 <> comma <+> pprOp platform o2
+  CMP  o1 o2
+    | isFloatOp o1 && isFloatOp o2 -> text "\tfcmp"  <+> pprOp platform o1 <> comma <+> pprOp platform o2
+    | otherwise -> text "\tcmp" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+  MSUB o1 o2 o3 o4 -> text "\tmsub" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3 <> comma <+> pprOp platform o4
+  MUL  o1 o2 o3
+    | isFloatOp o1 && isFloatOp o2 && isFloatOp o3 -> text "\tfmul"  <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+    | otherwise -> text "\tmul"  <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  NEG  o1 o2
+    | isFloatOp o1 && isFloatOp o2 -> text "\tfneg"  <+> pprOp platform o1 <> comma <+> pprOp platform o2
+    | otherwise -> text "\tneg"  <+> pprOp platform o1 <> comma <+> pprOp platform o2
+  SDIV o1 o2 o3 | isFloatOp o1 && isFloatOp o2 && isFloatOp o3
+    -> text "\tfdiv" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  SDIV o1 o2 o3 -> text "\tsdiv" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+
+  SUB  o1 o2 o3
+    | isFloatOp o1 && isFloatOp o2 && isFloatOp o3 -> text "\tfsub"  <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+    | otherwise -> text "\tsub"  <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  UDIV o1 o2 o3 -> text "\tudiv" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+
+  -- 2. Bit Manipulation Instructions ------------------------------------------
+  SBFM o1 o2 o3 o4 -> text "\tsbfm" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3 <> comma <+> pprOp platform o4
+  UBFM o1 o2 o3 o4 -> text "\tubfm" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3 <> comma <+> pprOp platform o4
+  -- 3. Logical and Move Instructions ------------------------------------------
+  AND o1 o2 o3  -> text "\tand" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  ANDS o1 o2 o3 -> text "\tands" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  ASR o1 o2 o3  -> text "\tasr" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  BIC o1 o2 o3  -> text "\tbic" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  BICS o1 o2 o3 -> text "\tbics" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  EON o1 o2 o3  -> text "\teon" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  EOR o1 o2 o3  -> text "\teor" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  LSL o1 o2 o3  -> text "\tlsl" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  LSR o1 o2 o3  -> text "\tlsr" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  MOV o1 o2
+    | isFloatOp o1 || isFloatOp o2 -> text "\tfmov" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+    | otherwise -> text "\tmov" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+  MOVK o1 o2    -> text "\tmovk" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+  MVN o1 o2     -> text "\tmvn" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+  ORN o1 o2 o3  -> text "\torn" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  ORR o1 o2 o3  -> text "\torr" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  ROR o1 o2 o3  -> text "\tror" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  TST o1 o2     -> text "\ttst" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+
+  -- 4. Branch Instructions ----------------------------------------------------
+  J t            -> pprInstr platform (B t)
+  B (TBlock bid) -> text "\tb" <+> pdoc platform (mkLocalBlockLabel (getUnique bid))
+  B (TLabel lbl) -> text "\tb" <+> pdoc platform lbl
+  B (TReg r)     -> text "\tbr" <+> pprReg W64 r
+
+  BL (TBlock bid) _ _ -> text "\tbl" <+> pdoc platform (mkLocalBlockLabel (getUnique bid))
+  BL (TLabel lbl) _ _ -> text "\tbl" <+> pdoc platform lbl
+  BL (TReg r)     _ _ -> text "\tblr" <+> pprReg W64 r
+
+  BCOND c (TBlock bid) -> text "\t" <> pprBcond c <+> pdoc platform (mkLocalBlockLabel (getUnique bid))
+  BCOND c (TLabel lbl) -> text "\t" <> pprBcond c <+> pdoc platform lbl
+  BCOND _ (TReg _)     -> panic "AArch64.ppr: No conditional branching to registers!"
+
+  -- 5. Atomic Instructions ----------------------------------------------------
+  -- 6. Conditional Instructions -----------------------------------------------
+  CSET o c  -> text "\tcset" <+> pprOp platform o <> comma <+> pprCond c
+
+  CBZ o (TBlock bid) -> text "\tcbz" <+> pprOp platform o <> comma <+> pdoc platform (mkLocalBlockLabel (getUnique bid))
+  CBZ o (TLabel lbl) -> text "\tcbz" <+> pprOp platform o <> comma <+> pdoc platform lbl
+  CBZ _ (TReg _)     -> panic "AArch64.ppr: No conditional (cbz) branching to registers!"
+
+  CBNZ o (TBlock bid) -> text "\tcbnz" <+> pprOp platform o <> comma <+> pdoc platform (mkLocalBlockLabel (getUnique bid))
+  CBNZ o (TLabel lbl) -> text "\tcbnz" <+> pprOp platform o <> comma <+> pdoc platform lbl
+  CBNZ _ (TReg _)     -> panic "AArch64.ppr: No conditional (cbnz) branching to registers!"
+
+  -- 7. Load and Store Instructions --------------------------------------------
+  -- NOTE: GHC may do whacky things where it only load the lower part of an
+  --       address. Not observing the correct size when loading will lead
+  --       inevitably to crashes.
+  STR _f o1@(OpReg W8 (RegReal (RealRegSingle i))) o2 | i < 32 ->
+    text "\tstrb" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+  STR _f o1@(OpReg W16 (RegReal (RealRegSingle i))) o2 | i < 32 ->
+    text "\tstrh" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+  STR _f o1 o2 -> text "\tstr" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+
+#if defined(darwin_HOST_OS)
+  LDR _f o1 (OpImm (ImmIndex lbl' off)) | Just (_info, lbl) <- dynamicLinkerLabelInfo lbl' ->
+    text "\tadrp" <+> pprOp platform o1 <> comma <+> pdoc platform lbl <> text "@gotpage" $$
+    text "\tldr" <+> pprOp platform o1 <> comma <+> text "[" <> pprOp platform o1 <> comma <+> pdoc platform lbl <> text "@gotpageoff" <> text "]" $$
+    text "\tadd" <+> pprOp platform o1 <> comma <+> pprOp platform o1 <> comma <+> char '#' <> int off -- XXX: check that off is in 12bits.
+
+  LDR _f o1 (OpImm (ImmIndex lbl off)) | isForeignLabel lbl ->
+    text "\tadrp" <+> pprOp platform o1 <> comma <+> pdoc platform lbl <> text "@gotpage" $$
+    text "\tldr" <+> pprOp platform o1 <> comma <+> text "[" <> pprOp platform o1 <> comma <+> pdoc platform lbl <> text "@gotpageoff" <> text "]" $$
+    text "\tadd" <+> pprOp platform o1 <> comma <+> pprOp platform o1 <> comma <+> char '#' <> int off -- XXX: check that off is in 12bits.
+
+  LDR _f o1 (OpImm (ImmIndex lbl off)) ->
+    text "\tadrp" <+> pprOp platform o1 <> comma <+> pdoc platform lbl <> text "@page" $$
+    text "\tadd" <+> pprOp platform o1 <> comma <+> pprOp platform o1 <> comma <+> pdoc platform lbl <> text "@pageoff" $$
+    text "\tadd" <+> pprOp platform o1 <> comma <+> pprOp platform o1 <> comma <+> char '#' <> int off -- XXX: check that off is in 12bits.
+
+  LDR _f o1 (OpImm (ImmCLbl lbl')) | Just (_info, lbl) <- dynamicLinkerLabelInfo lbl' ->
+    text "\tadrp" <+> pprOp platform o1 <> comma <+> pdoc platform lbl <> text "@gotpage" $$
+    text "\tldr" <+> pprOp platform o1 <> comma <+> text "[" <> pprOp platform o1 <> comma <+> pdoc platform lbl <> text "@gotpageoff" <> text "]"
+
+  LDR _f o1 (OpImm (ImmCLbl lbl)) | isForeignLabel lbl ->
+    text "\tadrp" <+> pprOp platform o1 <> comma <+> pdoc platform lbl <> text "@gotpage" $$
+    text "\tldr" <+> pprOp platform o1 <> comma <+> text "[" <> pprOp platform o1 <> comma <+> pdoc platform lbl <> text "@gotpageoff" <> text "]"
+
+  LDR _f o1 (OpImm (ImmCLbl lbl)) ->
+    text "\tadrp" <+> pprOp platform o1 <> comma <+> pdoc platform lbl <> text "@page" $$
+    text "\tadd" <+> pprOp platform o1 <> comma <+> pprOp platform o1 <> comma <+> pdoc platform lbl <> text "@pageoff"
+#else
+  LDR _f o1 (OpImm (ImmIndex lbl' off)) | Just (_info, lbl) <- dynamicLinkerLabelInfo lbl' ->
+    text "\tadrp" <+> pprOp platform o1 <> comma <+> text ":got:" <> pdoc platform lbl $$
+    text "\tldr" <+> pprOp platform o1 <> comma <+> text "[" <> pprOp platform o1 <> comma <+> text ":got_lo12:" <> pdoc platform lbl <> text "]" $$
+    text "\tadd" <+> pprOp platform o1 <> comma <+> pprOp platform o1 <> comma <+> char '#' <> int off -- XXX: check that off is in 12bits.
+
+  LDR _f o1 (OpImm (ImmIndex lbl off)) | isForeignLabel lbl ->
+    text "\tadrp" <+> pprOp platform o1 <> comma <+> text ":got:" <> pdoc platform lbl $$
+    text "\tldr" <+> pprOp platform o1 <> comma <+> text "[" <> pprOp platform o1 <> comma <+> text ":got_lo12:" <> pdoc platform lbl <> text "]" $$
+    text "\tadd" <+> pprOp platform o1 <> comma <+> pprOp platform o1 <> comma <+> char '#' <> int off -- XXX: check that off is in 12bits.
+
+  LDR _f o1 (OpImm (ImmIndex lbl off)) ->
+    text "\tadrp" <+> pprOp platform o1 <> comma <+> pdoc platform lbl $$
+    text "\tadd" <+> pprOp platform o1 <> comma <+> pprOp platform o1 <> comma <+> text ":lo12:" <> pdoc platform lbl $$
+    text "\tadd" <+> pprOp platform o1 <> comma <+> pprOp platform o1 <> comma <+> char '#' <> int off -- XXX: check that off is in 12bits.
+
+  LDR _f o1 (OpImm (ImmCLbl lbl')) | Just (_info, lbl) <- dynamicLinkerLabelInfo lbl' ->
+    text "\tadrp" <+> pprOp platform o1 <> comma <+> text ":got:" <> pdoc platform lbl $$
+    text "\tldr" <+> pprOp platform o1 <> comma <+> text "[" <> pprOp platform o1 <> comma <+> text ":got_lo12:" <> pdoc platform lbl <> text "]"
+
+  LDR _f o1 (OpImm (ImmCLbl lbl)) | isForeignLabel lbl ->
+    text "\tadrp" <+> pprOp platform o1 <> comma <+> text ":got:" <> pdoc platform lbl $$
+    text "\tldr" <+> pprOp platform o1 <> comma <+> text "[" <> pprOp platform o1 <> comma <+> text ":got_lo12:" <> pdoc platform lbl <> text "]"
+
+  LDR _f o1 (OpImm (ImmCLbl lbl)) ->
+    text "\tadrp" <+> pprOp platform o1 <> comma <+> pdoc platform lbl $$
+    text "\tadd" <+> pprOp platform o1 <> comma <+> pprOp platform o1 <> comma <+> text ":lo12:" <> pdoc platform lbl
+#endif
+
+  LDR _f o1@(OpReg W8 (RegReal (RealRegSingle i))) o2 | i < 32 ->
+    text "\tldrsb" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+  LDR _f o1@(OpReg W16 (RegReal (RealRegSingle i))) o2 | i < 32 ->
+    text "\tldrsh" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+  LDR _f o1 o2 -> text "\tldr" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+
+  STP _f o1 o2 o3 -> text "\tstp" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  LDP _f o1 o2 o3 -> text "\tldp" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+
+  -- 8. Synchronization Instructions -------------------------------------------
+  DMBSY -> text "\tdmb sy"
+  -- 8. Synchronization Instructions -------------------------------------------
+  FCVT o1 o2 -> text "\tfcvt" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+  SCVTF o1 o2 -> text "\tscvtf" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+  FCVTZS o1 o2 -> text "\tfcvtzs" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+
+pprBcond :: Cond -> SDoc
+pprBcond c = text "b." <> pprCond c
+
+pprCond :: Cond -> SDoc
+pprCond c = case c of
+  ALWAYS -> text "al" -- Always
+  EQ     -> text "eq" -- Equal
+  NE     -> text "ne" -- Not Equal
+
+  SLT    -> text "lt" -- Signed less than                  ; Less than, or unordered
+  SLE    -> text "le" -- Signed less than or equal         ; Less than or equal, or unordered
+  SGE    -> text "ge" -- Signed greater than or equal      ; Greater than or equal
+  SGT    -> text "gt" -- Signed greater than               ; Greater than
+
+  ULT    -> text "lo" -- Carry clear/ unsigned lower       ; less than
+  ULE    -> text "ls" -- Unsigned lower or same            ; Less than or equal
+  UGE    -> text "hs" -- Carry set/unsigned higher or same ; Greater than or equal, or unordered
+  UGT    -> text "hi" -- Unsigned higher                   ; Greater than, or unordered
+
+  NEVER  -> text "nv" -- Never
+  VS     -> text "vs" -- Overflow                          ; Unordered (at least one NaN operand)
+  VC     -> text "vc" -- No overflow                       ; Not unordered
+
+  -- Orderd variants.  Respecting NaN.
+  OLT    -> text "mi"
+  OLE    -> text "ls"
+  OGE    -> text "ge"
+  OGT    -> text "gt"
+
+  -- Unordered
+  UOLT   -> text "lt"
+  UOLE   -> text "le"
+  UOGE   -> text "pl"
+  UOGT   -> text "hi"
--- a/compiler/GHC/CmmToAsm/AArch64/README.md
+++ b/compiler/GHC/CmmToAsm/AArch64/README.md
+# Native Code Gen for AArch64. Naive.
+
+- [ ] Give OpReg a width, and make sure we print the corresponding reg
+- [ ] Separate out FDIV
+- [ ] Switch statements may have rather large offsets, and not that large ranges.
+      subtracting the offset to make them 0 might reduce the number of cmp calls.
+
+- [ ] Make the compiler spit out to stdout with -o -
+- [ ] Allow us to terminate after a phase.
+- [ ] Add fuse phase to turn ldr,ldr itno ldp, str,str into stp.
+- [ ] Haskell Grinder Repo with test cases that excessively test the compiler
+      over a longer period of time.
+
+- [ ] Clean up
+- [ ] Document more
+
+## Intro
+
+The AArch64 architecture by Arm, is part of Armv8, the ISA is called A64.  This
+is the best ressource on the arm website: https://developer.arm.com/architectures/cpu-architecture/a-profile/exploration-tools
+The HTML view has an instruction lookup, the xml file is also good.
+
+### Cmm
+
+@GHC.CmmToAsm@ contains the top-level entry point @nativeCodeGen@,
+which will be called from @GHC.Driver.CodeOutput.outputAsm@, which
+is triggered via the @-fasm@ flag.
+
+Thus, we'll not concern ourselves with anything in the pipeline
+before that point.
+
+```
+outputAsm :: DynFlags -> Module -> ModLocation -> FilePath
+          -> Stream IO RawCmmGroup a
+          -> IO a
+
+nativeCodeGen :: forall a . DynFlags -> Module -> ModLocation -> Handle -> UniqSupply
+              -> Stream IO RawCmmGroup a
+              -> IO a
+```
+
+@nativeCodeGen@ will produce a configuration through @initConfig dflags@,
+and pass that usually to the architecture specific implementation. E.g.
+@x86NcgImpl config@ or @ppcNcgImpl config@.  Those are expected to produce
+
+```
+nCG' :: ( Outputable statics, Outputable instr
+        , Outputable jumpDest, Instruction instr)
+     => NcgImpl statics instr jumpDest
+```
+
+this is then fed into @nativeCodeGen'@, which will produce the final output,
+in the form of @IO a@.
+
+Therefore the job of a *new* NCG is to provide and @NcgImpl@, that can
+take a @config :: NCGConfig@, that contains platform, alignment, debug, PIC,
+and other configuraiton specific information.
+
+@NcgImpl@ looks like the following:
+
+```
+data NcgImpl statics instr jumpDest = NcgImpl {
+    ncgConfig                 :: !NCGConfig,
+    cmmTopCodeGen             :: RawCmmDecl -> NatM [NatCmmDecl statics instr],
+    generateJumpTableForInstr :: instr -> Maybe (NatCmmDecl statics instr),
+    getJumpDestBlockId        :: jumpDest -> Maybe BlockId,
+    canShortcut               :: instr -> Maybe jumpDest,
+    shortcutStatics           :: (BlockId -> Maybe jumpDest) -> statics -> statics,
+    shortcutJump              :: (BlockId -> Maybe jumpDest) -> instr -> instr,
+    pprNatCmmDecl             :: NatCmmDecl statics instr -> SDoc,
+    maxSpillSlots             :: Int,
+    allocatableRegs           :: [RealReg],
+    ncgExpandTop              :: [NatCmmDecl statics instr] -> [NatCmmDecl statics instr],
+    ncgAllocMoreStack         :: Int -> NatCmmDecl statics instr
+                              -> UniqSM (NatCmmDecl statics instr, [(BlockId,BlockId)]),
+    -- ^ The list of block ids records the redirected jumps to allow us to update
+    -- the CFG.
+    ncgMakeFarBranches        :: LabelMap RawCmmStatics -> [NatBasicBlock instr] -> [NatBasicBlock instr],
+    extractUnwindPoints       :: [instr] -> [UnwindPoint],
+    -- ^ given the instruction sequence of a block, produce a list of
+    -- the block's 'UnwindPoint's
+    -- See Note [What is this unwinding business?] in "GHC.Cmm.DebugBlock"
+    -- and Note [Unwinding information in the NCG] in this module.
+    invertCondBranches        :: Maybe CFG -> LabelMap RawCmmStatics -> [NatBasicBlock instr]
+                              -> [NatBasicBlock instr]
+    -- ^ Turn the sequence of `jcc l1; jmp l2` into `jncc l2; <block_l1>`
+    -- when possible.
+    }
+```
+
+We'll thus start with the bare minimum.  Ignoring most flags for now.  The
+approach we'll take is the following which has proven to work for the llvm-ng
+backend.  We'll re-use the llvm code gen, and start with a trivial Main module:
+
+```
+  main = putStrLn "Hello AArch64 NCG!"
+```
+
+and compile this with @ghc -fasm -c Main.hs@ and then link it manually (pot.
+after inspection).
+
+To build GHC, we'll use a cross compiler and nix and hadrian:
+
+```bash
+nix-shell --pure -p '[ python3 haskellPackages.alex haskellPackages.happy (haskell.compiler.ghc883.override { ghcFlavour = "prof"; }) cabal-install autoconf automake gmp.dev ripgrep zlib llvmPackages_9.llvm llvmPackages_9.clang pkgsCross.aarch64-multiplatform.buildPackages.binutils pkgsCross.aarch64-multiplatform.stdenv.cc perl git linuxHeaders elf-header pkgsCross.aarch64-multiplatform.gmp.dev file qemu htop ]'
+$ NM=aarch64-unknown-linux-gnu-nm LD=aarch64-unknown-linux-gnu-ld.gold AR=aarch64-unknown-linux-gnu-ar AS=aarch64-unknown-linux-gnu-as CC=aarch64-unknown-linux-gnu-cc CXX=aarch64-unknown-linux-gnu-cxx ./configure --target=aarch64-unknown-linux-gnu
+-- hadrian produces a dud :-/
+$ ./hadrian/build  --flavour=quickest -j
+-- useing make (ensure mk/build.mk is set to quick-llvm)
+$ make -j -s
+```
\ No newline at end of file
No results found