Compare revisions

ada04031 · ada04031 · ada04031 · ada04031 · ada04031 · ada04031
--- a/compiler/GHC/Cmm/Type.hs
+++ b/compiler/GHC/Cmm/Type.hs
@@ -4,7 +4,7 @@ module GHC.Cmm.Type
    , cInt
    , cmmBits, cmmFloat
    , typeWidth, setCmmTypeWidth
-    , cmmEqType, cmmEqType_ignoring_ptrhood
+    , cmmEqType, cmmCompatType
    , isFloatType, isGcPtrType, isBitsType
    , isWordAny, isWord32, isWord64
    , isFloat64, isFloat32
@@ -87,21 +87,27 @@ instance Outputable CmmCat where
 cmmEqType :: CmmType -> CmmType -> Bool -- Exact equality
 cmmEqType (CmmType c1 w1) (CmmType c2 w2) = c1==c2 && w1==w2

-cmmEqType_ignoring_ptrhood :: CmmType -> CmmType -> Bool
-  -- This equality is temporary; used in CmmLint
-  -- but the RTS files are not yet well-typed wrt pointers
-cmmEqType_ignoring_ptrhood (CmmType c1 w1) (CmmType c2 w2)
-   = c1 `weak_eq` c2 && w1==w2
+-- | A weaker notion of equality of 'CmmType's than 'cmmEqType',
+-- used (only) in Cmm Lint.
+--
+-- Why "weaker"? Because:
+--
+--  - we don't distinguish GcPtr vs NonGcPtr, because the the RTS files
+--    are not yet well-typed wrt pointers,
+--  - for vectors, we only compare the widths, because in practice things like
+--    X86 xmm registers support different types of data (e.g. 4xf32, 2xf64, 2xu64 etc).
+cmmCompatType :: CmmType -> CmmType -> Bool
+cmmCompatType (CmmType c1 w1) (CmmType c2 w2)
+   = c1 `weak_eq` c2 && w1 == w2
   where
     weak_eq :: CmmCat -> CmmCat -> Bool
-     FloatCat         `weak_eq` FloatCat         = True
-     FloatCat         `weak_eq` _other           = False
-     _other           `weak_eq` FloatCat         = False
-     (VecCat l1 cat1) `weak_eq` (VecCat l2 cat2) = l1 == l2
-                                                   && cat1 `weak_eq` cat2
-     (VecCat {})      `weak_eq` _other           = False
-     _other           `weak_eq` (VecCat {})      = False
-     _word1           `weak_eq` _word2           = True        -- Ignores GcPtr
+     FloatCat    `weak_eq` FloatCat    = True
+     FloatCat    `weak_eq` _other      = False
+     _other      `weak_eq` FloatCat    = False
+     (VecCat {}) `weak_eq` (VecCat {}) = True  -- only compare overall width
+     (VecCat {}) `weak_eq` _other      = False
+     _other      `weak_eq` (VecCat {}) = False
+     _word1      `weak_eq` _word2      = True  -- Ignores GcPtr

 --- Simple operations on CmmType -----
 typeWidth :: CmmType -> Width

--- a/compiler/GHC/Cmm/UniqueRenamer.hs
+++ b/compiler/GHC/Cmm/UniqueRenamer.hs
+{-# LANGUAGE LambdaCase, RecordWildCards, MagicHash, UnboxedTuples, PatternSynonyms, ExplicitNamespaces #-}
+module GHC.Cmm.UniqueRenamer
+  ( detRenameCmmGroup
+  , detRenameIPEMap
+  , MonadGetUnique(..)
+
+  -- Careful! Not for general use!
+  , DetUniqFM, emptyDetUFM
+
+  , module GHC.Types.Unique.DSM
+  )
+  where
+
+import GHC.Prelude
+import GHC.Utils.Monad.State.Strict
+import Data.Tuple (swap)
+import GHC.Word
+import GHC.Cmm
+import GHC.Cmm.CLabel
+import GHC.Cmm.Dataflow.Block
+import GHC.Cmm.Dataflow.Graph
+import GHC.Cmm.Dataflow.Label
+import GHC.Cmm.Switch
+import GHC.Types.Unique
+import GHC.Types.Unique.FM
+import GHC.Types.Unique.DFM
+import GHC.Utils.Outputable as Outputable
+import GHC.Types.Id
+import GHC.Types.Unique.DSM
+import GHC.Types.Name hiding (varName)
+import GHC.Types.Var
+import GHC.Types.IPE
+
+{-
+Note [Renaming uniques deterministically]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+As mentioned by Note [Object determinism], a key step in producing
+deterministic objects is to rename all existing uniques deterministically.
+
+An important observation is that GHC already produces code in a deterministic
+order, both declarations (say, A_closure always comes before B_closure) and the
+instructions and data within.
+
+We can leverage this /deterministic order/ to
+rename all uniques deterministically, by traversing, specifically, Cmm code
+fresh off of StgToCmm and assigning a new unique from a deterministic supply
+(an incrementing counter) to every non-external unique in the order they are found.
+
+Since the order is deterministic across runs, so will the renamed uniques.
+
+This Cmm renaming pass is guarded by -fobject-determinism because it means the
+compiler must do more work. However, performance profiling has shown the impact
+to be small enough that we should consider enabling -fobject-determinism by
+default instead eventually.
+-}
+
+-- | A mapping from non-deterministic uniques to deterministic uniques, to
+-- rename local symbols with the end goal of producing deterministic object files.
+-- See Note [Renaming uniques deterministically]
+data DetUniqFM = DetUniqFM
+  { mapping :: !(UniqFM Unique Unique)
+  , supply  :: !Word64
+  }
+
+instance Outputable DetUniqFM where
+  ppr DetUniqFM{mapping, supply} =
+    ppr mapping $$
+    text "supply:" Outputable.<> ppr supply
+
+type DetRnM = State DetUniqFM
+
+emptyDetUFM :: DetUniqFM
+emptyDetUFM = DetUniqFM
+  { mapping = emptyUFM
+  -- NB: A lower initial value can get us label `Lsl` which is not parsed
+  -- correctly in older versions of LLVM assembler (llvm-project#80571)
+  -- So we use an `x` s.t. w64ToBase62 x > "R" > "L" > "r" > "l"
+  , supply = 54
+  }
+
+renameDetUniq :: Unique -> DetRnM Unique
+renameDetUniq uq = do
+  m <- gets mapping
+  case lookupUFM m uq of
+    Nothing -> do
+      new_w <- gets supply -- New deterministic unique in this `DetRnM`
+      let --(_tag, _) = unpkUnique uq
+          det_uniq = mkUnique 'Q' new_w
+      modify (\DetUniqFM{mapping, supply} ->
+        -- Update supply and mapping
+        DetUniqFM
+          { mapping = addToUFM mapping uq det_uniq
+          , supply = supply + 1
+          })
+      return det_uniq
+    Just det_uniq ->
+      return det_uniq
+
+-- The most important function here, which does the actual renaming.
+detRenameCLabel :: CLabel -> DetRnM CLabel
+detRenameCLabel = mapInternalNonDetUniques renameDetUniq
+
+-- | We want to rename uniques in Ids, but ONLY internal ones.
+detRenameId :: Id -> DetRnM Id
+detRenameId i
+  | isExternalName (varName i) = return i
+  | otherwise = setIdUnique i <$> renameDetUniq (getUnique i)
+
+-- | Similar to `detRenameId`, but for `Name`.
+detRenameName :: Name -> DetRnM Name
+detRenameName n
+  | isExternalName n = return n
+  | otherwise = setNameUnique n <$> renameDetUniq (getUnique n)
+
+detRenameCmmGroup :: DetUniqFM -> DCmmGroup -> (DetUniqFM, CmmGroup)
+detRenameCmmGroup dufm group = swap (runState (mapM detRenameCmmDecl group) dufm)
+  where
+    detRenameCmmDecl :: DCmmDecl -> DetRnM CmmDecl
+    detRenameCmmDecl (CmmProc h lbl regs g)
+      = do
+        h' <- detRenameCmmTop h
+        lbl' <- detRenameCLabel lbl
+        regs' <- mapM detRenameGlobalRegUse regs
+        g' <- detRenameCmmGraph g
+        return (CmmProc h' lbl' regs' g')
+    detRenameCmmDecl (CmmData sec d)
+      = CmmData <$> detRenameSection sec <*> detRenameCmmStatics d
+
+    detRenameCmmTop :: DCmmTopInfo -> DetRnM CmmTopInfo
+    detRenameCmmTop (TopInfo (DWrap i) b)
+      = TopInfo . mapFromList <$> mapM (detRenamePair detRenameLabel detRenameCmmInfoTable) i <*> pure b
+
+    detRenameCmmGraph :: DCmmGraph -> DetRnM CmmGraph
+    detRenameCmmGraph (CmmGraph entry bs)
+      = CmmGraph <$> detRenameLabel entry <*> detRenameGraph bs
+
+    detRenameGraph = \case
+      GNil  -> pure GNil
+      GUnit block -> GUnit <$> detRenameBlock block
+      GMany m1 b m2 -> GMany <$> detRenameMaybeBlock m1 <*> detRenameBody b <*> detRenameMaybeBlock m2
+
+    detRenameBody (DWrap b)
+      = mapFromList <$> mapM (detRenamePair detRenameLabel detRenameBlock) b
+
+    detRenameCmmStatics :: CmmStatics -> DetRnM CmmStatics
+    detRenameCmmStatics
+      (CmmStatics clbl info ccs lits1 lits2)
+        = CmmStatics <$> detRenameCLabel clbl <*> detRenameCmmInfoTable info <*> pure ccs <*> mapM detRenameCmmLit lits1 <*> mapM detRenameCmmLit lits2
+    detRenameCmmStatics
+      (CmmStaticsRaw lbl sts)
+        = CmmStaticsRaw <$> detRenameCLabel lbl <*> mapM detRenameCmmStatic sts
+
+    detRenameCmmInfoTable :: CmmInfoTable -> DetRnM CmmInfoTable
+    detRenameCmmInfoTable
+      CmmInfoTable{cit_lbl, cit_rep, cit_prof, cit_srt, cit_clo}
+        = CmmInfoTable <$> detRenameCLabel cit_lbl <*> pure cit_rep <*> pure cit_prof <*> detRenameMaybe detRenameCLabel cit_srt <*>
+           (case cit_clo of
+              Nothing -> pure Nothing
+              Just (an_id, ccs) -> Just . (,ccs) <$> detRenameId an_id)
+
+    detRenameCmmStatic :: CmmStatic -> DetRnM CmmStatic
+    detRenameCmmStatic = \case
+      CmmStaticLit l -> CmmStaticLit <$> detRenameCmmLit l
+      CmmUninitialised x -> pure $ CmmUninitialised x
+      CmmString x -> pure $ CmmString x
+      CmmFileEmbed f i -> pure $ CmmFileEmbed f i
+
+    detRenameCmmLit :: CmmLit -> DetRnM CmmLit
+    detRenameCmmLit = \case
+      CmmInt i w -> pure $ CmmInt i w
+      CmmFloat r w -> pure $ CmmFloat r w
+      CmmVec lits -> CmmVec <$> mapM detRenameCmmLit lits
+      CmmLabel lbl -> CmmLabel <$> detRenameCLabel lbl
+      CmmLabelOff lbl i -> CmmLabelOff <$> detRenameCLabel lbl <*> pure i
+      CmmLabelDiffOff lbl1 lbl2 i w ->
+        CmmLabelDiffOff <$> detRenameCLabel lbl1 <*> detRenameCLabel lbl2 <*> pure i <*> pure w
+      CmmBlock bid -> CmmBlock <$> detRenameLabel bid
+      CmmHighStackMark -> pure CmmHighStackMark
+
+    detRenameMaybeBlock :: MaybeO n (Block CmmNode a b) -> DetRnM (MaybeO n (Block CmmNode a b))
+    detRenameMaybeBlock (JustO x) = JustO <$> detRenameBlock x
+    detRenameMaybeBlock NothingO = pure NothingO
+
+    detRenameBlock :: Block CmmNode n m -> DetRnM (Block CmmNode n m)
+    detRenameBlock = \case
+      BlockCO n bn -> BlockCO <$> detRenameCmmNode n <*> detRenameBlock bn
+      BlockCC n1 bn n2 -> BlockCC <$> detRenameCmmNode n1 <*> detRenameBlock bn <*> detRenameCmmNode n2
+      BlockOC bn n -> BlockOC <$> detRenameBlock bn <*> detRenameCmmNode n
+      BNil    -> pure BNil
+      BMiddle n -> BMiddle <$> detRenameCmmNode n
+      BCat    b1 b2 -> BCat <$> detRenameBlock b1 <*> detRenameBlock b2
+      BSnoc   bn n -> BSnoc <$> detRenameBlock bn <*> detRenameCmmNode n
+      BCons   n bn -> BCons <$> detRenameCmmNode n <*> detRenameBlock bn
+
+    detRenameCmmNode :: CmmNode n m -> DetRnM (CmmNode n m)
+    detRenameCmmNode = \case
+      CmmEntry l t -> CmmEntry <$> detRenameLabel l <*> detRenameCmmTick t
+      CmmComment fs -> pure $ CmmComment fs
+      CmmTick tickish -> pure $ CmmTick tickish
+      CmmUnwind xs -> CmmUnwind <$> mapM (detRenamePair detRenameGlobalReg (detRenameMaybe detRenameCmmExpr)) xs
+      CmmAssign reg e -> CmmAssign <$> detRenameCmmReg reg <*> detRenameCmmExpr e
+      CmmStore e1 e2 align -> CmmStore <$> detRenameCmmExpr e1 <*> detRenameCmmExpr e2 <*> pure align
+      CmmUnsafeForeignCall ftgt cmmformal cmmactual ->
+        CmmUnsafeForeignCall <$> detRenameForeignTarget ftgt <*> mapM detRenameLocalReg cmmformal <*> mapM detRenameCmmExpr cmmactual
+      CmmBranch l -> CmmBranch <$> detRenameLabel l
+      CmmCondBranch pred t f likely ->
+        CmmCondBranch <$> detRenameCmmExpr pred <*> detRenameLabel t <*> detRenameLabel f <*> pure likely
+      CmmSwitch e sts -> CmmSwitch <$> detRenameCmmExpr e <*> mapSwitchTargetsA detRenameLabel sts
+      CmmCall tgt cont regs args retargs retoff ->
+        CmmCall <$> detRenameCmmExpr tgt <*> detRenameMaybe detRenameLabel cont <*> mapM detRenameGlobalRegUse regs
+                <*> pure args <*> pure retargs <*> pure retoff
+      CmmForeignCall tgt res args succ retargs retoff intrbl ->
+        CmmForeignCall <$> detRenameForeignTarget tgt <*> mapM detRenameLocalReg res <*> mapM detRenameCmmExpr args
+                       <*> detRenameLabel succ <*> pure retargs <*> pure retoff <*> pure intrbl
+
+    detRenameCmmExpr :: CmmExpr -> DetRnM CmmExpr
+    detRenameCmmExpr = \case
+      CmmLit l -> CmmLit <$> detRenameCmmLit l
+      CmmLoad e t a -> CmmLoad <$> detRenameCmmExpr e <*> pure t <*> pure a
+      CmmReg r -> CmmReg <$> detRenameCmmReg r
+      CmmMachOp mop es -> CmmMachOp mop <$> mapM detRenameCmmExpr es
+      CmmStackSlot a i -> CmmStackSlot <$> detRenameArea a <*> pure i
+      CmmRegOff r i -> CmmRegOff <$> detRenameCmmReg r <*> pure i
+
+    detRenameForeignTarget :: ForeignTarget -> DetRnM ForeignTarget
+    detRenameForeignTarget = \case
+        ForeignTarget e fc -> ForeignTarget <$> detRenameCmmExpr e <*> pure fc
+        PrimTarget cmop -> pure $ PrimTarget cmop
+
+    detRenameArea :: Area -> DetRnM Area
+    detRenameArea Old = pure Old
+    detRenameArea (Young l) = Young <$> detRenameLabel l
+
+    detRenameLabel :: Label -> DetRnM Label
+    detRenameLabel lbl
+      = mkHooplLabel . getKey <$> renameDetUniq (getUnique lbl)
+
+    detRenameSection :: Section -> DetRnM Section
+    detRenameSection (Section ty lbl)
+      = Section ty <$> detRenameCLabel lbl
+
+    detRenameCmmReg :: CmmReg -> DetRnM CmmReg
+    detRenameCmmReg = \case
+      CmmLocal l -> CmmLocal <$> detRenameLocalReg l
+      CmmGlobal x -> pure $ CmmGlobal x
+
+    detRenameLocalReg :: LocalReg -> DetRnM LocalReg
+    detRenameLocalReg (LocalReg uq t)
+      = LocalReg <$> renameDetUniq uq <*> pure t
+
+    -- Global registers don't need to be renamed.
+    detRenameGlobalReg :: GlobalReg -> DetRnM GlobalReg
+    detRenameGlobalReg = pure
+    detRenameGlobalRegUse :: GlobalRegUse -> DetRnM GlobalRegUse
+    detRenameGlobalRegUse = pure
+
+    -- todo: We may have to change this to get deterministic objects with ticks.
+    detRenameCmmTick :: CmmTickScope -> DetRnM CmmTickScope
+    detRenameCmmTick = pure
+
+    detRenameMaybe _ Nothing = pure Nothing
+    detRenameMaybe f (Just x) = Just <$> f x
+
+    detRenamePair f g (a, b) = (,) <$> f a <*> g b
+
+detRenameIPEMap :: DetUniqFM -> InfoTableProvMap -> (DetUniqFM, InfoTableProvMap)
+detRenameIPEMap dufm InfoTableProvMap{ provDC, provClosure, provInfoTables } =
+    (dufm2, InfoTableProvMap { provDC, provClosure = cm, provInfoTables })
+  where
+    (cm, dufm2) = runState (detRenameClosureMap provClosure) dufm
+
+    detRenameClosureMap :: ClosureMap -> DetRnM ClosureMap
+    detRenameClosureMap m =
+      -- `eltsUDFM` preserves the deterministic order, but it doesn't matter
+      -- since we will rename all uniques deterministically, thus the
+      -- reconstructed map will necessarily be deterministic too.
+      listToUDFM <$> mapM (\(n,r) -> do
+        n' <- detRenameName n
+        return (n', (n', r))
+        ) (eltsUDFM m)
--- a/compiler/GHC/Cmm/Utils.hs
+++ b/compiler/GHC/Cmm/Utils.hs
-{-# LANGUAGE GADTs, RankNTypes #-}
-{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE GADTs #-}
 {-# LANGUAGE LambdaCase #-}

-{-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-}
-
 -----------------------------------------------------------------------------
 --
 -- Cmm utilities.
@@ -70,7 +67,7 @@ module GHC.Cmm.Utils(
 import GHC.Prelude

 import GHC.Core.TyCon     ( PrimRep(..), PrimElemRep(..) )
-import GHC.Types.RepType  ( UnaryType, SlotTy (..), typePrimRep1 )
+import GHC.Types.RepType  ( NvUnaryType, SlotTy (..), typePrimRepU )

 import GHC.Platform
 import GHC.Runtime.Heap.Layout
@@ -84,10 +81,10 @@ import GHC.Platform.Regs

 import Data.ByteString (ByteString)
 import qualified Data.ByteString as BS
+import Data.Foldable (toList)
 import GHC.Cmm.Dataflow.Graph
 import GHC.Cmm.Dataflow.Label
 import GHC.Cmm.Dataflow.Block
-import GHC.Cmm.Dataflow.Collections

 ---------------------------------------------------
 --
@@ -97,7 +94,6 @@ import GHC.Cmm.Dataflow.Collections

 primRepCmmType :: Platform -> PrimRep -> CmmType
 primRepCmmType platform = \case
-   VoidRep          -> panic "primRepCmmType:VoidRep"
   BoxedRep _       -> gcWord platform
   IntRep           -> bWord platform
   WordRep          -> bWord platform
@@ -136,11 +132,10 @@ primElemRepCmmType Word64ElemRep = b64
 primElemRepCmmType FloatElemRep  = f32
 primElemRepCmmType DoubleElemRep = f64

-typeCmmType :: Platform -> UnaryType -> CmmType
-typeCmmType platform ty = primRepCmmType platform (typePrimRep1 ty)
+typeCmmType :: Platform -> NvUnaryType -> CmmType
+typeCmmType platform ty = primRepCmmType platform (typePrimRepU ty)

 primRepForeignHint :: PrimRep -> ForeignHint
-primRepForeignHint VoidRep      = panic "primRepForeignHint:VoidRep"
 primRepForeignHint (BoxedRep _) = AddrHint
 primRepForeignHint IntRep       = SignedHint
 primRepForeignHint Int8Rep      = SignedHint
@@ -157,8 +152,8 @@ primRepForeignHint FloatRep     = NoHint
 primRepForeignHint DoubleRep    = NoHint
 primRepForeignHint (VecRep {})  = NoHint

-typeForeignHint :: UnaryType -> ForeignHint
-typeForeignHint = primRepForeignHint . typePrimRep1
+typeForeignHint :: NvUnaryType -> ForeignHint
+typeForeignHint = primRepForeignHint . typePrimRepU

 ---------------------------------------------------
 --
@@ -524,14 +519,12 @@ ofBlockMap entry bodyMap = CmmGraph {g_entry=entry, g_graph=GMany NothingO bodyM

 -- | like 'toBlockList', but the entry block always comes first
 toBlockListEntryFirst :: CmmGraph -> [CmmBlock]
-toBlockListEntryFirst g
-  | mapNull m  = []
-  | otherwise  = entry_block : others
+toBlockListEntryFirst g = do
+    entry_block <- toList $ mapLookup entry_id m
+    entry_block : filter ((/= entry_id) . entryLabel) (mapElems m)
  where
    m = toBlockMap g
    entry_id = g_entry g
-    Just entry_block = mapLookup entry_id m
-    others = filter ((/= entry_id) . entryLabel) (mapElems m)

 -- | Like 'toBlockListEntryFirst', but we strive to ensure that we order blocks
 -- so that the false case of a conditional jumps to the next block in the output
@@ -542,13 +535,10 @@ toBlockListEntryFirst g
 -- of successors returned for CmmCondBranch by the NonLocal instance for CmmNode
 -- defined in "GHC.Cmm.Node". -GBM
 toBlockListEntryFirstFalseFallthrough :: CmmGraph -> [CmmBlock]
-toBlockListEntryFirstFalseFallthrough g
-  | mapNull m  = []
-  | otherwise  = dfs setEmpty [entry_block]
+toBlockListEntryFirstFalseFallthrough g = dfs setEmpty $ toList $ mapLookup entry_id m
  where
    m = toBlockMap g
    entry_id = g_entry g
-    Just entry_block = mapLookup entry_id m

    dfs :: LabelSet -> [CmmBlock] -> [CmmBlock]
    dfs _ [] = []

--- a/compiler/GHC/CmmToAsm.hs
+++ b/compiler/GHC/CmmToAsm.hs
@@ -5,16 +5,6 @@
 --
 -- -----------------------------------------------------------------------------

-{-# LANGUAGE BangPatterns #-}
-{-# LANGUAGE DeriveFunctor #-}
-{-# LANGUAGE FlexibleContexts #-}
-{-# LANGUAGE FlexibleInstances #-}
-{-# LANGUAGE GADTs #-}
-{-# LANGUAGE MultiParamTypeClasses #-}
-{-# LANGUAGE PatternSynonyms #-}
-{-# LANGUAGE ScopedTypeVariables #-}
-{-# LANGUAGE UnboxedTuples #-}
-
 -- | Note [Native code generator]
 -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 --
@@ -77,6 +67,7 @@ import qualified GHC.CmmToAsm.X86   as X86
 import qualified GHC.CmmToAsm.PPC   as PPC
 import qualified GHC.CmmToAsm.AArch64 as AArch64
 import qualified GHC.CmmToAsm.Wasm as Wasm32
+import qualified GHC.CmmToAsm.RV64  as RV64

 import GHC.CmmToAsm.Reg.Liveness
 import qualified GHC.CmmToAsm.Reg.Linear                as Linear
@@ -105,15 +96,12 @@ import GHC.Cmm.DebugBlock
 import GHC.Cmm.BlockId
 import GHC.StgToCmm.CgUtils ( fixStgRegisters )
 import GHC.Cmm
-import GHC.Cmm.Utils
-import GHC.Cmm.Dataflow.Collections
 import GHC.Cmm.Dataflow.Label
-import GHC.Cmm.Dataflow.Block
-import GHC.Cmm.Opt           ( cmmMachOpFold )
+import GHC.Cmm.GenericOpt
 import GHC.Cmm.CLabel

 import GHC.Types.Unique.FM
-import GHC.Types.Unique.Supply
+import GHC.Types.Unique.DSM
 import GHC.Driver.DynFlags
 import GHC.Driver.Ppr
 import GHC.Utils.Misc
@@ -129,7 +117,8 @@ import GHC.Utils.Constants (debugIsOn)
 import GHC.Data.FastString
 import GHC.Types.Unique.Set
 import GHC.Unit
-import GHC.Data.Stream (Stream)
+import GHC.StgToCmm.CgUtils (CgStream)
+import GHC.Data.Stream (liftIO)
 import qualified GHC.Data.Stream as Stream
 import GHC.Settings

@@ -142,14 +131,14 @@ import System.IO
 import System.Directory ( getCurrentDirectory )

 --------------------
-nativeCodeGen :: forall a . Logger -> ToolSettings -> NCGConfig -> ModLocation -> Handle -> UniqSupply
-              -> Stream IO RawCmmGroup a
-              -> IO a
-nativeCodeGen logger ts config modLoc h us cmms
+nativeCodeGen :: forall a . Logger -> ToolSettings -> NCGConfig -> ModLocation -> Handle
+              -> CgStream RawCmmGroup a
+              -> UniqDSMT IO a
+nativeCodeGen logger ts config modLoc h cmms
 = let platform = ncgPlatform config
       nCG' :: ( OutputableP Platform statics, Outputable jumpDest, Instruction instr)
-            => NcgImpl statics instr jumpDest -> IO a
-       nCG' ncgImpl = nativeCodeGen' logger config modLoc ncgImpl h us cmms
+            => NcgImpl statics instr jumpDest -> UniqDSMT IO a
+       nCG' ncgImpl = nativeCodeGen' logger config modLoc ncgImpl h cmms
   in case platformArch platform of
      ArchX86       -> nCG' (X86.ncgX86     config)
      ArchX86_64    -> nCG' (X86.ncgX86_64  config)
@@ -161,11 +150,11 @@ nativeCodeGen logger ts config modLoc h us cmms
      ArchAlpha     -> panic "nativeCodeGen: No NCG for Alpha"
      ArchMipseb    -> panic "nativeCodeGen: No NCG for mipseb"
      ArchMipsel    -> panic "nativeCodeGen: No NCG for mipsel"
-      ArchRISCV64   -> panic "nativeCodeGen: No NCG for RISCV64"
+      ArchRISCV64   -> nCG' (RV64.ncgRV64 config)
      ArchLoongArch64->panic "nativeCodeGen: No NCG for LoongArch64"
      ArchUnknown   -> panic "nativeCodeGen: No NCG for unknown arch"
      ArchJavaScript-> panic "nativeCodeGen: No NCG for JavaScript"
-      ArchWasm32    -> Wasm32.ncgWasm logger platform ts us modLoc h cmms
+      ArchWasm32    -> Wasm32.ncgWasm config logger platform ts modLoc h cmms

 -- | Data accumulated during code generation. Mostly about statistics,
 -- but also collects debug data for DWARF generation.
@@ -216,19 +205,17 @@ nativeCodeGen' :: (OutputableP Platform statics, Outputable jumpDest, Instructio
               -> ModLocation
               -> NcgImpl statics instr jumpDest
               -> Handle
-               -> UniqSupply
-               -> Stream IO RawCmmGroup a
-               -> IO a
-nativeCodeGen' logger config modLoc ncgImpl h us cmms
+               -> CgStream RawCmmGroup a
+               -> UniqDSMT IO a
+nativeCodeGen' logger config modLoc ncgImpl h cmms
 = do
        -- BufHandle is a performance hack.  We could hide it inside
        -- Pretty if it weren't for the fact that we do lots of little
        -- printDocs here (in order to do codegen in constant space).
-        bufh <- newBufHandle h
+        bufh <- liftIO $ newBufHandle h
        let ngs0 = NGS [] [] [] [] [] [] emptyUFM mapEmpty
-        (ngs, us', a) <- cmmNativeGenStream logger config modLoc ncgImpl bufh us
-                                         cmms ngs0
-        _ <- finishNativeGen logger config modLoc bufh us' ngs
+        (ngs, a) <- cmmNativeGenStream logger config modLoc ncgImpl bufh cmms ngs0
+        _ <- finishNativeGen logger config modLoc bufh ngs
        return a

 finishNativeGen :: Instruction instr
@@ -236,23 +223,24 @@ finishNativeGen :: Instruction instr
                -> NCGConfig
                -> ModLocation
                -> BufHandle
-                -> UniqSupply
                -> NativeGenAcc statics instr
-                -> IO UniqSupply
-finishNativeGen logger config modLoc bufh us ngs
+                -> UniqDSMT IO ()
+finishNativeGen logger config modLoc bufh ngs
 = withTimingSilent logger (text "NCG") (`seq` ()) $ do
-        -- Write debug data and finish
-        us' <- if not (ncgDwarfEnabled config)
-                  then return us
-                  else do
-                     compPath <- getCurrentDirectory
-                     let (dwarf_h, us') = dwarfGen compPath config modLoc us (ngs_debug ngs)
-                         (dwarf_s, _)   = dwarfGen compPath config modLoc us (ngs_debug ngs)
-                     emitNativeCode logger config bufh dwarf_h dwarf_s
-                     return us'
+      -- Write debug data and finish
+      if not (ncgDwarfEnabled config)
+        then return ()
+        else withDUS $ \us -> do
+           compPath <- getCurrentDirectory
+           let (dwarf_h, us') = dwarfGen compPath config modLoc us (ngs_debug ngs)
+               (dwarf_s, _)   = dwarfGen compPath config modLoc us (ngs_debug ngs)
+           emitNativeCode logger config bufh dwarf_h dwarf_s
+           return ((), us')
+      liftIO $ do

        -- dump global NCG stats for graph coloring allocator
        let stats = concat (ngs_colorStats ngs)
+            platform = ncgPlatform config
        unless (null stats) $ do

          -- build the global register conflict graph
@@ -263,7 +251,7 @@ finishNativeGen logger config modLoc bufh us ngs

          dump_stats (Color.pprStats stats graphGlobal)

-          let platform = ncgPlatform config
+
          putDumpFileMaybe logger
                  Opt_D_dump_asm_conflicts "Register conflict graph"
                  FormatText
@@ -278,14 +266,14 @@ finishNativeGen logger config modLoc bufh us ngs
        -- dump global NCG stats for linear allocator
        let linearStats = concat (ngs_linearStats ngs)
        unless (null linearStats) $
-          dump_stats (Linear.pprStats (concat (ngs_natives ngs)) linearStats)
+          dump_stats (Linear.pprStats platform (concat (ngs_natives ngs)) linearStats)

        -- write out the imports
        let ctx = ncgAsmContext config
        bPutHDoc bufh ctx $ makeImportsDoc config (concat (ngs_imports ngs))
        bFlush bufh

-        return us'
+        return ()
  where
    dump_stats = logDumpFile logger (mkDumpStyle alwaysQualify)
                   Opt_D_dump_asm_stats "NCG stats"
@@ -297,20 +285,18 @@ cmmNativeGenStream :: forall statics jumpDest instr a . (OutputableP Platform st
              -> ModLocation
              -> NcgImpl statics instr jumpDest
              -> BufHandle
-              -> UniqSupply
-              -> Stream.Stream IO RawCmmGroup a
+              -> CgStream RawCmmGroup a
              -> NativeGenAcc statics instr
-              -> IO (NativeGenAcc statics instr, UniqSupply, a)
+              -> UniqDSMT IO (NativeGenAcc statics instr, a)

-cmmNativeGenStream logger config modLoc ncgImpl h us cmm_stream ngs
- = loop us (Stream.runStream cmm_stream) ngs
+cmmNativeGenStream logger config modLoc ncgImpl h cmm_stream ngs
+ = loop (Stream.runStream cmm_stream) ngs
  where
    ncglabel = text "NCG"
-    loop :: UniqSupply
-              -> Stream.StreamS IO RawCmmGroup a
-              -> NativeGenAcc statics instr
-              -> IO (NativeGenAcc statics instr, UniqSupply, a)
-    loop us s ngs =
+    loop :: Stream.StreamS (UniqDSMT IO) RawCmmGroup a
+         -> NativeGenAcc statics instr
+         -> UniqDSMT IO (NativeGenAcc statics instr, a)
+    loop s ngs =
      case s of
        Stream.Done a ->
          return (ngs { ngs_imports = reverse $ ngs_imports ngs
@@ -318,35 +304,33 @@ cmmNativeGenStream logger config modLoc ncgImpl h us cmm_stream ngs
                      , ngs_colorStats = reverse $ ngs_colorStats ngs
                      , ngs_linearStats = reverse $ ngs_linearStats ngs
                      },
-                  us,
                  a)
-        Stream.Effect m -> m >>= \cmm_stream' -> loop us cmm_stream' ngs
+        Stream.Effect m -> m >>= \cmm_stream' -> loop cmm_stream' ngs
        Stream.Yield cmms cmm_stream' -> do
-          (us', ngs'') <-
-            withTimingSilent logger
-                ncglabel (\(a, b) -> a `seq` b `seq` ()) $ do
+          ngs'' <-
+            withTimingSilent logger ncglabel (`seq` ()) $ do
              -- Generate debug information
              let !ndbgs | ncgDwarfEnabled config = cmmDebugGen modLoc cmms
                         | otherwise              = []
                  dbgMap = debugToMap ndbgs

              -- Generate native code
-              (ngs',us') <- cmmNativeGens logger config ncgImpl h
-                                          dbgMap us cmms ngs 0
+              ngs' <- withDUS $ cmmNativeGens logger config ncgImpl h
+                                  dbgMap cmms ngs 0

              -- Link native code information into debug blocks
              -- See Note [What is this unwinding business?] in "GHC.Cmm.DebugBlock".
              let !ldbgs = cmmDebugLink (ngs_labels ngs') (ngs_unwinds ngs') ndbgs
                  platform = ncgPlatform config
-              unless (null ldbgs) $
+              unless (null ldbgs) $ liftIO $
                putDumpFileMaybe logger Opt_D_dump_debug "Debug Infos" FormatText
                  (vcat $ map (pdoc platform) ldbgs)

              -- Accumulate debug information for emission in finishNativeGen.
              let ngs'' = ngs' { ngs_debug = ngs_debug ngs' ++ ldbgs, ngs_labels = [] }
-              return (us', ngs'')
+              return ngs''

-          loop us' cmm_stream' ngs''
+          loop cmm_stream' ngs''


 -- | Do native code generation on all these cmms.
@@ -358,24 +342,24 @@ cmmNativeGens :: forall statics instr jumpDest.
              -> NcgImpl statics instr jumpDest
              -> BufHandle
              -> LabelMap DebugBlock
-              -> UniqSupply
              -> [RawCmmDecl]
              -> NativeGenAcc statics instr
              -> Int
-              -> IO (NativeGenAcc statics instr, UniqSupply)
+              -> DUniqSupply
+              -> IO (NativeGenAcc statics instr, DUniqSupply)

 cmmNativeGens logger config ncgImpl h dbgMap = go
  where
-    go :: UniqSupply -> [RawCmmDecl]
-       -> NativeGenAcc statics instr -> Int
-       -> IO (NativeGenAcc statics instr, UniqSupply)
+    go :: [RawCmmDecl]
+       -> NativeGenAcc statics instr -> Int -> DUniqSupply
+       -> IO (NativeGenAcc statics instr, DUniqSupply)

-    go us [] ngs !_ =
+    go [] ngs !_ !us =
        return (ngs, us)

-    go us (cmm : cmms) ngs count = do
+    go (cmm : cmms) ngs count us = do
        let fileIds = ngs_dwarfFiles ngs
-        (us', fileIds', native, imports, colorStats, linearStats, unwinds)
+        (us', fileIds', native, imports, colorStats, linearStats, unwinds, mcfg)
          <- {-# SCC "cmmNativeGen" #-}
             cmmNativeGen logger ncgImpl us fileIds dbgMap
                          cmm count
@@ -403,7 +387,13 @@ cmmNativeGens logger config ncgImpl h dbgMap = go
        {-# SCC "seqString" #-} evaluate $ seqList (showSDocUnsafe $ vcat $ map (pprAsmLabel platform) imports) ()

        let !labels' = if ncgDwarfEnabled config
-                       then cmmDebugLabels isMetaInstr native else []
+                       then cmmDebugLabels is_valid_label isMetaInstr native else []
+            is_valid_label
+              -- filter dead labels: asm-shortcutting may remove some blocks
+              -- (#22792)
+              | Just cfg <- mcfg = hasNode cfg
+              | otherwise        = const True
+
            !natives' = if logHasDumpFlag logger Opt_D_dump_asm_stats
                        then native : ngs_natives ngs else []

@@ -416,7 +406,7 @@ cmmNativeGens logger config ncgImpl h dbgMap = go
                      , ngs_dwarfFiles  = fileIds'
                      , ngs_unwinds     = ngs_unwinds ngs `mapUnion` unwinds
                      }
-        go us' cmms ngs' (count + 1)
+        go cmms ngs' (count + 1) us'


 -- see Note [pprNatCmmDeclS and pprNatCmmDeclH] in GHC.CmmToAsm.Monad
@@ -437,18 +427,19 @@ cmmNativeGen
    :: forall statics instr jumpDest. (Instruction instr, OutputableP Platform statics, Outputable jumpDest)
    => Logger
    -> NcgImpl statics instr jumpDest
-        -> UniqSupply
+        -> DUniqSupply
        -> DwarfFiles
        -> LabelMap DebugBlock
        -> RawCmmDecl                                   -- ^ the cmm to generate code for
        -> Int                                          -- ^ sequence number of this top thing
-        -> IO   ( UniqSupply
+        -> IO   ( DUniqSupply
                , DwarfFiles
                , [NatCmmDecl statics instr]                -- native code
                , [CLabel]                                  -- things imported by this cmm
                , Maybe [Color.RegAllocStats statics instr] -- stats for the coloring register allocator
                , Maybe [Linear.RegAllocStats]              -- stats for the linear register allocators
                , LabelMap [UnwindPoint]                    -- unwinding information for blocks
+                , Maybe CFG                                 -- final CFG
                )

 cmmNativeGen logger ncgImpl us fileIds dbgMap cmm count
@@ -481,7 +472,7 @@ cmmNativeGen logger ncgImpl us fileIds dbgMap cmm count
        -- generate native code from cmm
        let ((native, lastMinuteImports, fileIds', nativeCfgWeights), usGen) =
                {-# SCC "genMachCode" #-}
-                initUs us $ genMachCode config
+                runUniqueDSM us $ genMachCode config
                                        (cmmTopCodeGen ncgImpl)
                                        fileIds dbgMap opt_cmm cmmCfg

@@ -499,7 +490,7 @@ cmmNativeGen logger ncgImpl us fileIds dbgMap cmm count
                                else Nothing
        let (withLiveness, usLive) =
                {-# SCC "regLiveness" #-}
-                initUs usGen
+                runUniqueDSM usGen
                        $ mapM (cmmTopLiveness livenessCfg platform) native

        putDumpFileMaybe logger
@@ -512,7 +503,7 @@ cmmNativeGen logger ncgImpl us fileIds dbgMap cmm count
         if ( ncgRegsGraph config || ncgRegsIterative config )
          then do
                -- the regs usable for allocation
-                let (alloc_regs :: UniqFM RegClass (UniqSet RealReg))
+                let alloc_regs :: UniqFM RegClass (UniqSet RealReg)
                        = foldr (\r -> plusUFM_C unionUniqSets
                                        $ unitUFM (targetClassOfRealReg platform r) (unitUniqSet r))
                                emptyUFM
@@ -521,7 +512,7 @@ cmmNativeGen logger ncgImpl us fileIds dbgMap cmm count
                -- do the graph coloring register allocation
                let ((alloced, maybe_more_stack, regAllocStats), usAlloc)
                        = {-# SCC "RegAlloc-color" #-}
-                          initUs usLive
+                          runUniqueDSM usLive
                          $ Color.regAlloc
                                config
                                alloc_regs
@@ -531,13 +522,13 @@ cmmNativeGen logger ncgImpl us fileIds dbgMap cmm count
                                livenessCfg

                let ((alloced', stack_updt_blks), usAlloc')
-                        = initUs usAlloc $
-                                case maybe_more_stack of
-                                Nothing     -> return (alloced, [])
-                                Just amount -> do
-                                    (alloced',stack_updt_blks) <- unzip <$>
-                                                (mapM ((ncgAllocMoreStack ncgImpl) amount) alloced)
-                                    return (alloced', concat stack_updt_blks )
+                        = runUniqueDSM usAlloc $
+                            case maybe_more_stack of
+                            Nothing     -> return (alloced, [])
+                            Just amount -> do
+                                (alloced',stack_updt_blks) <- unzip <$>
+                                            (mapM ((ncgAllocMoreStack ncgImpl) amount) alloced)
+                                return (alloced', concat stack_updt_blks )


                -- dump out what happened during register allocation
@@ -581,7 +572,7 @@ cmmNativeGen logger ncgImpl us fileIds dbgMap cmm count

                let ((alloced, regAllocStats, stack_updt_blks), usAlloc)
                        = {-# SCC "RegAlloc-linear" #-}
-                          initUs usLive
+                          runUniqueDSM usLive
                          $ liftM unzip3
                          $ mapM reg_alloc withLiveness

@@ -653,7 +644,7 @@ cmmNativeGen logger ncgImpl us fileIds dbgMap cmm count
        -- sequenced :: [NatCmmDecl statics instr]
        let (sequenced, us_seq) =
                        {-# SCC "sequenceBlocks" #-}
-                        initUs usAlloc $ mapM (BlockLayout.sequenceTop
+                        runUniqueDSM usAlloc $ mapM (BlockLayout.sequenceTop
                                ncgImpl optimizedCFG)
                            shorted

@@ -686,7 +677,9 @@ cmmNativeGen logger ncgImpl us fileIds dbgMap cmm count
                , lastMinuteImports ++ imports
                , ppr_raStatsColor
                , ppr_raStatsLinear
-                , unwinds )
+                , unwinds
+                , optimizedCFG
+                )

 maybeDumpCfg :: Logger -> Maybe CFG -> String -> SDoc -> IO ()
 maybeDumpCfg _logger Nothing _ _ = return ()
@@ -923,7 +916,7 @@ genMachCode
        -> LabelMap DebugBlock
        -> RawCmmDecl
        -> CFG
-        -> UniqSM
+        -> UniqDSM
                ( [NatCmmDecl statics instr]
                , [CLabel]
                , DwarfFiles
@@ -931,208 +924,16 @@ genMachCode
                )

 genMachCode config cmmTopCodeGen fileIds dbgMap cmm_top cmm_cfg
-  = do  { initial_us <- getUniqueSupplyM
-        ; let initial_st           = mkNatM_State initial_us 0 config
-                                                  fileIds dbgMap cmm_cfg
-              (new_tops, final_st) = initNat initial_st (cmmTopCodeGen cmm_top)
-              final_delta          = natm_delta final_st
-              final_imports        = natm_imports final_st
-              final_cfg            = natm_cfg final_st
-        ; if   final_delta == 0
-          then return (new_tops, final_imports
-                      , natm_fileid final_st, final_cfg)
-          else pprPanic "genMachCode: nonzero final delta" (int final_delta)
-    }
-
-- -----------------------------------------------------------------------------
-- Generic Cmm optimiser
-
-{-
-Here we do:
-
-  (a) Constant folding
-  (c) Position independent code and dynamic linking
-        (i)  introduce the appropriate indirections
-             and position independent refs
-        (ii) compile a list of imported symbols
-  (d) Some arch-specific optimizations
-
-(a) will be moving to the new Hoopl pipeline, however, (c) and
-(d) are only needed by the native backend and will continue to live
-here.
-
-Ideas for other things we could do (put these in Hoopl please!):
-
-  - shortcut jumps-to-jumps
-  - simple CSE: if an expr is assigned to a temp, then replace later occs of
-    that expr with the temp, until the expr is no longer valid (can push through
-    temp assignments, and certain assigns to mem...)
-}
-
-cmmToCmm :: NCGConfig -> RawCmmDecl -> (RawCmmDecl, [CLabel])
-cmmToCmm _ top@(CmmData _ _) = (top, [])
-cmmToCmm config (CmmProc info lbl live graph)
-    = runCmmOpt config $
-      do blocks' <- mapM cmmBlockConFold (toBlockList graph)
-         return $ CmmProc info lbl live (ofBlockList (g_entry graph) blocks')
-
-type OptMResult a = (# a, [CLabel] #)
-
-pattern OptMResult :: a -> b -> (# a, b #)
-pattern OptMResult x y = (# x, y #)
-{-# COMPLETE OptMResult #-}
-
-newtype CmmOptM a = CmmOptM (NCGConfig -> [CLabel] -> OptMResult a)
-    deriving (Functor)
-
-instance Applicative CmmOptM where
-    pure x = CmmOptM $ \_ imports -> OptMResult x imports
-    (<*>) = ap
-
-instance Monad CmmOptM where
-  (CmmOptM f) >>= g =
-    CmmOptM $ \config imports0 ->
-                case f config imports0 of
-                  OptMResult x imports1 ->
-                    case g x of
-                      CmmOptM g' -> g' config imports1
-
-instance CmmMakeDynamicReferenceM CmmOptM where
-    addImport = addImportCmmOpt
-
-addImportCmmOpt :: CLabel -> CmmOptM ()
-addImportCmmOpt lbl = CmmOptM $ \_ imports -> OptMResult () (lbl:imports)
-
-getCmmOptConfig :: CmmOptM NCGConfig
-getCmmOptConfig = CmmOptM $ \config imports -> OptMResult config imports
-
-runCmmOpt :: NCGConfig -> CmmOptM a -> (a, [CLabel])
-runCmmOpt config (CmmOptM f) =
-  case f config [] of
-    OptMResult result imports -> (result, imports)
-
-cmmBlockConFold :: CmmBlock -> CmmOptM CmmBlock
-cmmBlockConFold block = do
-  let (entry, middle, last) = blockSplit block
-      stmts = blockToList middle
-  stmts' <- mapM cmmStmtConFold stmts
-  last' <- cmmStmtConFold last
-  return $ blockJoin entry (blockFromList stmts') last'
-
-- This does three optimizations, but they're very quick to check, so we don't
-- bother turning them off even when the Hoopl code is active.  Since
-- this is on the old Cmm representation, we can't reuse the code either:
--  * reg = reg      --> nop
--  * if 0 then jump --> nop
--  * if 1 then jump --> jump
-- We might be tempted to skip this step entirely of not Opt_PIC, but
-- there is some PowerPC code for the non-PIC case, which would also
-- have to be separated.
-cmmStmtConFold :: CmmNode e x -> CmmOptM (CmmNode e x)
-cmmStmtConFold stmt
-   = case stmt of
-        CmmAssign reg src
-           -> do src' <- cmmExprConFold DataReference src
-                 return $ case src' of
-                   CmmReg reg' | reg == reg' -> CmmComment (fsLit "nop")
-                   new_src -> CmmAssign reg new_src
-
-        CmmStore addr src align
-           -> do addr' <- cmmExprConFold DataReference addr
-                 src'  <- cmmExprConFold DataReference src
-                 return $ CmmStore addr' src' align
-
-        CmmCall { cml_target = addr }
-           -> do addr' <- cmmExprConFold JumpReference addr
-                 return $ stmt { cml_target = addr' }
-
-        CmmUnsafeForeignCall target regs args
-           -> do target' <- case target of
-                              ForeignTarget e conv -> do
-                                e' <- cmmExprConFold CallReference e
-                                return $ ForeignTarget e' conv
-                              PrimTarget _ ->
-                                return target
-                 args' <- mapM (cmmExprConFold DataReference) args
-                 return $ CmmUnsafeForeignCall target' regs args'
-
-        CmmCondBranch test true false likely
-           -> do test' <- cmmExprConFold DataReference test
-                 return $ case test' of
-                   CmmLit (CmmInt 0 _) -> CmmBranch false
-                   CmmLit (CmmInt _ _) -> CmmBranch true
-                   _other -> CmmCondBranch test' true false likely
-
-        CmmSwitch expr ids
-           -> do expr' <- cmmExprConFold DataReference expr
-                 return $ CmmSwitch expr' ids
-
-        other
-           -> return other
-
-cmmExprConFold :: ReferenceKind -> CmmExpr -> CmmOptM CmmExpr
-cmmExprConFold referenceKind expr = do
-    config <- getCmmOptConfig
-
-    let expr' = if not (ncgDoConstantFolding config)
-                    then expr
-                    else cmmExprCon config expr
-
-    cmmExprNative referenceKind expr'
-
-cmmExprCon :: NCGConfig -> CmmExpr -> CmmExpr
-cmmExprCon config (CmmLoad addr rep align) = CmmLoad (cmmExprCon config addr) rep align
-cmmExprCon config (CmmMachOp mop args)
-    = cmmMachOpFold (ncgPlatform config) mop (map (cmmExprCon config) args)
-cmmExprCon _ other = other
-
-- handles both PIC and non-PIC cases... a very strange mixture
-- of things to do.
-cmmExprNative :: ReferenceKind -> CmmExpr -> CmmOptM CmmExpr
-cmmExprNative referenceKind expr = do
-     config <- getCmmOptConfig
-     let platform = ncgPlatform config
-         arch = platformArch platform
-     case expr of
-        CmmLoad addr rep align
-          -> do addr' <- cmmExprNative DataReference addr
-                return $ CmmLoad addr' rep align
-
-        CmmMachOp mop args
-          -> do args' <- mapM (cmmExprNative DataReference) args
-                return $ CmmMachOp mop args'
-
-        CmmLit (CmmBlock id)
-          -> cmmExprNative referenceKind (CmmLit (CmmLabel (infoTblLbl id)))
-          -- we must convert block Ids to CLabels here, because we
-          -- might have to do the PIC transformation.  Hence we must
-          -- not modify BlockIds beyond this point.
-
-        CmmLit (CmmLabel lbl)
-          -> cmmMakeDynamicReference config referenceKind lbl
-        CmmLit (CmmLabelOff lbl off)
-          -> do dynRef <- cmmMakeDynamicReference config referenceKind lbl
-                -- need to optimize here, since it's late
-                return $ cmmMachOpFold platform (MO_Add (wordWidth platform)) [
-                    dynRef,
-                    (CmmLit $ CmmInt (fromIntegral off) (wordWidth platform))
-                  ]
-
-        -- On powerpc (non-PIC), it's easier to jump directly to a label than
-        -- to use the register table, so we replace these registers
-        -- with the corresponding labels:
-        CmmReg (CmmGlobal (GlobalRegUse EagerBlackholeInfo _))
-          | arch == ArchPPC && not (ncgPIC config)
-          -> cmmExprNative referenceKind $
-             CmmLit (CmmLabel (mkCmmCodeLabel rtsUnitId (fsLit "__stg_EAGER_BLACKHOLE_info")))
-        CmmReg (CmmGlobal (GlobalRegUse GCEnter1 _))
-          | arch == ArchPPC && not (ncgPIC config)
-          -> cmmExprNative referenceKind $
-             CmmLit (CmmLabel (mkCmmCodeLabel rtsUnitId (fsLit "__stg_gc_enter_1")))
-        CmmReg (CmmGlobal (GlobalRegUse GCFun _))
-          | arch == ArchPPC && not (ncgPIC config)
-          -> cmmExprNative referenceKind $
-             CmmLit (CmmLabel (mkCmmCodeLabel rtsUnitId (fsLit "__stg_gc_fun")))
-
-        other
-           -> return other
+  = UDSM $ \initial_us -> do
+      { let initial_st           = mkNatM_State initial_us 0 config
+                                                fileIds dbgMap cmm_cfg
+            (new_tops, final_st) = initNat initial_st (cmmTopCodeGen cmm_top)
+            final_delta          = natm_delta final_st
+            final_imports        = natm_imports final_st
+            final_cfg            = natm_cfg final_st
+      ; if   final_delta == 0
+        then DUniqResult
+              (new_tops, final_imports
+                , natm_fileid final_st, final_cfg) (natm_us final_st)
+        else DUniqResult (pprPanic "genMachCode: nonzero final delta" (int final_delta)) undefined
+      }
--- a/compiler/GHC/CmmToAsm/AArch64.hs
+++ b/compiler/GHC/CmmToAsm/AArch64.hs
@@ -44,16 +44,17 @@ ncgAArch64 config
 -- | Instruction instance for aarch64
 instance Instruction AArch64.Instr where
        regUsageOfInstr         = AArch64.regUsageOfInstr
-        patchRegsOfInstr        = AArch64.patchRegsOfInstr
+        patchRegsOfInstr _      = AArch64.patchRegsOfInstr
        isJumpishInstr          = AArch64.isJumpishInstr
        jumpDestsOfInstr        = AArch64.jumpDestsOfInstr
+        canFallthroughTo        = AArch64.canFallthroughTo
        patchJumpInstr          = AArch64.patchJumpInstr
        mkSpillInstr            = AArch64.mkSpillInstr
        mkLoadInstr             = AArch64.mkLoadInstr
        takeDeltaInstr          = AArch64.takeDeltaInstr
        isMetaInstr             = AArch64.isMetaInstr
        mkRegRegMoveInstr _     = AArch64.mkRegRegMoveInstr
-        takeRegRegMoveInstr     = AArch64.takeRegRegMoveInstr
+        takeRegRegMoveInstr _   = AArch64.takeRegRegMoveInstr
        mkJumpInstr             = AArch64.mkJumpInstr
        mkStackAllocInstr       = AArch64.mkStackAllocInstr
        mkStackDeallocInstr     = AArch64.mkStackDeallocInstr

--- a/compiler/GHC/CmmToAsm/AArch64/CodeGen.hs
+++ b/compiler/GHC/CmmToAsm/AArch64/CodeGen.hs
-{-# language GADTs #-}
-{-# LANGUAGE TupleSections #-}
-{-# LANGUAGE BangPatterns #-}
-{-# LANGUAGE BinaryLiterals #-}
+{-# language GADTs, LambdaCase #-}
 {-# LANGUAGE OverloadedStrings #-}
-{-# LANGUAGE NumericUnderscores #-}
 module GHC.CmmToAsm.AArch64.CodeGen (
      cmmTopCodeGen
    , generateJumpTableForInstr
@@ -27,7 +23,7 @@ import GHC.Cmm.DebugBlock
 import GHC.CmmToAsm.Monad
   ( NatM, getNewRegNat
   , getPicBaseMaybeNat, getPlatform, getConfig
-   , getDebugBlock, getFileId
+   , getDebugBlock, getFileId, getNewLabelNat
   )
 -- import GHC.CmmToAsm.Instr
 import GHC.CmmToAsm.PIC
@@ -48,14 +44,13 @@ import GHC.Cmm.Dataflow.Label
 import GHC.Cmm.Dataflow.Graph
 import GHC.Types.Tickish ( GenTickish(..) )
 import GHC.Types.SrcLoc  ( srcSpanFile, srcSpanStartLine, srcSpanStartCol )
-import GHC.Types.Unique.Supply
+import GHC.Types.Unique.DSM

 -- The rest:
 import GHC.Data.OrdList
 import GHC.Utils.Outputable

-import Control.Monad    ( mapAndUnzipM, foldM )
-import Data.Maybe
+import Control.Monad    ( mapAndUnzipM )
 import GHC.Float

 import GHC.Types.Basic
@@ -66,8 +61,6 @@ import GHC.Utils.Panic
 import GHC.Utils.Constants (debugIsOn)
 import GHC.Utils.Monad (mapAccumLM)

-import GHC.Cmm.Dataflow.Collections
-
 -- Note [General layout of an NCG]
 -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 -- @cmmTopCodeGen@ will be our main entry point to code gen.  Here we'll get
@@ -155,15 +148,16 @@ basicBlockCodeGen block = do
            let line = srcSpanStartLine span; col = srcSpanStartCol span
            return $ unitOL $ LOCATION fileId line col (unpackFS name)
    _ -> return nilOL
-  (mid_instrs,mid_bid) <- stmtsToInstrs id stmts
-  (!tail_instrs,_) <- stmtToInstrs mid_bid tail
+  mid_instrs <- stmtsToInstrs stmts
+  (!tail_instrs) <- stmtToInstrs tail
  let instrs = header_comment_instr `appOL` loc_instrs `appOL` mid_instrs `appOL` tail_instrs
  -- TODO: Then x86 backend run @verifyBasicBlock@ here and inserts
  --      unwinding info. See Ticket 19913
  -- code generation may introduce new basic block boundaries, which
  -- are indicated by the NEWBLOCK instruction.  We must split up the
-  -- instruction stream into basic blocks again.  Also, we extract
-  -- LDATAs here too.
+  -- instruction stream into basic blocks again. Also, we may extract
+  -- LDATAs here too (if they are implemented by AArch64 again - See
+  -- PPC how to do that.)
  let
        (top,other_blocks,statics) = foldrOL mkBlocks ([],[],[]) instrs

@@ -174,8 +168,6 @@ mkBlocks :: Instr
          -> ([Instr], [GenBasicBlock Instr], [GenCmmDecl RawCmmStatics h g])
 mkBlocks (NEWBLOCK id) (instrs,blocks,statics)
  = ([], BasicBlock id instrs : blocks, statics)
-mkBlocks (LDATA sec dat) (instrs,blocks,statics)
-  = (instrs, blocks, CmmData sec dat:statics)
 mkBlocks instr (instrs,blocks,statics)
  = (instr:instrs, blocks, statics)
 -- -----------------------------------------------------------------------------
@@ -196,7 +188,7 @@ ann doc instr {- debugIsOn -} = ANN doc instr
 -- going back to the exact CmmExpr representation can be laborious and adds
 -- indirections to find the matches that lead to the assembly.
 --
-- An improvement oculd be to have
+-- An improvement could be to have
 --
 --    (pprExpr genericPlatform e) <> parens (text. show e)
 --
@@ -217,80 +209,106 @@ annExpr e instr {- debugIsOn -} = ANN (text . show $ e) instr
 -- -----------------------------------------------------------------------------
 -- Generating a table-branch

-- TODO jump tables would be a lot faster, but we'll use bare bones for now.
-- this is usually done by sticking the jump table ids into an instruction
-- and then have the @generateJumpTableForInstr@ callback produce the jump
-- table as a static.
--
-- See Ticket 19912
--
-- data SwitchTargets =
--    SwitchTargets
--        Bool                       -- Signed values
--        (Integer, Integer)         -- Range
--        (Maybe Label)              -- Default value
--        (M.Map Integer Label)      -- The branches
+-- | Generate jump to jump table target
 --
-- Non Jumptable plan:
-- xE <- expr
+-- The index into the jump table is calulated by evaluating @expr@. The
+-- corresponding table entry contains the relative address to jump to (relative
+-- to the jump table's first entry / the table's own label).
+genSwitch :: NCGConfig -> CmmExpr -> SwitchTargets -> NatM InstrBlock
+genSwitch config expr targets = do
+  (reg, fmt1, e_code) <- getSomeReg indexExpr
+  let fmt = II64
+  targetReg <- getNewRegNat fmt
+  lbl <- getNewLabelNat
+  dynRef <- cmmMakeDynamicReference config DataReference lbl
+  (tableReg, fmt2, t_code) <- getSomeReg dynRef
+  let code =
+        toOL
+          [ COMMENT (text "indexExpr" <+> (text . show) indexExpr),
+            COMMENT (text "dynRef" <+> (text . show) dynRef)
+          ]
+          `appOL` e_code
+          `appOL` t_code
+          `appOL` toOL
+            [ COMMENT (ftext "Jump table for switch"),
+              -- index to offset into the table (relative to tableReg)
+              annExpr expr (LSL (OpReg (formatToWidth fmt1) reg) (OpReg (formatToWidth fmt1) reg) (OpImm (ImmInt 3))),
+              -- calculate table entry address
+              ADD (OpReg W64 targetReg) (OpReg (formatToWidth fmt1) reg) (OpReg (formatToWidth fmt2) tableReg),
+              -- load table entry (relative offset from tableReg (first entry) to target label)
+              LDR II64 (OpReg W64 targetReg) (OpAddr (AddrRegImm targetReg (ImmInt 0))),
+              -- calculate absolute address of the target label
+              ADD (OpReg W64 targetReg) (OpReg W64 targetReg) (OpReg W64 tableReg),
+              -- prepare jump to target label
+              J_TBL ids (Just lbl) targetReg
+            ]
+  return code
+  where
+    -- See Note [Sub-word subtlety during jump-table indexing] in
+    -- GHC.CmmToAsm.X86.CodeGen for why we must first offset, then widen.
+    indexExpr0 = cmmOffset platform expr offset
+    -- We widen to a native-width register to sanitize the high bits
+    indexExpr =
+      CmmMachOp
+        (MO_UU_Conv expr_w (platformWordWidth platform))
+        [indexExpr0]
+    expr_w = cmmExprWidth platform expr
+    (offset, ids) = switchTargetsToTable targets
+    platform = ncgPlatform config
+
+-- | Generate jump table data (if required)
 --
-genSwitch :: CmmExpr -> SwitchTargets -> NatM InstrBlock
-genSwitch expr targets = do -- pprPanic "genSwitch" (ppr expr)
-  (reg, format, code) <- getSomeReg expr
-  let w = formatToWidth format
-  let mkbranch acc (key, bid) = do
-        (keyReg, _format, code) <- getSomeReg (CmmLit (CmmInt key w))
-        return $ code `appOL`
-                 toOL [ CMP (OpReg w reg) (OpReg w keyReg)
-                      , BCOND EQ (TBlock bid)
-                      ] `appOL` acc
-      def_code = case switchTargetsDefault targets of
-        Just bid -> unitOL (B (TBlock bid))
-        Nothing  -> nilOL
-
-  switch_code <- foldM mkbranch nilOL (switchTargetsCases targets)
-  return $ code `appOL` switch_code `appOL` def_code
-
-- We don't do jump tables for now, see Ticket 19912
-generateJumpTableForInstr :: NCGConfig -> Instr
-  -> Maybe (NatCmmDecl RawCmmStatics Instr)
+-- The idea is to emit one table entry per case. The entry is the relative
+-- address of the block to jump to (relative to the table's first entry /
+-- table's own label.) The calculation itself is done by the linker.
+generateJumpTableForInstr ::
+  NCGConfig ->
+  Instr ->
+  Maybe (NatCmmDecl RawCmmStatics Instr)
+generateJumpTableForInstr config (J_TBL ids (Just lbl) _) =
+  let jumpTable =
+        map jumpTableEntryRel ids
+        where
+          jumpTableEntryRel Nothing =
+            CmmStaticLit (CmmInt 0 (ncgWordWidth config))
+          jumpTableEntryRel (Just blockid) =
+            CmmStaticLit
+              ( CmmLabelDiffOff
+                  blockLabel
+                  lbl
+                  0
+                  (ncgWordWidth config)
+              )
+            where
+              blockLabel = blockLbl blockid
+   in Just (CmmData (Section ReadOnlyData lbl) (CmmStaticsRaw lbl jumpTable))
 generateJumpTableForInstr _ _ = Nothing

 -- -----------------------------------------------------------------------------
 -- Top-level of the instruction selector

-- See Note [Keeping track of the current block] for why
-- we pass the BlockId.
-stmtsToInstrs :: BlockId -- ^ Basic block these statement will start to be placed in.
-              -> [CmmNode O O] -- ^ Cmm Statement
-              -> NatM (InstrBlock, BlockId) -- ^ Resulting instruction
-stmtsToInstrs bid stmts =
-    go bid stmts nilOL
+stmtsToInstrs :: [CmmNode O O] -- ^ Cmm Statements
+              -> NatM InstrBlock -- ^ Resulting instructions
+stmtsToInstrs stmts =
+    go stmts nilOL
  where
-    go bid  []        instrs = return (instrs,bid)
-    go bid (s:stmts)  instrs = do
-      (instrs',bid') <- stmtToInstrs bid s
-      -- If the statement introduced a new block, we use that one
-      let !newBid = fromMaybe bid bid'
-      go newBid stmts (instrs `appOL` instrs')
-
-- | `bid` refers to the current block and is used to update the CFG
--   if new blocks are inserted in the control flow.
-- See Note [Keeping track of the current block] for more details.
-stmtToInstrs :: BlockId -- ^ Basic block this statement will start to be placed in.
-             -> CmmNode e x
-             -> NatM (InstrBlock, Maybe BlockId)
-             -- ^ Instructions, and bid of new block if successive
-             -- statements are placed in a different basic block.
-stmtToInstrs bid stmt = do
+    go []        instrs = return instrs
+    go (s:stmts) instrs = do
+      instrs' <- stmtToInstrs s
+      go stmts (instrs `appOL` instrs')
+
+stmtToInstrs :: CmmNode e x -- ^ Cmm Statement
+             -> NatM InstrBlock -- ^ Resulting Instructions
+stmtToInstrs stmt = do
  -- traceM $ "-- -------------------------- stmtToInstrs -------------------------- --\n"
  --     ++ showSDocUnsafe (ppr stmt)
+  config <- getConfig
  platform <- getPlatform
  case stmt of
    CmmUnsafeForeignCall target result_regs args
-       -> genCCall target result_regs args bid
+       -> genCCall target result_regs args

-    _ -> (,Nothing) <$> case stmt of
+    _ -> case stmt of
      CmmComment s   -> return (unitOL (COMMENT (ftext s)))
      CmmTick {}     -> return nilOL

@@ -311,9 +329,9 @@ stmtToInstrs bid stmt = do
      --We try to arrange blocks such that the likely branch is the fallthrough
      --in GHC.Cmm.ContFlowOpt. So we can assume the condition is likely false here.
      CmmCondBranch arg true false _prediction ->
-          genCondBranch bid true false arg
+          genCondBranch true false arg

-      CmmSwitch arg ids -> genSwitch arg ids
+      CmmSwitch arg ids -> genSwitch config arg ids

      CmmCall { cml_target = arg } -> genJump arg

@@ -358,12 +376,6 @@ getRegisterReg platform (CmmGlobal reg@(GlobalRegUse mid _))
        -- ones which map to a real machine register on this
        -- platform.  Hence if it's not mapped to a registers something
        -- went wrong earlier in the pipeline.
-- | Convert a BlockId to some CmmStatic data
-- TODO: Add JumpTable Logic, see Ticket 19912
-- jumpTableEntry :: NCGConfig -> Maybe BlockId -> CmmStatic
-- jumpTableEntry config Nothing   = CmmStaticLit (CmmInt 0 (ncgWordWidth config))
-- jumpTableEntry _ (Just blockid) = CmmStaticLit (CmmLabel blockLabel)
--     where blockLabel = blockLbl blockid

 -- -----------------------------------------------------------------------------
 -- General things for putting together code sequences
@@ -435,7 +447,7 @@ getMovWideImm n w

 -- | Arithmetic(immediate)
 --  Allows for 12bit immediates which can be shifted by 0 or 12 bits.
-- Used with ADD, ADDS, SUB, SUBS, CMP, CMN
+-- Used with ADD, ADDS, SUB, SUBS, CMP
 -- See Note [Aarch64 immediates]
 getArithImm :: Integer -> Width -> Maybe Operand
 getArithImm n w
@@ -460,7 +472,7 @@ getArithImm n w

 -- |  Logical (immediate)
 -- Allows encoding of some repeated bitpatterns
-- Used with AND, ANDS, EOR, ORR, TST
+-- Used with AND, EOR, ORR
 -- and their aliases which includes at least MOV (bitmask immediate)
 -- See Note [Aarch64 immediates]
 getBitmaskImm :: Integer -> Width -> Maybe Operand
@@ -490,7 +502,7 @@ isOffsetImm off w

 -- TODO OPT: we might be able give getRegister
 --          a hint, what kind of register we want.
-getFloatReg :: HasCallStack => CmmExpr -> NatM (Reg, Format, InstrBlock)
+getFloatReg :: HasDebugCallStack => CmmExpr -> NatM (Reg, Format, InstrBlock)
 getFloatReg expr = do
  r <- getRegister expr
  case r of
@@ -744,6 +756,12 @@ getRegister' config plat expr

    -- for MachOps, see GHC.Cmm.MachOp
    -- For CmmMachOp, see GHC.Cmm.Expr
+
+    -- Handle MO_RelaxedRead as a normal CmmLoad, to allow
+    -- non-trivial addressing modes to be used.
+    CmmMachOp (MO_RelaxedRead w) [e] ->
+      getRegister (CmmLoad e (cmmBits w) NaturallyAligned)
+
    CmmMachOp op [e] -> do
      (reg, _format, code) <- getSomeReg e
      case op of
@@ -756,8 +774,8 @@ getRegister' config plat expr
        MO_S_Neg w -> negate code w reg
        MO_F_Neg w -> return $ Any (floatFormat w) (\dst -> code `snocOL` NEG (OpReg w dst) (OpReg w reg))

-        MO_SF_Conv from to -> return $ Any (floatFormat to) (\dst -> code `snocOL` SCVTF (OpReg to dst) (OpReg from reg))  -- (Signed ConVerT Float)
-        MO_FS_Conv from to -> return $ Any (intFormat to) (\dst -> code `snocOL` FCVTZS (OpReg to dst) (OpReg from reg)) -- (float convert (-> zero) signed)
+        MO_SF_Round    from to -> return $ Any (floatFormat to) (\dst -> code `snocOL` SCVTF (OpReg to dst) (OpReg from reg))  -- (Signed ConVerT Float)
+        MO_FS_Truncate from to -> return $ Any (intFormat to) (\dst -> code `snocOL` FCVTZS (OpReg to dst) (OpReg from reg)) -- (float convert (-> zero) signed)

        -- TODO this is very hacky
        -- Note, UBFM and SBFM expect source and target register to be of the same size, so we'll use @max from to@
@@ -765,12 +783,85 @@ getRegister' config plat expr
        MO_UU_Conv from to -> return $ Any (intFormat to) (\dst -> code `snocOL` UBFM (OpReg (max from to) dst) (OpReg (max from to) reg) (OpImm (ImmInt 0)) (toImm (min from to)))
        MO_SS_Conv from to -> ss_conv from to reg code
        MO_FF_Conv from to -> return $ Any (floatFormat to) (\dst -> code `snocOL` FCVT (OpReg to dst) (OpReg from reg))
+        MO_WF_Bitcast w    -> return $ Any (floatFormat w)  (\dst -> code `snocOL` FMOV (OpReg w dst) (OpReg w reg))
+        MO_FW_Bitcast w    -> return $ Any (intFormat w)    (\dst -> code `snocOL` FMOV (OpReg w dst) (OpReg w reg))

        -- Conversions
        MO_XX_Conv _from to -> swizzleRegisterRep (intFormat to) <$> getRegister e

-        _ -> pprPanic "getRegister' (monadic CmmMachOp):" (pdoc plat expr)
+        MO_Eq {} -> notUnary
+        MO_Ne {} -> notUnary
+        MO_Mul {} -> notUnary
+        MO_S_MulMayOflo {} -> notUnary
+        MO_S_Quot {} -> notUnary
+        MO_S_Rem {} -> notUnary
+        MO_U_Quot {} -> notUnary
+        MO_U_Rem {} -> notUnary
+        MO_S_Ge {} -> notUnary
+        MO_S_Le {} -> notUnary
+        MO_S_Gt {} -> notUnary
+        MO_S_Lt {} -> notUnary
+        MO_U_Ge {} -> notUnary
+        MO_U_Le {} -> notUnary
+        MO_U_Gt {} -> notUnary
+        MO_U_Lt {} -> notUnary
+        MO_F_Add {} -> notUnary
+        MO_F_Sub {} -> notUnary
+        MO_F_Mul {} -> notUnary
+        MO_F_Quot {} -> notUnary
+        MO_FMA {} -> notUnary
+        MO_F_Eq {} -> notUnary
+        MO_F_Ne {} -> notUnary
+        MO_F_Ge {} -> notUnary
+        MO_F_Le {} -> notUnary
+        MO_F_Gt {} -> notUnary
+        MO_F_Lt {} -> notUnary
+        MO_And {} -> notUnary
+        MO_Or {} -> notUnary
+        MO_Xor {} -> notUnary
+        MO_Shl {} -> notUnary
+        MO_U_Shr {} -> notUnary
+        MO_S_Shr {} -> notUnary
+        MO_V_Insert {} -> notUnary
+        MO_V_Extract {} -> notUnary
+        MO_V_Add {} -> notUnary
+        MO_V_Sub {} -> notUnary
+        MO_V_Mul {} -> notUnary
+        MO_VS_Quot {} -> notUnary
+        MO_VS_Rem {} -> notUnary
+        MO_VS_Neg {} -> notUnary
+        MO_VU_Quot {} -> notUnary
+        MO_VU_Rem {} -> notUnary
+        MO_V_Shuffle {} -> notUnary
+        MO_VF_Shuffle  {} -> notUnary
+        MO_VF_Insert {} -> notUnary
+        MO_VF_Extract {} -> notUnary
+        MO_VF_Add {} -> notUnary
+        MO_VF_Sub {} -> notUnary
+        MO_VF_Mul {} -> notUnary
+        MO_VF_Quot {} -> notUnary
+        MO_Add {} -> notUnary
+        MO_Sub {} -> notUnary
+
+        MO_F_Min {} -> notUnary
+        MO_F_Max {} -> notUnary
+        MO_VU_Min {} -> notUnary
+        MO_VU_Max {} -> notUnary
+        MO_VS_Min {} -> notUnary
+        MO_VS_Max {} -> notUnary
+        MO_VF_Min {} -> notUnary
+        MO_VF_Max {} -> notUnary
+
+        MO_AlignmentCheck {} ->
+          pprPanic "getRegister' (monadic CmmMachOp):" (pdoc plat expr)
+
+        MO_V_Broadcast {} -> vectorsNeedLlvm
+        MO_VF_Broadcast {} -> vectorsNeedLlvm
+        MO_VF_Neg {} -> vectorsNeedLlvm
      where
+        notUnary = pprPanic "getRegister' (non-unary CmmMachOp with 1 argument):" (pdoc plat expr)
+        vectorsNeedLlvm =
+            sorry "SIMD operations on AArch64 currently require the LLVM backend"
        toImm W8 =  (OpImm (ImmInt 7))
        toImm W16 = (OpImm (ImmInt 15))
        toImm W32 = (OpImm (ImmInt 31))
@@ -1075,6 +1166,8 @@ getRegister' config plat expr
        MO_F_Sub w   -> floatOp w (\d x y -> unitOL $ SUB d x y)
        MO_F_Mul w   -> floatOp w (\d x y -> unitOL $ MUL d x y)
        MO_F_Quot w  -> floatOp w (\d x y -> unitOL $ SDIV d x y)
+        MO_F_Min w   -> floatOp w (\d x y -> unitOL $ FMIN d x y)
+        MO_F_Max w   -> floatOp w (\d x y -> unitOL $ FMAX d x y)

        -- Floating point comparison
        MO_F_Eq w    -> floatCond w (\d x y -> toOL [ CMP x y, CSET d EQ ])
@@ -1098,10 +1191,56 @@ getRegister' config plat expr
        MO_U_Shr w -> intOp False w (\d x y -> unitOL $ LSR d x y)
        MO_S_Shr w -> intOp True  w (\d x y -> unitOL $ ASR d x y)

-        -- TODO
-
-        op -> pprPanic "getRegister' (unhandled dyadic CmmMachOp): " $
-                (pprMachOp op) <+> text "in" <+> (pdoc plat expr)
+        -- Non-dyadic MachOp with 2 arguments
+        MO_S_Neg {} -> notDyadic
+        MO_F_Neg {} -> notDyadic
+        MO_FMA {} -> notDyadic
+        MO_Not {} -> notDyadic
+        MO_SF_Round {} -> notDyadic
+        MO_FS_Truncate {} -> notDyadic
+        MO_SS_Conv {} -> notDyadic
+        MO_UU_Conv {} -> notDyadic
+        MO_XX_Conv {} -> notDyadic
+        MO_FF_Conv {} -> notDyadic
+        MO_WF_Bitcast {} -> notDyadic
+        MO_FW_Bitcast {} -> notDyadic
+        MO_V_Broadcast {} -> notDyadic
+        MO_VF_Broadcast {} -> notDyadic
+        MO_V_Insert {} -> notDyadic
+        MO_VF_Insert {} -> notDyadic
+        MO_AlignmentCheck {} -> notDyadic
+        MO_RelaxedRead {} -> notDyadic
+
+        -- Vector operations: currently unsupported in the AArch64 NCG.
+        MO_V_Extract {} -> vectorsNeedLlvm
+        MO_V_Add {} -> vectorsNeedLlvm
+        MO_V_Sub {} -> vectorsNeedLlvm
+        MO_V_Mul {} -> vectorsNeedLlvm
+        MO_VS_Quot {} -> vectorsNeedLlvm
+        MO_VS_Rem {} -> vectorsNeedLlvm
+        MO_VS_Neg {} -> vectorsNeedLlvm
+        MO_VU_Quot {} -> vectorsNeedLlvm
+        MO_VU_Rem {} -> vectorsNeedLlvm
+        MO_VF_Extract {} -> vectorsNeedLlvm
+        MO_VF_Add {} -> vectorsNeedLlvm
+        MO_VF_Sub {} -> vectorsNeedLlvm
+        MO_VF_Neg {} -> vectorsNeedLlvm
+        MO_VF_Mul {} -> vectorsNeedLlvm
+        MO_VF_Quot {} -> vectorsNeedLlvm
+        MO_V_Shuffle {} -> vectorsNeedLlvm
+        MO_VF_Shuffle {} -> vectorsNeedLlvm
+        MO_VU_Min {} -> vectorsNeedLlvm
+        MO_VU_Max {} -> vectorsNeedLlvm
+        MO_VS_Min {} -> vectorsNeedLlvm
+        MO_VS_Max {} -> vectorsNeedLlvm
+        MO_VF_Min {} -> vectorsNeedLlvm
+        MO_VF_Max {} -> vectorsNeedLlvm
+        where
+          notDyadic =
+            pprPanic "getRegister' (non-dyadic CmmMachOp with 2 arguments): " $
+              (pprMachOp op) <+> text "in" <+> (pdoc plat expr)
+          vectorsNeedLlvm =
+            sorry "SIMD operations on AArch64 currently require the LLVM backend"

    -- Generic ternary case.
    CmmMachOp op [x, y, z] ->
@@ -1115,16 +1254,25 @@ getRegister' config plat expr
        -- x86 fnmadd - x * y + z <=> AArch64 fmsub : d = - r1 * r2 + r3
        -- x86 fnmsub - x * y - z <=> AArch64 fnmadd: d = - r1 * r2 - r3

-        MO_FMA var w -> case var of
-          FMAdd  -> float3Op w (\d n m a -> unitOL $ FMA FMAdd  d n m a)
-          FMSub  -> float3Op w (\d n m a -> unitOL $ FMA FNMSub d n m a)
-          FNMAdd -> float3Op w (\d n m a -> unitOL $ FMA FMSub  d n m a)
-          FNMSub -> float3Op w (\d n m a -> unitOL $ FMA FNMAdd d n m a)
+        MO_FMA var l w
+          | l == 1
+          -> case var of
+            FMAdd  -> float3Op w (\d n m a -> unitOL $ FMA FMAdd  d n m a)
+            FMSub  -> float3Op w (\d n m a -> unitOL $ FMA FNMSub d n m a)
+            FNMAdd -> float3Op w (\d n m a -> unitOL $ FMA FMSub  d n m a)
+            FNMSub -> float3Op w (\d n m a -> unitOL $ FMA FNMAdd d n m a)
+          | otherwise
+          -> vectorsNeedLlvm
+
+        MO_V_Insert {} -> vectorsNeedLlvm
+        MO_VF_Insert {} -> vectorsNeedLlvm

        _ -> pprPanic "getRegister' (unhandled ternary CmmMachOp): " $
                (pprMachOp op) <+> text "in" <+> (pdoc plat expr)

      where
+          vectorsNeedLlvm =
+            sorry "SIMD operations on AArch64 currently require the LLVM backend"
          float3Op w op = do
            (reg_fx, format_x, code_fx) <- getFloatReg x
            (reg_fy, format_y, code_fy) <- getFloatReg y
@@ -1439,7 +1587,7 @@ genCondJump bid expr = do
      _ -> pprPanic "AArch64.genCondJump: " (text $ show expr)

 -- A conditional jump with at least +/-128M jump range
-genCondFarJump :: MonadUnique m => Cond -> Target -> m InstrBlock
+genCondFarJump :: MonadGetUnique m => Cond -> Target -> m InstrBlock
 genCondFarJump cond far_target = do
  skip_lbl_id <- newBlockId
  jmp_lbl_id <- newBlockId
@@ -1455,14 +1603,12 @@ genCondFarJump cond far_target = do
                , B far_target
                , NEWBLOCK skip_lbl_id]

-genCondBranch
-    :: BlockId      -- the source of the jump
-    -> BlockId      -- the true branch target
+genCondBranch :: BlockId      -- the true branch target
    -> BlockId      -- the false branch target
    -> CmmExpr      -- the condition on which to branch
    -> NatM InstrBlock -- Instructions

-genCondBranch _ true false expr = do
+genCondBranch true false expr = do
  b1 <- genCondJump true expr
  b2 <- genBranch false
  return (b1 `appOL` b2)
@@ -1548,16 +1694,15 @@ genCCall
    :: ForeignTarget      -- function to call
    -> [CmmFormal]        -- where to put the result
    -> [CmmActual]        -- arguments (of mixed type)
-    -> BlockId            -- The block we are in
-    -> NatM (InstrBlock, Maybe BlockId)
+    -> NatM InstrBlock
 -- TODO: Specialize where we can.
 -- Generic impl
-genCCall target dest_regs arg_regs bid = do
+genCCall target dest_regs arg_regs = do
  -- we want to pass arg_regs into allArgRegs
  -- pprTraceM "genCCall target" (ppr target)
  -- pprTraceM "genCCall formal" (ppr dest_regs)
  -- pprTraceM "genCCall actual" (ppr arg_regs)
-
+  platform <- getPlatform
  case target of
    -- The target :: ForeignTarget call can either
    -- be a foreign procedure with an address expr
@@ -1585,7 +1730,6 @@ genCCall target dest_regs arg_regs bid = do
      let (_res_hints, arg_hints) = foreignTargetHints target
          arg_regs'' = zipWith (\(r, f, c) h -> (r,f,h,c)) arg_regs' arg_hints

-      platform <- getPlatform
      let packStack = platformOS platform == OSDarwin

      (stackSpace', passRegs, passArgumentsCode) <- passArguments packStack allGpArgRegs allFpArgRegs arg_regs'' 0 [] nilOL
@@ -1596,7 +1740,7 @@ genCCall target dest_regs arg_regs bid = do
                       then 8 * (stackSpace' `div` 8 + 1)
                       else stackSpace'

-      (returnRegs, readResultsCode)   <- readResults allGpArgRegs allFpArgRegs dest_regs [] nilOL
+      readResultsCode <- readResults allGpArgRegs allFpArgRegs dest_regs [] nilOL

      let moveStackDown 0 = toOL [ PUSH_STACK_FRAME
                                 , DELTA (-16) ]
@@ -1614,17 +1758,301 @@ genCCall target dest_regs arg_regs bid = do
      let code =    call_target_code          -- compute the label (possibly into a register)
            `appOL` moveStackDown (stackSpace `div` 8)
            `appOL` passArgumentsCode         -- put the arguments into x0, ...
-            `appOL` (unitOL $ BL call_target passRegs returnRegs) -- branch and link.
+            `appOL` (unitOL $ BL call_target passRegs) -- branch and link.
            `appOL` readResultsCode           -- parse the results into registers
            `appOL` moveStackUp (stackSpace `div` 8)
-      return (code, Nothing)
+      return code

    PrimTarget MO_F32_Fabs
      | [arg_reg] <- arg_regs, [dest_reg] <- dest_regs ->
        unaryFloatOp W32 (\d x -> unitOL $ FABS d x) arg_reg dest_reg
+      | otherwise -> panic "mal-formed MO_F32_Fabs"
    PrimTarget MO_F64_Fabs
      | [arg_reg] <- arg_regs, [dest_reg] <- dest_regs ->
        unaryFloatOp W64 (\d x -> unitOL $ FABS d x) arg_reg dest_reg
+      | otherwise -> panic "mal-formed MO_F64_Fabs"
+    PrimTarget MO_F32_Sqrt
+      | [arg_reg] <- arg_regs, [dest_reg] <- dest_regs ->
+        unaryFloatOp W32 (\d x -> unitOL $ FSQRT d x) arg_reg dest_reg
+      | otherwise -> panic "mal-formed MO_F32_Sqrt"
+    PrimTarget MO_F64_Sqrt
+      | [arg_reg] <- arg_regs, [dest_reg] <- dest_regs ->
+        unaryFloatOp W64 (\d x -> unitOL $ FSQRT d x) arg_reg dest_reg
+      | otherwise -> panic "mal-formed MO_F64_Sqrt"
+
+
+    PrimTarget (MO_S_Mul2 w)
+          -- Life is easier when we're working with word sized operands,
+          -- we can use SMULH to compute the high 64 bits, and dst_needed
+          -- checks if the high half's bits are all the same as the low half's
+          -- top bit.
+          | w == W64
+          , [src_a, src_b] <- arg_regs
+          -- dst_needed = did the result fit into just the low half
+          , [dst_needed, dst_hi, dst_lo] <- dest_regs
+            ->  do
+              (reg_a, _format_x, code_x) <- getSomeReg src_a
+              (reg_b, _format_y, code_y) <- getSomeReg src_b
+
+              let lo = getRegisterReg platform (CmmLocal dst_lo)
+                  hi = getRegisterReg platform (CmmLocal dst_hi)
+                  nd = getRegisterReg platform (CmmLocal dst_needed)
+              return $
+                  code_x `appOL`
+                  code_y `snocOL`
+                  MUL   (OpReg W64 lo) (OpReg W64 reg_a) (OpReg W64 reg_b) `snocOL`
+                  SMULH (OpReg W64 hi) (OpReg W64 reg_a) (OpReg W64 reg_b) `snocOL`
+                  -- Are all high bits equal to the sign bit of the low word?
+                  -- nd = (hi == ASR(lo,width-1)) ? 1 : 0
+                  CMP   (OpReg W64 hi) (OpRegShift W64 lo SASR (widthInBits w - 1)) `snocOL`
+                  CSET  (OpReg W64 nd) NE
+            -- For sizes < platform width, we can just perform a multiply and shift
+            -- using the normal 64 bit multiply. Calculating the dst_needed value is
+            -- complicated a little by the need to be careful when truncation happens.
+            -- Currently this case can't be generated since
+            -- timesInt2# :: Int# -> Int# -> (# Int#, Int#, Int# #)
+            -- TODO: Should this be removed or would other primops be useful?
+          | w < W64
+          , [src_a, src_b] <- arg_regs
+          , [dst_needed, dst_hi, dst_lo] <- dest_regs
+            ->  do
+              (reg_a', _format_x, code_a) <- getSomeReg src_a
+              (reg_b', _format_y, code_b) <- getSomeReg src_b
+
+              let lo = getRegisterReg platform (CmmLocal dst_lo)
+                  hi = getRegisterReg platform (CmmLocal dst_hi)
+                  nd = getRegisterReg platform (CmmLocal dst_needed)
+                  -- Do everything in a full 64 bit registers
+                  w' = platformWordWidth platform
+
+              (reg_a, code_a') <- signExtendReg w w' reg_a'
+              (reg_b, code_b') <- signExtendReg w w' reg_b'
+
+              return $
+                  code_a  `appOL`
+                  code_b  `appOL`
+                  code_a' `appOL`
+                  code_b' `snocOL`
+                  -- the low 2w' of lo contains the full multiplication;
+                  -- eg: int8 * int8 -> int16 result
+                  -- so lo is in the last w of the register, and hi is in the second w.
+                  SMULL (OpReg w' lo) (OpReg w' reg_a) (OpReg w' reg_b) `snocOL`
+                  -- Make sure we hold onto the sign bits for dst_needed
+                  ASR (OpReg w' hi) (OpReg w' lo)    (OpImm (ImmInt $ widthInBits w)) `appOL`
+                  -- lo can now be truncated so we can get at it's top bit easily.
+                  truncateReg w' w lo `snocOL`
+                  -- Note the use of CMN (compare negative), not CMP: we want to
+                  -- test if the top half is negative one and the top
+                  -- bit of the bottom half is positive one. eg:
+                  -- hi = 0b1111_1111  (actually 64 bits)
+                  -- lo = 0b1010_1111  (-81, so the result didn't need the top half)
+                  -- lo' = ASR(lo,7)   (second reg of SMN)
+                  --     = 0b0000_0001 (theeshift gives us 1 for negative,
+                  --                    and 0 for positive)
+                  -- hi == -lo'?
+                  -- 0b1111_1111 == 0b1111_1111 (yes, top half is just overflow)
+                  -- Another way to think of this is if hi + lo' == 0, which is what
+                  -- CMN really is under the hood.
+                  CMN   (OpReg w' hi) (OpRegShift w' lo SLSR (widthInBits w - 1)) `snocOL`
+                  -- Set dst_needed to 1 if hi and lo' were (negatively) equal
+                  CSET  (OpReg w' nd) EQ `appOL`
+                  -- Finally truncate hi to drop any extraneous sign bits.
+                  truncateReg w' w hi
+          -- Can't handle > 64 bit operands
+          | otherwise -> unsupported (MO_S_Mul2 w)
+    PrimTarget (MO_U_Mul2  w)
+          -- The unsigned case is much simpler than the signed, all we need to
+          -- do is the multiplication straight into the destination registers.
+          | w == W64
+          , [src_a, src_b] <- arg_regs
+          , [dst_hi, dst_lo] <- dest_regs
+            ->  do
+              (reg_a, _format_x, code_x) <- getSomeReg src_a
+              (reg_b, _format_y, code_y) <- getSomeReg src_b
+
+              let lo = getRegisterReg platform (CmmLocal dst_lo)
+                  hi = getRegisterReg platform (CmmLocal dst_hi)
+              return (
+                  code_x `appOL`
+                  code_y `snocOL`
+                  MUL   (OpReg W64 lo) (OpReg W64 reg_a) (OpReg W64 reg_b) `snocOL`
+                  UMULH (OpReg W64 hi) (OpReg W64 reg_a) (OpReg W64 reg_b)
+                  )
+            -- For sizes < platform width, we can just perform a multiply and shift
+            -- Need to be careful to truncate the low half, but the upper half should be
+            -- be ok if the invariant in [Signed arithmetic on AArch64] is maintained.
+            -- Currently this case can't be produced by the compiler since
+            -- timesWord2# :: Word# -> Word# -> (# Word#, Word# #)
+            -- TODO: Remove? Or would the extra primop be useful for avoiding the extra
+            -- steps needed to do this in userland?
+          | w < W64
+          , [src_a, src_b] <- arg_regs
+          , [dst_hi, dst_lo] <- dest_regs
+            ->  do
+              (reg_a, _format_x, code_x) <- getSomeReg src_a
+              (reg_b, _format_y, code_y) <- getSomeReg src_b
+
+              let lo = getRegisterReg platform (CmmLocal dst_lo)
+                  hi = getRegisterReg platform (CmmLocal dst_hi)
+                  w' = opRegWidth w
+              return (
+                  code_x `appOL`
+                  code_y `snocOL`
+                  -- UMULL: Xd = Wa * Wb with 64 bit result
+                  -- W64 inputs should have been caught by case above
+                  UMULL (OpReg W64 lo) (OpReg w' reg_a) (OpReg w' reg_b) `snocOL`
+                  -- Extract and truncate high result
+                  -- hi[w:0] = lo[2w:w]
+                  UBFX (OpReg W64 hi) (OpReg W64 lo)
+                      (OpImm (ImmInt $ widthInBits w)) -- lsb
+                      (OpImm (ImmInt $ widthInBits w)) -- width to extract
+                      `appOL`
+                  truncateReg W64 w lo
+                  )
+          | otherwise -> unsupported (MO_U_Mul2  w)
+    PrimTarget (MO_Clz  w)
+          | w == W64 || w == W32
+          , [src] <- arg_regs
+          , [dst] <- dest_regs
+          -> do
+              (reg_a, _format_x, code_x) <- getSomeReg src
+              let dst_reg = getRegisterReg platform (CmmLocal dst)
+              return (
+                  code_x `snocOL`
+                  CLZ   (OpReg w dst_reg) (OpReg w reg_a)
+                  )
+          | w == W16
+          , [src] <- arg_regs
+          , [dst] <- dest_regs
+          -> do
+              (reg_a, _format_x, code_x) <- getSomeReg src
+              let dst' = getRegisterReg platform (CmmLocal dst)
+                  r n = OpReg W32 n
+                  imm n = OpImm (ImmInt n)
+              {- dst = clz(x << 16 | 0x0000_8000) -}
+              return (
+                  code_x `appOL` toOL
+                    [ LSL (r dst') (r reg_a) (imm 16)
+                    , ORR (r dst') (r dst')  (imm 0x00008000)
+                    , CLZ (r dst') (r dst')
+                    ]
+                  )
+          | w == W8
+          , [src] <- arg_regs
+          , [dst] <- dest_regs
+          -> do
+              (reg_a, _format_x, code_x) <- getSomeReg src
+              let dst' = getRegisterReg platform (CmmLocal dst)
+                  r n = OpReg W32 n
+                  imm n = OpImm (ImmInt n)
+              {- dst = clz(x << 24 | 0x0080_0000) -}
+              return $
+                  code_x `appOL` toOL
+                    [ LSL (r dst') (r reg_a) (imm 24)
+                    , ORR (r dst') (r dst')  (imm 0x00800000)
+                    , CLZ (r dst') (r dst')
+                    ]
+            | otherwise -> unsupported (MO_Clz  w)
+    PrimTarget (MO_Ctz  w)
+          | w == W64 || w == W32
+          , [src] <- arg_regs
+          , [dst] <- dest_regs
+          -> do
+              (reg_a, _format_x, code_x) <- getSomeReg src
+              let dst_reg = getRegisterReg platform (CmmLocal dst)
+              return $
+                  code_x `snocOL`
+                  RBIT (OpReg w dst_reg) (OpReg w reg_a) `snocOL`
+                  CLZ  (OpReg w dst_reg) (OpReg w dst_reg)
+          | w == W16
+          , [src] <- arg_regs
+          , [dst] <- dest_regs
+          -> do
+              (reg_a, _format_x, code_x) <- getSomeReg src
+              let dst' = getRegisterReg platform (CmmLocal dst)
+                  r n = OpReg W32 n
+                  imm n = OpImm (ImmInt n)
+              {- dst = clz(reverseBits(x) | 0x0000_8000) -}
+              return $
+                  code_x `appOL` toOL
+                    [ RBIT (r dst') (r reg_a)
+                    , ORR  (r dst') (r dst') (imm 0x00008000)
+                    , CLZ  (r dst') (r dst')
+                    ]
+          | w == W8
+          , [src] <- arg_regs
+          , [dst] <- dest_regs
+          -> do
+              (reg_a, _format_x, code_x) <- getSomeReg src
+              let dst' = getRegisterReg platform (CmmLocal dst)
+                  r n = OpReg W32 n
+                  imm n = OpImm (ImmInt n)
+              {- dst = clz(reverseBits(x) | 0x0080_0000) -}
+              return $
+                  code_x `appOL` toOL
+                    [ RBIT (r dst') (r reg_a)
+                    , ORR (r dst')  (r dst') (imm 0x00800000)
+                    , CLZ  (r dst')  (r dst')
+                    ]
+            | otherwise -> unsupported (MO_Ctz  w)
+    PrimTarget (MO_BRev  w)
+          | w == W64 || w == W32
+          , [src] <- arg_regs
+          , [dst] <- dest_regs
+          -> do
+              (reg_a, _format_x, code_x) <- getSomeReg src
+              let dst_reg = getRegisterReg platform (CmmLocal dst)
+              return $
+                  code_x `snocOL`
+                  RBIT (OpReg w dst_reg) (OpReg w reg_a)
+          | w == W16
+          , [src] <- arg_regs
+          , [dst] <- dest_regs
+          -> do
+              (reg_a, _format_x, code_x) <- getSomeReg src
+              let dst' = getRegisterReg platform (CmmLocal dst)
+                  r n = OpReg W32 n
+                  imm n = OpImm (ImmInt n)
+              {- dst = reverseBits32(x << 16) -}
+              return $
+                  code_x `appOL` toOL
+                    [ LSL  (r dst') (r reg_a) (imm 16)
+                    , RBIT (r dst') (r dst')
+                    ]
+          | w == W8
+          , [src] <- arg_regs
+          , [dst] <- dest_regs
+          -> do
+              (reg_a, _format_x, code_x) <- getSomeReg src
+              let dst' = getRegisterReg platform (CmmLocal dst)
+                  r n = OpReg W32 n
+                  imm n = OpImm (ImmInt n)
+              {- dst = reverseBits32(x << 24) -}
+              return $
+                  code_x `appOL` toOL
+                    [ LSL  (r dst') (r reg_a) (imm 24)
+                    , RBIT (r dst') (r dst')
+                    ]
+            | otherwise -> unsupported (MO_BRev  w)
+    PrimTarget (MO_BSwap  w)
+          | w == W64 || w == W32
+          , [src] <- arg_regs
+          , [dst] <- dest_regs
+          -> do
+              (reg_a, _format_x, code_x) <- getSomeReg src
+              let dst_reg = getRegisterReg platform (CmmLocal dst)
+              return $ code_x `snocOL` REV (OpReg w dst_reg) (OpReg w reg_a)
+          | w == W16
+          , [src] <- arg_regs
+          , [dst] <- dest_regs
+          -> do
+              (reg_a, _format_x, code_x) <- getSomeReg src
+              let dst' = getRegisterReg platform (CmmLocal dst)
+                  r n = OpReg W32 n
+              -- Swaps the bytes in each 16bit word
+              -- TODO: Expose the 32 & 64 bit version of this?
+              return $ code_x `snocOL` REV16 (r dst') (r reg_a)
+          | otherwise -> unsupported (MO_BSwap w)

    -- or a possibly side-effecting machine operation
    -- mop :: CallishMachOp (see GHC.Cmm.MachOp)
@@ -1654,8 +2082,6 @@ genCCall target dest_regs arg_regs bid = do
        MO_F64_Log1P -> mkCCall "log1p"
        MO_F64_Exp   -> mkCCall "exp"
        MO_F64_ExpM1 -> mkCCall "expm1"
-        MO_F64_Fabs  -> mkCCall "fabs"
-        MO_F64_Sqrt  -> mkCCall "sqrt"

        -- 32 bit float ops
        MO_F32_Pwr   -> mkCCall "powf"
@@ -1676,8 +2102,6 @@ genCCall target dest_regs arg_regs bid = do
        MO_F32_Log1P -> mkCCall "log1pf"
        MO_F32_Exp   -> mkCCall "expf"
        MO_F32_ExpM1 -> mkCCall "expm1f"
-        MO_F32_Fabs  -> mkCCall "fabsf"
-        MO_F32_Sqrt  -> mkCCall "sqrtf"

        -- 64-bit primops
        MO_I64_ToI   -> mkCCall "hs_int64ToInt"
@@ -1715,7 +2139,6 @@ genCCall target dest_regs arg_regs bid = do

        -- Arithmatic
        -- These are not supported on X86, so I doubt they are used much.
-        MO_S_Mul2     _w -> unsupported mop
        MO_S_QuotRem  _w -> unsupported mop
        MO_U_QuotRem  _w -> unsupported mop
        MO_U_QuotRem2 _w -> unsupported mop
@@ -1724,14 +2147,18 @@ genCCall target dest_regs arg_regs bid = do
        MO_SubWordC   _w -> unsupported mop
        MO_AddIntC    _w -> unsupported mop
        MO_SubIntC    _w -> unsupported mop
-        MO_U_Mul2     _w -> unsupported mop

        -- Memory Ordering
-        MO_AcquireFence     ->  return (unitOL DMBISH, Nothing)
-        MO_ReleaseFence     ->  return (unitOL DMBISH, Nothing)
-        MO_Touch            ->  return (nilOL, Nothing) -- Keep variables live (when using interior pointers)
+        -- Set flags according to their C pendants (stdatomic.h):
+        -- atomic_thread_fence(memory_order_acquire); // -> dmb ishld
+        MO_AcquireFence     ->  return . unitOL $ DMBISH DmbLoad
+        -- atomic_thread_fence(memory_order_release); // -> dmb ish
+        MO_ReleaseFence     ->  return . unitOL $ DMBISH DmbLoadStore
+        -- atomic_thread_fence(memory_order_seq_cst); // -> dmb ish
+        MO_SeqCstFence      ->  return . unitOL $ DMBISH DmbLoadStore
+        MO_Touch            ->  return nilOL -- Keep variables live (when using interior pointers)
        -- Prefetch
-        MO_Prefetch_Data _n -> return (nilOL, Nothing) -- Prefetch hint.
+        MO_Prefetch_Data _n -> return nilOL -- Prefetch hint.

        -- Memory copy/set/move/cmp, with alignment for optimization

@@ -1751,10 +2178,6 @@ genCCall target dest_regs arg_regs bid = do
        MO_PopCnt w         -> mkCCall (popCntLabel w)
        MO_Pdep w           -> mkCCall (pdepLabel w)
        MO_Pext w           -> mkCCall (pextLabel w)
-        MO_Clz w            -> mkCCall (clzLabel w)
-        MO_Ctz w            -> mkCCall (ctzLabel w)
-        MO_BSwap w          -> mkCCall (bSwapLabel w)
-        MO_BRev w           -> mkCCall (bRevLabel w)

        -- -- Atomic read-modify-write.
        MO_AtomicRead w ord
@@ -1769,7 +2192,7 @@ genCCall target dest_regs arg_regs bid = do
                  code =
                    code_p `snocOL`
                    instr (intFormat w) (OpReg w dst) (OpAddr $ AddrReg p)
-              return (code, Nothing)
+              return code
          | otherwise -> panic "mal-formed AtomicRead"
        MO_AtomicWrite w ord
          | [p_reg, val_reg] <- arg_regs -> do
@@ -1782,7 +2205,7 @@ genCCall target dest_regs arg_regs bid = do
                    code_p `appOL`
                    code_val `snocOL`
                    instr fmt_val (OpReg w val) (OpAddr $ AddrReg p)
-              return (code, Nothing)
+              return code
          | otherwise -> panic "mal-formed AtomicWrite"
        MO_AtomicRMW w amop -> mkCCall (atomicRMWLabel w amop)
        MO_Cmpxchg w        -> mkCCall (cmpxchgLabel w)
@@ -1795,13 +2218,13 @@ genCCall target dest_regs arg_regs bid = do
    unsupported :: Show a => a -> b
    unsupported mop = panic ("outOfLineCmmOp: " ++ show mop
                          ++ " not supported here")
-    mkCCall :: FastString -> NatM (InstrBlock, Maybe BlockId)
+    mkCCall :: FastString -> NatM InstrBlock
    mkCCall name = do
      config <- getConfig
      target <- cmmMakeDynamicReference config CallReference $
-          mkForeignLabel name Nothing ForeignLabelInThisPackage IsFunction
+          mkForeignLabel name ForeignLabelInThisPackage IsFunction
      let cconv = ForeignConvention CCallConv [NoHint] [NoHint] CmmMayReturn
-      genCCall (ForeignTarget target cconv) dest_regs arg_regs bid
+      genCCall (ForeignTarget target cconv) dest_regs arg_regs

    -- TODO: Optimize using paired stores and loads (STP, LDP). It is
    -- automatically done by the allocator for us. However it's not optimal,
@@ -1943,8 +2366,8 @@ genCCall target dest_regs arg_regs bid = do

    passArguments _ _ _ _ _ _ _ = pprPanic "passArguments" (text "invalid state")

-    readResults :: [Reg] -> [Reg] -> [LocalReg] -> [Reg]-> InstrBlock -> NatM ([Reg], InstrBlock)
-    readResults _ _ [] accumRegs accumCode = return (accumRegs, accumCode)
+    readResults :: [Reg] -> [Reg] -> [LocalReg] -> [Reg]-> InstrBlock -> NatM (InstrBlock)
+    readResults _ _ [] _ accumCode = return accumCode
    readResults [] _ _ _ _ = do
      platform <- getPlatform
      pprPanic "genCCall, out of gp registers when reading results" (pdoc platform target)
@@ -1967,7 +2390,7 @@ genCCall target dest_regs arg_regs bid = do
      (reg_fx, _format_x, code_fx) <- getFloatReg arg_reg
      let dst = getRegisterReg platform (CmmLocal dest_reg)
      let code = code_fx `appOL` op (OpReg w dst) (OpReg w reg_fx)
-      return (code, Nothing)
+      return code

 {- Note [AArch64 far jumps]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -2012,7 +2435,7 @@ data BlockInRange = InRange | NotInRange Target

 -- See Note [AArch64 far jumps]
 makeFarBranches :: Platform -> LabelMap RawCmmStatics -> [NatBasicBlock Instr]
-                -> UniqSM [NatBasicBlock Instr]
+                -> UniqDSM [NatBasicBlock Instr]
 makeFarBranches {- only used when debugging -} _platform statics basic_blocks = do
  -- All offsets/positions are counted in multiples of 4 bytes (the size of AArch64 instructions)
  -- That is an offset of 1 represents a 4-byte/one instruction offset.
@@ -2033,7 +2456,7 @@ makeFarBranches {- only used when debugging -} _platform statics basic_blocks =
    long_bz_jump_size =  4 :: Int

    -- Replace out of range conditional jumps with unconditional jumps.
-    replace_blk :: LabelMap Int -> Int -> GenBasicBlock Instr -> UniqSM (Int, [GenBasicBlock Instr])
+    replace_blk :: LabelMap Int -> Int -> GenBasicBlock Instr -> UniqDSM (Int, [GenBasicBlock Instr])
    replace_blk !m !pos (BasicBlock lbl instrs) = do
      -- Account for a potential info table before the label.
      let !block_pos = pos + infoTblSize_maybe lbl
@@ -2047,12 +2470,14 @@ makeFarBranches {- only used when debugging -} _platform statics basic_blocks =
      let final_blocks = BasicBlock lbl top : split_blocks
      pure (pos', final_blocks)

-    replace_jump :: LabelMap Int -> Int -> Instr -> UniqSM (Int, [Instr])
+    replace_jump :: LabelMap Int -> Int -> Instr -> UniqDSM (Int, [Instr])
    replace_jump !m !pos instr = do
      case instr of
        ANN ann instr -> do
-          (idx,instr':instrs') <- replace_jump m pos instr
-          pure (idx, ANN ann instr':instrs')
+          replace_jump m pos instr >>= \case
+            (idx,instr':instrs') ->
+              pure (idx, ANN ann instr':instrs')
+            (idx,[]) -> pprPanic "replace_jump" (text "empty return list for " <+> ppr idx)
        BCOND cond t
          -> case target_in_range m t pos of
              InRange -> pure (pos+long_bc_jump_size,[instr])

--- a/compiler/GHC/CmmToAsm/AArch64/Instr.hs
+++ b/compiler/GHC/CmmToAsm/AArch64/Instr.hs
@@ -14,21 +14,22 @@ import GHC.CmmToAsm.Format
 import GHC.CmmToAsm.Types
 import GHC.CmmToAsm.Utils
 import GHC.CmmToAsm.Config
+import GHC.CmmToAsm.Reg.Target (targetClassOfReg)
 import GHC.Platform.Reg
+import GHC.Platform.Reg.Class.Unified

 import GHC.Platform.Regs
 import GHC.Cmm.BlockId
-import GHC.Cmm.Dataflow.Collections
 import GHC.Cmm.Dataflow.Label
 import GHC.Cmm
 import GHC.Cmm.CLabel
 import GHC.Utils.Outputable
 import GHC.Platform
-import GHC.Types.Unique.Supply
+import GHC.Types.Unique.DSM

 import GHC.Utils.Panic

-import Data.Maybe (fromMaybe)
+import Data.Maybe (fromMaybe, catMaybes)

 import GHC.Stack

@@ -79,13 +80,15 @@ regUsageOfInstr platform instr = case instr of

  -- 1. Arithmetic Instructions ------------------------------------------------
  ADD dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
-  CMN l r                  -> usage (regOp l ++ regOp r, [])
  CMP l r                  -> usage (regOp l ++ regOp r, [])
+  CMN l r                  -> usage (regOp l ++ regOp r, [])
  MSUB dst src1 src2 src3  -> usage (regOp src1 ++ regOp src2 ++ regOp src3, regOp dst)
  MUL dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
  NEG dst src              -> usage (regOp src, regOp dst)
  SMULH dst src1 src2      -> usage (regOp src1 ++ regOp src2, regOp dst)
  SMULL dst src1 src2      -> usage (regOp src1 ++ regOp src2, regOp dst)
+  UMULH dst src1 src2      -> usage (regOp src1 ++ regOp src2, regOp dst)
+  UMULL dst src1 src2      -> usage (regOp src1 ++ regOp src2, regOp dst)
  SDIV dst src1 src2       -> usage (regOp src1 ++ regOp src2, regOp dst)
  SUB dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
  UDIV dst src1 src2       -> usage (regOp src1 ++ regOp src2, regOp dst)
@@ -99,12 +102,14 @@ regUsageOfInstr platform instr = case instr of
  UXTB dst src             -> usage (regOp src, regOp dst)
  SXTH dst src             -> usage (regOp src, regOp dst)
  UXTH dst src             -> usage (regOp src, regOp dst)
+  CLZ  dst src             -> usage (regOp src, regOp dst)
+  RBIT dst src             -> usage (regOp src, regOp dst)
+  REV   dst src            -> usage (regOp src, regOp dst)
+  -- REV32 dst src            -> usage (regOp src, regOp dst)
+  REV16 dst src            -> usage (regOp src, regOp dst)
  -- 3. Logical and Move Instructions ------------------------------------------
  AND dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
  ASR dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
-  BIC dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
-  BICS dst src1 src2       -> usage (regOp src1 ++ regOp src2, regOp dst)
-  EON dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
  EOR dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
  LSL dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
  LSR dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
@@ -113,13 +118,12 @@ regUsageOfInstr platform instr = case instr of
  MOVZ dst src             -> usage (regOp src, regOp dst)
  MVN dst src              -> usage (regOp src, regOp dst)
  ORR dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
-  ROR dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
-  TST src1 src2            -> usage (regOp src1 ++ regOp src2, [])
  -- 4. Branch Instructions ----------------------------------------------------
  J t                      -> usage (regTarget t, [])
+  J_TBL _ _ t              -> usage ([t], [])
  B t                      -> usage (regTarget t, [])
  BCOND _ t                -> usage (regTarget t, [])
-  BL t ps _rs              -> usage (regTarget t ++ ps, callerSavedRegisters)
+  BL t ps                  -> usage (regTarget t ++ ps, callerSavedRegisters)

  -- 5. Atomic Instructions ----------------------------------------------------
  -- 6. Conditional Instructions -----------------------------------------------
@@ -131,30 +135,38 @@ regUsageOfInstr platform instr = case instr of
  STLR _ src dst           -> usage (regOp src ++ regOp dst, [])
  LDR _ dst src            -> usage (regOp src, regOp dst)
  LDAR _ dst src           -> usage (regOp src, regOp dst)
-  -- TODO is this right? see STR, which I'm only partial about being right?
-  STP _ src1 src2 dst      -> usage (regOp src1 ++ regOp src2 ++ regOp dst, [])
-  LDP _ dst1 dst2 src      -> usage (regOp src, regOp dst1 ++ regOp dst2)

  -- 8. Synchronization Instructions -------------------------------------------
-  DMBSY                    -> usage ([], [])
-  DMBISH                   -> usage ([], [])
+  DMBISH _                 -> usage ([], [])

  -- 9. Floating Point Instructions --------------------------------------------
+  FMOV dst src             -> usage (regOp src, regOp dst)
  FCVT dst src             -> usage (regOp src, regOp dst)
  SCVTF dst src            -> usage (regOp src, regOp dst)
  FCVTZS dst src           -> usage (regOp src, regOp dst)
  FABS dst src             -> usage (regOp src, regOp dst)
+  FSQRT dst src            -> usage (regOp src, regOp dst)
+  FMIN dst src1 src2       -> usage (regOp src1 ++ regOp src2, regOp dst)
+  FMAX dst src1 src2       -> usage (regOp src1 ++ regOp src2, regOp dst)
  FMA _ dst src1 src2 src3 ->
    usage (regOp src1 ++ regOp src2 ++ regOp src3, regOp dst)

-  _ -> panic $ "regUsageOfInstr: " ++ instrCon instr
+  LOCATION{} -> panic $ "regUsageOfInstr: " ++ instrCon instr
+  NEWBLOCK{} -> panic $ "regUsageOfInstr: " ++ instrCon instr

  where
        -- filtering the usage is necessary, otherwise the register
        -- allocator will try to allocate pre-defined fixed stg
        -- registers as well, as they show up.
-        usage (src, dst) = RU (filter (interesting platform) src)
-                              (filter (interesting platform) dst)
+        usage (src, dst) = RU (map mkFmt $ filter (interesting platform) src)
+                              (map mkFmt $ filter (interesting platform) dst)
+          -- SIMD NCG TODO: the format here is used for register spilling/unspilling.
+          -- As the AArch64 NCG does not currently support SIMD registers,
+          -- this simple logic is OK.
+        mkFmt r = RegWithFormat r fmt
+          where fmt = case targetClassOfReg platform r of
+                        RcInteger -> II64
+                        RcFloatOrVector -> FF64

        regAddr :: AddrMode -> [Reg]
        regAddr (AddrRegReg r1 r2) = [r1, r2]
@@ -175,9 +187,10 @@ regUsageOfInstr platform instr = case instr of
        -- Is this register interesting for the register allocator?
        interesting :: Platform -> Reg -> Bool
        interesting _        (RegVirtual _)                 = True
-        interesting _        (RegReal (RealRegSingle (-1))) = False
        interesting platform (RegReal (RealRegSingle i))    = freeReg platform i

+-- Note [AArch64 Register assignments]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 -- Save caller save registers
 -- This is x0-x18
 --
@@ -200,6 +213,8 @@ regUsageOfInstr platform instr = case instr of
 -- '---------------------------------------------------------------------------------------------------------------------------------------------------------------'
 -- IR: Indirect result location register, IP: Intra-procedure register, PL: Platform register, FP: Frame pointer, LR: Link register, SP: Stack pointer
 -- BR: Base, SL: SpLim
+--
+-- TODO: The zero register is currently mapped to -1 but should get it's own separate number.
 callerSavedRegisters :: [Reg]
 callerSavedRegisters
    = map regSingle [0..18]
@@ -219,13 +234,15 @@ patchRegsOfInstr instr env = case instr of
    DELTA{}             -> instr
    -- 1. Arithmetic Instructions ----------------------------------------------
    ADD o1 o2 o3   -> ADD (patchOp o1) (patchOp o2) (patchOp o3)
-    CMN o1 o2      -> CMN (patchOp o1) (patchOp o2)
    CMP o1 o2      -> CMP (patchOp o1) (patchOp o2)
+    CMN o1 o2      -> CMN (patchOp o1) (patchOp o2)
    MSUB o1 o2 o3 o4 -> MSUB (patchOp o1) (patchOp o2) (patchOp o3) (patchOp o4)
    MUL o1 o2 o3   -> MUL (patchOp o1) (patchOp o2) (patchOp o3)
    NEG o1 o2      -> NEG (patchOp o1) (patchOp o2)
    SMULH o1 o2 o3 -> SMULH (patchOp o1) (patchOp o2)  (patchOp o3)
    SMULL o1 o2 o3 -> SMULL (patchOp o1) (patchOp o2)  (patchOp o3)
+    UMULH o1 o2 o3 -> UMULH (patchOp o1) (patchOp o2)  (patchOp o3)
+    UMULL o1 o2 o3 -> UMULL (patchOp o1) (patchOp o2)  (patchOp o3)
    SDIV o1 o2 o3  -> SDIV (patchOp o1) (patchOp o2) (patchOp o3)
    SUB o1 o2 o3   -> SUB  (patchOp o1) (patchOp o2) (patchOp o3)
    UDIV o1 o2 o3  -> UDIV (patchOp o1) (patchOp o2) (patchOp o3)
@@ -239,14 +256,16 @@ patchRegsOfInstr instr env = case instr of
    UXTB o1 o2       -> UXTB (patchOp o1) (patchOp o2)
    SXTH o1 o2       -> SXTH (patchOp o1) (patchOp o2)
    UXTH o1 o2       -> UXTH (patchOp o1) (patchOp o2)
+    CLZ o1 o2        -> CLZ  (patchOp o1) (patchOp o2)
+    RBIT o1 o2       -> RBIT (patchOp o1) (patchOp o2)
+    REV   o1 o2      -> REV  (patchOp o1) (patchOp o2)
+    -- REV32 o1 o2      -> REV32 (patchOp o1) (patchOp o2)
+    REV16 o1 o2      -> REV16 (patchOp o1) (patchOp o2)
+

    -- 3. Logical and Move Instructions ----------------------------------------
    AND o1 o2 o3   -> AND  (patchOp o1) (patchOp o2) (patchOp o3)
-    ANDS o1 o2 o3  -> ANDS (patchOp o1) (patchOp o2) (patchOp o3)
    ASR o1 o2 o3   -> ASR  (patchOp o1) (patchOp o2) (patchOp o3)
-    BIC o1 o2 o3   -> BIC  (patchOp o1) (patchOp o2) (patchOp o3)
-    BICS o1 o2 o3  -> BICS (patchOp o1) (patchOp o2) (patchOp o3)
-    EON o1 o2 o3   -> EON  (patchOp o1) (patchOp o2) (patchOp o3)
    EOR o1 o2 o3   -> EOR  (patchOp o1) (patchOp o2) (patchOp o3)
    LSL o1 o2 o3   -> LSL  (patchOp o1) (patchOp o2) (patchOp o3)
    LSR o1 o2 o3   -> LSR  (patchOp o1) (patchOp o2) (patchOp o3)
@@ -255,14 +274,13 @@ patchRegsOfInstr instr env = case instr of
    MOVZ o1 o2     -> MOVZ (patchOp o1) (patchOp o2)
    MVN o1 o2      -> MVN  (patchOp o1) (patchOp o2)
    ORR o1 o2 o3   -> ORR  (patchOp o1) (patchOp o2) (patchOp o3)
-    ROR o1 o2 o3   -> ROR  (patchOp o1) (patchOp o2) (patchOp o3)
-    TST o1 o2      -> TST  (patchOp o1) (patchOp o2)

    -- 4. Branch Instructions --------------------------------------------------
-    J t            -> J (patchTarget t)
-    B t            -> B (patchTarget t)
-    BL t rs ts     -> BL (patchTarget t) rs ts
-    BCOND c t      -> BCOND c (patchTarget t)
+    J t               -> J (patchTarget t)
+    J_TBL ids mbLbl t -> J_TBL ids mbLbl (env t)
+    B t               -> B (patchTarget t)
+    BL t rs           -> BL (patchTarget t) rs
+    BCOND c t         -> BCOND c (patchTarget t)

    -- 5. Atomic Instructions --------------------------------------------------
    -- 6. Conditional Instructions ---------------------------------------------
@@ -274,22 +292,24 @@ patchRegsOfInstr instr env = case instr of
    STLR f o1 o2   -> STLR f (patchOp o1) (patchOp o2)
    LDR f o1 o2    -> LDR f (patchOp o1) (patchOp o2)
    LDAR f o1 o2   -> LDAR f (patchOp o1) (patchOp o2)
-    STP f o1 o2 o3 -> STP f (patchOp o1) (patchOp o2) (patchOp o3)
-    LDP f o1 o2 o3 -> LDP f (patchOp o1) (patchOp o2) (patchOp o3)

    -- 8. Synchronization Instructions -----------------------------------------
-    DMBSY          -> DMBSY
-    DMBISH         -> DMBISH
+    DMBISH c       -> DMBISH c

    -- 9. Floating Point Instructions ------------------------------------------
+    FMOV o1 o2     -> FMOV (patchOp o1) (patchOp o2)
    FCVT o1 o2     -> FCVT (patchOp o1) (patchOp o2)
    SCVTF o1 o2    -> SCVTF (patchOp o1) (patchOp o2)
    FCVTZS o1 o2   -> FCVTZS (patchOp o1) (patchOp o2)
    FABS o1 o2     -> FABS (patchOp o1) (patchOp o2)
+    FSQRT o1 o2    -> FSQRT (patchOp o1) (patchOp o2)
+    FMIN o1 o2 o3  -> FMIN (patchOp o1) (patchOp o2) (patchOp o3)
+    FMAX o1 o2 o3  -> FMAX (patchOp o1) (patchOp o2) (patchOp o3)
    FMA s o1 o2 o3 o4 ->
      FMA s (patchOp o1) (patchOp o2) (patchOp o3) (patchOp o4)

-    _              -> panic $ "patchRegsOfInstr: " ++ instrCon instr
+    NEWBLOCK{}     -> panic $ "patchRegsOfInstr: " ++ instrCon instr
+    LOCATION{}     -> panic $ "patchRegsOfInstr: " ++ instrCon instr
    where
        patchOp :: Operand -> Operand
        patchOp (OpReg w r) = OpReg w (env r)
@@ -314,6 +334,7 @@ isJumpishInstr instr = case instr of
    CBZ{} -> True
    CBNZ{} -> True
    J{} -> True
+    J_TBL{} -> True
    B{} -> True
    BL{} -> True
    BCOND{} -> True
@@ -327,11 +348,23 @@ jumpDestsOfInstr (ANN _ i) = jumpDestsOfInstr i
 jumpDestsOfInstr (CBZ _ t) = [ id | TBlock id <- [t]]
 jumpDestsOfInstr (CBNZ _ t) = [ id | TBlock id <- [t]]
 jumpDestsOfInstr (J t) = [id | TBlock id <- [t]]
+jumpDestsOfInstr (J_TBL ids _mbLbl _r) = catMaybes ids
 jumpDestsOfInstr (B t) = [id | TBlock id <- [t]]
-jumpDestsOfInstr (BL t _ _) = [ id | TBlock id <- [t]]
+jumpDestsOfInstr (BL t _) = [ id | TBlock id <- [t]]
 jumpDestsOfInstr (BCOND _ t) = [ id | TBlock id <- [t]]
 jumpDestsOfInstr _ = []

+canFallthroughTo :: Instr -> BlockId -> Bool
+canFallthroughTo (ANN _ i) bid = canFallthroughTo i bid
+canFallthroughTo (J (TBlock target)) bid = bid == target
+canFallthroughTo (J_TBL targets _ _) bid = all isTargetBid targets
+  where
+    isTargetBid target = case target of
+      Nothing -> True
+      Just target -> target == bid
+canFallthroughTo (B (TBlock target)) bid = bid == target
+canFallthroughTo _ _ = False
+
 -- | Change the destination of this jump instruction.
 -- Used in the linear allocator when adding fixup blocks for join
 -- points.
@@ -342,8 +375,9 @@ patchJumpInstr instr patchF
        CBZ r (TBlock bid) -> CBZ r (TBlock (patchF bid))
        CBNZ r (TBlock bid) -> CBNZ r (TBlock (patchF bid))
        J (TBlock bid) -> J (TBlock (patchF bid))
+        J_TBL ids mbLbl r -> J_TBL (map (fmap patchF) ids) mbLbl r
        B (TBlock bid) -> B (TBlock (patchF bid))
-        BL (TBlock bid) ps rs -> BL (TBlock (patchF bid)) ps rs
+        BL (TBlock bid) ps -> BL (TBlock (patchF bid)) ps
        BCOND c (TBlock bid) -> BCOND c (TBlock (patchF bid))
        _ -> panic $ "patchJumpInstr: " ++ instrCon instr

@@ -367,12 +401,12 @@ patchJumpInstr instr patchF
 mkSpillInstr
   :: HasCallStack
   => NCGConfig
-   -> Reg       -- register to spill
+   -> RegWithFormat -- register to spill
   -> Int       -- current stack delta
   -> Int       -- spill slot to use
   -> [Instr]

-mkSpillInstr config reg delta slot =
+mkSpillInstr config (RegWithFormat reg fmt) delta slot =
  case off - delta of
    imm | -256 <= imm && imm <= 255                               -> [ mkStrSp imm ]
    imm | imm > 0 && imm .&. 0x7 == 0x0 && imm <= 0xfff           -> [ mkStrSp imm ]
@@ -383,8 +417,8 @@ mkSpillInstr config reg delta slot =
    where
        a .&~. b = a .&. (complement b)

-        fmt = fmtOfRealReg (case reg of { RegReal r -> r; _ -> panic "Expected real reg"})
-
+        -- SIMD NCG TODO: emit the correct instructions to spill a vector register.
+        -- You can take inspiration from the X86_64 backend.
        mkIp0SpillAddr imm = ANN (text "Spill: IP0 <- SP + " <> int imm) $ ADD ip0 sp (OpImm (ImmInt imm))
        mkStrSp imm = ANN (text "Spill@" <> int (off - delta)) $ STR fmt (OpReg W64 reg) (OpAddr (AddrRegImm (regSingle 31) (ImmInt imm)))
        mkStrIp0 imm = ANN (text "Spill@" <> int (off - delta)) $ STR fmt (OpReg W64 reg) (OpAddr (AddrRegImm (regSingle 16) (ImmInt imm)))
@@ -393,12 +427,11 @@ mkSpillInstr config reg delta slot =

 mkLoadInstr
   :: NCGConfig
-   -> Reg       -- register to load
+   -> RegWithFormat
   -> Int       -- current stack delta
   -> Int       -- spill slot to use
   -> [Instr]
-
-mkLoadInstr config reg delta slot =
+mkLoadInstr config (RegWithFormat reg fmt) delta slot =
  case off - delta of
    imm | -256 <= imm && imm <= 255                               -> [ mkLdrSp imm ]
    imm | imm > 0 && imm .&. 0x7 == 0x0 && imm <= 0xfff           -> [ mkLdrSp imm ]
@@ -409,8 +442,8 @@ mkLoadInstr config reg delta slot =
    where
        a .&~. b = a .&. (complement b)

-        fmt = fmtOfRealReg (case reg of { RegReal r -> r; _ -> panic "Expected real reg"})
-
+        -- SIMD NCG TODO: emit the correct instructions to load a vector register.
+        -- You can take inspiration from the X86_64 backend.
        mkIp0SpillAddr imm = ANN (text "Reload: IP0 <- SP + " <> int imm) $ ADD ip0 sp (OpImm (ImmInt imm))
        mkLdrSp imm = ANN (text "Reload@" <> int (off - delta)) $ LDR fmt (OpReg W64 reg) (OpAddr (AddrRegImm (regSingle 31) (ImmInt imm)))
        mkLdrIp0 imm = ANN (text "Reload@" <> int (off - delta)) $ LDR fmt (OpReg W64 reg) (OpAddr (AddrRegImm (regSingle 16) (ImmInt imm)))
@@ -432,7 +465,6 @@ isMetaInstr instr
    COMMENT{}   -> True
    MULTILINE_COMMENT{} -> True
    LOCATION{}  -> True
-    LDATA{}     -> True
    NEWBLOCK{}  -> True
    DELTA{}     -> True
    PUSH_STACK_FRAME -> True
@@ -441,13 +473,27 @@ isMetaInstr instr

 -- | Copy the value in a register to another one.
 -- Must work for all register classes.
-mkRegRegMoveInstr :: Reg -> Reg -> Instr
-mkRegRegMoveInstr src dst = ANN (text "Reg->Reg Move: " <> ppr src <> text " -> " <> ppr dst) $ MOV (OpReg W64 dst) (OpReg W64 src)
+mkRegRegMoveInstr :: Format -> Reg -> Reg -> Instr
+mkRegRegMoveInstr _fmt src dst
+  = ANN (text "Reg->Reg Move: " <> ppr src <> text " -> " <> ppr dst) $ MOV (OpReg W64 dst) (OpReg W64 src)
+  -- SIMD NCG TODO: incorrect for vector formats

-- | Take the source and destination from this reg -> reg move instruction
-- or Nothing if it's not one
+-- | Take the source and destination registers from a move instruction of same
+-- register class (`RegClass`).
+--
+-- The idea is to identify moves that can be eliminated by the register
+-- allocator: If the source register serves no special purpose, one could
+-- continue using it; saving one move instruction. For this, the register kinds
+-- (classes) must be the same (no conversion involved.)
 takeRegRegMoveInstr :: Instr -> Maybe (Reg,Reg)
--takeRegRegMoveInstr (MOV (OpReg fmt dst) (OpReg fmt' src)) | fmt == fmt' = Just (src, dst)
+takeRegRegMoveInstr (MOV (OpReg _fmt dst) (OpReg _fmt' src))
+  | classOfReg dst == classOfReg src = pure (src, dst)
+  where
+    classOfReg :: Reg -> RegClass
+    classOfReg reg
+      = case reg of
+        RegVirtual vr -> classOfVirtualReg ArchAArch64 vr
+        RegReal rr -> classOfRealReg rr
 takeRegRegMoveInstr _ = Nothing

 -- | Make an unconditional jump instruction.
@@ -475,13 +521,13 @@ allocMoreStack
  :: Platform
  -> Int
  -> NatCmmDecl statics GHC.CmmToAsm.AArch64.Instr.Instr
-  -> UniqSM (NatCmmDecl statics GHC.CmmToAsm.AArch64.Instr.Instr, [(BlockId,BlockId)])
+  -> UniqDSM (NatCmmDecl statics GHC.CmmToAsm.AArch64.Instr.Instr, [(BlockId,BlockId)])

 allocMoreStack _ _ top@(CmmData _ _) = return (top,[])
 allocMoreStack platform slots proc@(CmmProc info lbl live (ListGraph code)) = do
    let entries = entryBlocks proc

-    uniqs <- getUniquesM
+    retargetList <- mapM (\e -> (e,) <$> newBlockId) entries

    let
      delta = ((x + stackAlign - 1) `quot` stackAlign) * stackAlign -- round up
@@ -490,8 +536,6 @@ allocMoreStack platform slots proc@(CmmProc info lbl live (ListGraph code)) = do
      alloc   = mkStackAllocInstr   platform delta
      dealloc = mkStackDeallocInstr platform delta

-      retargetList = (zip entries (map mkBlockId uniqs))
-
      new_blockmap :: LabelMap BlockId
      new_blockmap = mapFromList retargetList

@@ -535,11 +579,6 @@ data Instr
    -- location pseudo-op (file, line, col, name)
    | LOCATION Int Int Int String

-    -- some static data spat out during code
-    -- generation.  Will be extracted before
-    -- pretty-printing.
-    | LDATA   Section RawCmmStatics
-
    -- start a new basic block.  Useful during
    -- codegen, removed later.  Preceding
    -- instruction should be a jump, as per the
@@ -566,8 +605,8 @@ data Instr
    -- | ADDS Operand Operand Operand -- rd = rn + rm
    -- | ADR ...
    -- | ADRP ...
-    | CMN Operand Operand -- rd + op2
    | CMP Operand Operand -- rd - op2
+    | CMN Operand Operand -- rd + op2
    -- | MADD ...
    -- | MNEG ...
    | MSUB Operand Operand Operand Operand -- rd = ra - rn × rm
@@ -590,8 +629,8 @@ data Instr
    -- | UMADDL ...  -- Xd = Xa + Wn × Wm
    -- | UMNEGL ... -- Xd = - Wn × Wm
    -- | UMSUBL ... -- Xd = Xa - Wn × Wm
-    -- | UMULH ... -- Xd = (Xn × Xm)_127:64
-    -- | UMULL ... -- Xd = Wn × Wm
+    | UMULH Operand Operand Operand -- Xd = (Xn × Xm)_127:64
+    | UMULL Operand Operand Operand -- Xd = Wn × Wm

    -- 2. Bit Manipulation Instructions ----------------------------------------
    | SBFM Operand Operand Operand Operand -- rd = rn[i,j]
@@ -604,14 +643,18 @@ data Instr
    -- Signed/Unsigned bitfield extract
    | SBFX Operand Operand Operand Operand -- rd = rn[i,j]
    | UBFX Operand Operand Operand Operand -- rd = rn[i,j]
+    | CLZ  Operand Operand -- rd = countLeadingZeros(rn)
+    | RBIT Operand Operand -- rd = reverseBits(rn)
+    | REV Operand Operand   -- rd = reverseBytes(rn): (for 32 & 64 bit operands)
+                            -- 0xAABBCCDD -> 0xDDCCBBAA
+    | REV16 Operand Operand -- rd = reverseBytes16(rn)
+                            -- 0xAABB_CCDD -> xBBAA_DDCC
+    -- | REV32 Operand Operand -- rd = reverseBytes32(rn) - 64bit operands only!
+    --                         -- 0xAABBCCDD_EEFFGGHH -> 0XDDCCBBAA_HHGGFFEE

    -- 3. Logical and Move Instructions ----------------------------------------
    | AND Operand Operand Operand -- rd = rn & op2
-    | ANDS Operand Operand Operand -- rd = rn & op2
    | ASR Operand Operand Operand -- rd = rn ≫ rm  or  rd = rn ≫ #i, i is 6 bits
-    | BIC Operand Operand Operand -- rd = rn & ~op2
-    | BICS Operand Operand Operand -- rd = rn & ~op2
-    | EON Operand Operand Operand -- rd = rn ⊕ ~op2
    | EOR Operand Operand Operand -- rd = rn ⊕ op2
    | LSL Operand Operand Operand -- rd = rn ≪ rm  or rd = rn ≪ #i, i is 6 bits
    | LSR Operand Operand Operand -- rd = rn ≫ rm  or rd = rn ≫ #i, i is 6 bits
@@ -620,18 +663,13 @@ data Instr
    -- | MOVN Operand Operand
    | MOVZ Operand Operand
    | MVN Operand Operand -- rd = ~rn
-    | ORN Operand Operand Operand -- rd = rn | ~op2
    | ORR Operand Operand Operand -- rd = rn | op2
-    | ROR Operand Operand Operand -- rd = rn ≫ rm  or  rd = rn ≫ #i, i is 6 bits
-    | TST Operand Operand -- rn & op2
    -- Load and stores.
    -- TODO STR/LDR might want to change to STP/LDP with XZR for the second register.
    | STR Format Operand Operand -- str Xn, address-mode // Xn -> *addr
    | STLR Format Operand Operand -- stlr Xn, address-mode // Xn -> *addr
    | LDR Format Operand Operand -- ldr Xn, address-mode // Xn <- *addr
    | LDAR Format Operand Operand -- ldar Xn, address-mode // Xn <- *addr
-    | STP Format Operand Operand Operand -- stp Xn, Xm, address-mode // Xn -> *addr, Xm -> *(addr + 8)
-    | LDP Format Operand Operand Operand -- stp Xn, Xm, address-mode // Xn <- *addr, Xm <- *(addr + 8)

    -- Conditional instructions
    | CSET Operand Cond   -- if(cond) op <- 1 else op <- 0
@@ -640,14 +678,16 @@ data Instr
    | CBNZ Operand Target -- if op /= 0, then branch.
    -- Branching.
    | J Target            -- like B, but only generated from genJump. Used to distinguish genJumps from others.
+    | J_TBL [Maybe BlockId] (Maybe CLabel) Reg -- A jump instruction with data for switch/jump tables
    | B Target            -- unconditional branching b/br. (To a blockid, label or register)
-    | BL Target [Reg] [Reg] -- branch and link (e.g. set x30 to next pc, and branch)
+    | BL Target [Reg] -- branch and link (e.g. set x30 to next pc, and branch)
    | BCOND Cond Target   -- branch with condition. b.<cond>

    -- 8. Synchronization Instructions -----------------------------------------
-    | DMBSY
-    | DMBISH
+    | DMBISH DMBISHFlags
    -- 9. Floating Point Instructions
+    -- move to/from general purpose <-> floating, or floating to floating
+    | FMOV Operand Operand
    -- Float ConVerT
    | FCVT Operand Operand
    -- Signed ConVerT Float
@@ -656,6 +696,12 @@ data Instr
    | FCVTZS Operand Operand
    -- Float ABSolute value
    | FABS Operand Operand
+    -- Float minimum
+    | FMIN Operand Operand Operand
+    -- Float maximum
+    | FMAX Operand Operand Operand
+    -- Float SQuare RooT
+    | FSQRT Operand Operand

    -- | Floating-point fused multiply-add instructions
    --
@@ -665,6 +711,9 @@ data Instr
    -- - fnmadd: d = - r1 * r2 - r3
    | FMA FMASign Operand Operand Operand Operand

+data DMBISHFlags = DmbLoad | DmbLoadStore
+  deriving (Eq, Show)
+
 instrCon :: Instr -> String
 instrCon i =
    case i of
@@ -672,7 +721,6 @@ instrCon i =
      MULTILINE_COMMENT{} -> "COMMENT"
      ANN{} -> "ANN"
      LOCATION{} -> "LOCATION"
-      LDATA{} -> "LDATA"
      NEWBLOCK{} -> "NEWBLOCK"
      DELTA{} -> "DELTA"
      SXTB{} -> "SXTB"
@@ -682,26 +730,29 @@ instrCon i =
      PUSH_STACK_FRAME{} -> "PUSH_STACK_FRAME"
      POP_STACK_FRAME{} -> "POP_STACK_FRAME"
      ADD{} -> "ADD"
-      CMN{} -> "CMN"
      CMP{} -> "CMP"
+      CMN{} -> "CMN"
      MSUB{} -> "MSUB"
      MUL{} -> "MUL"
      NEG{} -> "NEG"
      SDIV{} -> "SDIV"
      SMULH{} -> "SMULH"
      SMULL{} -> "SMULL"
+      UMULH{} -> "UMULH"
+      UMULL{} -> "UMULL"
      SUB{} -> "SUB"
      UDIV{} -> "UDIV"
      SBFM{} -> "SBFM"
      UBFM{} -> "UBFM"
      SBFX{} -> "SBFX"
      UBFX{} -> "UBFX"
+      CLZ{} -> "CLZ"
+      RBIT{} -> "RBIT"
+      REV{} -> "REV"
+      REV16{} -> "REV16"
+      -- REV32{} -> "REV32"
      AND{} -> "AND"
-      ANDS{} -> "ANDS"
      ASR{} -> "ASR"
-      BIC{} -> "BIC"
-      BICS{} -> "BICS"
-      EON{} -> "EON"
      EOR{} -> "EOR"
      LSL{} -> "LSL"
      LSR{} -> "LSR"
@@ -709,29 +760,28 @@ instrCon i =
      MOVK{} -> "MOVK"
      MOVZ{} -> "MOVZ"
      MVN{} -> "MVN"
-      ORN{} -> "ORN"
      ORR{} -> "ORR"
-      ROR{} -> "ROR"
-      TST{} -> "TST"
      STR{} -> "STR"
      STLR{} -> "STLR"
      LDR{} -> "LDR"
      LDAR{} -> "LDAR"
-      STP{} -> "STP"
-      LDP{} -> "LDP"
      CSET{} -> "CSET"
      CBZ{} -> "CBZ"
      CBNZ{} -> "CBNZ"
      J{} -> "J"
+      J_TBL {} -> "J_TBL"
      B{} -> "B"
      BL{} -> "BL"
      BCOND{} -> "BCOND"
-      DMBSY{} -> "DMBSY"
      DMBISH{} -> "DMBISH"
+      FMOV{} -> "FMOV"
      FCVT{} -> "FCVT"
      SCVTF{} -> "SCVTF"
      FCVTZS{} -> "FCVTZS"
      FABS{} -> "FABS"
+      FSQRT{} -> "FSQRT"
+      FMIN {} -> "FMIN"
+      FMAX {} -> "FMAX"
      FMA variant _ _ _ _ ->
        case variant of
          FMAdd  -> "FMADD"
@@ -777,15 +827,10 @@ data Operand
 opReg :: Width -> Reg -> Operand
 opReg = OpReg

-xzr, wzr, sp, ip0 :: Operand
-xzr = OpReg W64 (RegReal (RealRegSingle (-1)))
-wzr = OpReg W32 (RegReal (RealRegSingle (-1)))
+sp, ip0 :: Operand
 sp  = OpReg W64 (RegReal (RealRegSingle 31))
 ip0 = OpReg W64 (RegReal (RealRegSingle 16))

-reg_zero :: Reg
-reg_zero = RegReal (RealRegSingle (-1))
-
 _x :: Int -> Operand
 _x i = OpReg W64 (RegReal (RealRegSingle i))
 x0,  x1,  x2,  x3,  x4,  x5,  x6,  x7  :: Operand

--- a/compiler/GHC/CmmToAsm/AArch64/Ppr.hs
+++ b/compiler/GHC/CmmToAsm/AArch64/Ppr.hs
@@ -16,7 +16,6 @@ import GHC.CmmToAsm.Types
 import GHC.CmmToAsm.Utils

 import GHC.Cmm hiding (topInfoTable)
-import GHC.Cmm.Dataflow.Collections
 import GHC.Cmm.Dataflow.Label

 import GHC.Cmm.BlockId
@@ -307,7 +306,6 @@ pprReg w r = case r of
  RegReal    (RealRegSingle i) -> ppr_reg_no w i
  -- virtual regs should not show up, but this is helpful for debugging.
  RegVirtual (VirtualRegI u)   -> text "%vI_" <> pprUniqueAlways u
-  RegVirtual (VirtualRegF u)   -> text "%vF_" <> pprUniqueAlways u
  RegVirtual (VirtualRegD u)   -> text "%vD_" <> pprUniqueAlways u
  _                            -> pprPanic "AArch64.pprReg" (text $ show r)

@@ -317,6 +315,7 @@ pprReg w r = case r of
         | w == W64 = text "sp"
         | w == W32 = text "wsp"

+    -- See Note [AArch64 Register assignments]
    ppr_reg_no w i
         | i < 0, w == W32 = text "wzr"
         | i < 0, w == W64 = text "xzr"
@@ -332,13 +331,13 @@ pprReg w r = case r of
         | i <= 63, w == W16 = text "h" <> int (i-32)
         | i <= 63, w == W32 = text "s" <> int (i-32)
         | i <= 63, w == W64 = text "d" <> int (i-32)
-         -- no support for 'q'uad in GHC's NCG yet.
-         | otherwise = text "very naughty powerpc register"
+         | i <= 63, w == W128= text "q" <> int (i-32)
+         | otherwise = text "very naughty AArch64 register" <+> parens (text (show w) <+> int i)

 isFloatOp :: Operand -> Bool
 isFloatOp (OpReg _ (RegReal (RealRegSingle i))) | i > 31 = True
-isFloatOp (OpReg _ (RegVirtual (VirtualRegF _))) = True
 isFloatOp (OpReg _ (RegVirtual (VirtualRegD _))) = True
+-- SIMD NCG TODO: what about VirtualVecV128? Could be floating-point or not?
 isFloatOp _ = False

 pprInstr :: IsDoc doc => Platform -> Instr -> doc
@@ -357,7 +356,6 @@ pprInstr platform instr = case instr of
                      -- in the final instruction stream. But we still want to be able to
                      -- print it for debugging purposes.
                      line (text "BLOCK " <> pprAsmLabel platform (blockLbl blockid))
-  LDATA _ _  -> panic "pprInstr: LDATA"

  -- Pseudo Instructions -------------------------------------------------------

@@ -371,16 +369,18 @@ pprInstr platform instr = case instr of
  ADD  o1 o2 o3
    | isFloatOp o1 && isFloatOp o2 && isFloatOp o3 -> op3 (text "\tfadd") o1 o2 o3
    | otherwise -> op3 (text "\tadd") o1 o2 o3
-  CMN  o1 o2    -> op2 (text "\tcmn") o1 o2
  CMP  o1 o2
    | isFloatOp o1 && isFloatOp o2 -> op2 (text "\tfcmp") o1 o2
    | otherwise -> op2 (text "\tcmp") o1 o2
+  CMN  o1 o2       -> op2 (text "\tcmn") o1 o2
  MSUB o1 o2 o3 o4 -> op4 (text "\tmsub") o1 o2 o3 o4
  MUL  o1 o2 o3
    | isFloatOp o1 && isFloatOp o2 && isFloatOp o3 -> op3 (text "\tfmul") o1 o2 o3
    | otherwise -> op3 (text "\tmul") o1 o2 o3
  SMULH o1 o2 o3 -> op3 (text "\tsmulh") o1 o2 o3
  SMULL o1 o2 o3 -> op3 (text "\tsmull") o1 o2 o3
+  UMULH o1 o2 o3 -> op3 (text "\tumulh") o1 o2 o3
+  UMULL o1 o2 o3 -> op3 (text "\tumull") o1 o2 o3
  NEG  o1 o2
    | isFloatOp o1 && isFloatOp o2 -> op2 (text "\tfneg") o1 o2
    | otherwise -> op2 (text "\tneg") o1 o2
@@ -396,6 +396,11 @@ pprInstr platform instr = case instr of
  -- 2. Bit Manipulation Instructions ------------------------------------------
  SBFM o1 o2 o3 o4 -> op4 (text "\tsbfm") o1 o2 o3 o4
  UBFM o1 o2 o3 o4 -> op4 (text "\tubfm") o1 o2 o3 o4
+  CLZ  o1 o2       -> op2 (text "\tclz")  o1 o2
+  RBIT  o1 o2      -> op2 (text "\trbit")  o1 o2
+  REV o1 o2        -> op2 (text "\trev")  o1 o2
+  REV16 o1 o2      -> op2 (text "\trev16")  o1 o2
+  -- REV32 o1 o2      -> op2 (text "\trev32")  o1 o2
  -- signed and unsigned bitfield extract
  SBFX o1 o2 o3 o4 -> op4 (text "\tsbfx") o1 o2 o3 o4
  UBFX o1 o2 o3 o4 -> op4 (text "\tubfx") o1 o2 o3 o4
@@ -406,11 +411,7 @@ pprInstr platform instr = case instr of

  -- 3. Logical and Move Instructions ------------------------------------------
  AND o1 o2 o3  -> op3 (text "\tand") o1 o2 o3
-  ANDS o1 o2 o3 -> op3 (text "\tands") o1 o2 o3
  ASR o1 o2 o3  -> op3 (text "\tasr") o1 o2 o3
-  BIC o1 o2 o3  -> op3 (text "\tbic") o1 o2 o3
-  BICS o1 o2 o3 -> op3 (text "\tbics") o1 o2 o3
-  EON o1 o2 o3  -> op3 (text "\teon") o1 o2 o3
  EOR o1 o2 o3  -> op3 (text "\teor") o1 o2 o3
  LSL o1 o2 o3  -> op3 (text "\tlsl") o1 o2 o3
  LSR o1 o2 o3  -> op3 (text "\tlsr") o1 o2 o3
@@ -420,20 +421,18 @@ pprInstr platform instr = case instr of
  MOVK o1 o2    -> op2 (text "\tmovk") o1 o2
  MOVZ o1 o2    -> op2 (text "\tmovz") o1 o2
  MVN o1 o2     -> op2 (text "\tmvn") o1 o2
-  ORN o1 o2 o3  -> op3 (text "\torn") o1 o2 o3
  ORR o1 o2 o3  -> op3 (text "\torr") o1 o2 o3
-  ROR o1 o2 o3  -> op3 (text "\tror") o1 o2 o3
-  TST o1 o2     -> op2 (text "\ttst") o1 o2

  -- 4. Branch Instructions ----------------------------------------------------
  J t            -> pprInstr platform (B t)
+  J_TBL _ _ r    -> pprInstr platform (B (TReg r))
  B (TBlock bid) -> line $ text "\tb" <+> pprAsmLabel platform (mkLocalBlockLabel (getUnique bid))
  B (TLabel lbl) -> line $ text "\tb" <+> pprAsmLabel platform lbl
  B (TReg r)     -> line $ text "\tbr" <+> pprReg W64 r

-  BL (TBlock bid) _ _ -> line $ text "\tbl" <+> pprAsmLabel platform (mkLocalBlockLabel (getUnique bid))
-  BL (TLabel lbl) _ _ -> line $ text "\tbl" <+> pprAsmLabel platform lbl
-  BL (TReg r)     _ _ -> line $ text "\tblr" <+> pprReg W64 r
+  BL (TBlock bid) _ -> line $ text "\tbl" <+> pprAsmLabel platform (mkLocalBlockLabel (getUnique bid))
+  BL (TLabel lbl) _ -> line $ text "\tbl" <+> pprAsmLabel platform lbl
+  BL (TReg r)     _ -> line $ text "\tblr" <+> pprReg W64 r

  BCOND c (TBlock bid) -> line $ text "\t" <> pprBcond c <+> pprAsmLabel platform (mkLocalBlockLabel (getUnique bid))
  BCOND c (TLabel lbl) -> line $ text "\t" <> pprBcond c <+> pprAsmLabel platform lbl
@@ -527,17 +526,19 @@ pprInstr platform instr = case instr of
  LDR _f o1 o2 -> op2 (text "\tldr") o1 o2
  LDAR _f o1 o2 -> op2 (text "\tldar") o1 o2

-  STP _f o1 o2 o3 -> op3 (text "\tstp") o1 o2 o3
-  LDP _f o1 o2 o3 -> op3 (text "\tldp") o1 o2 o3
-
  -- 8. Synchronization Instructions -------------------------------------------
-  DMBSY -> line $ text "\tdmb sy"
-  DMBISH -> line $ text "\tdmb ish"
+  DMBISH DmbLoadStore -> line $ text "\tdmb ish"
+  DMBISH DmbLoad -> line $ text "\tdmb ishld"
+
  -- 9. Floating Point Instructions --------------------------------------------
+  FMOV o1 o2 -> op2 (text "\tfmov") o1 o2
  FCVT o1 o2 -> op2 (text "\tfcvt") o1 o2
  SCVTF o1 o2 -> op2 (text "\tscvtf") o1 o2
  FCVTZS o1 o2 -> op2 (text "\tfcvtzs") o1 o2
  FABS o1 o2 -> op2 (text "\tfabs") o1 o2
+  FSQRT o1 o2 -> op2 (text "\tfsqrt") o1 o2
+  FMIN o1 o2 o3 -> op3 (text "\tfmin") o1 o2 o3
+  FMAX o1 o2 o3 -> op3 (text "\tfmax") o1 o2 o3
  FMA variant d r1 r2 r3 ->
    let fma = case variant of
                FMAdd  -> text "\tfmadd"

--- a/compiler/GHC/CmmToAsm/AArch64/RegInfo.hs
+++ b/compiler/GHC/CmmToAsm/AArch64/RegInfo.hs
@@ -14,18 +14,16 @@ data JumpDest = DestBlockId BlockId
 instance Outputable JumpDest where
  ppr (DestBlockId bid) = text "jd<blk>:" <> ppr bid

-- TODO: documen what this does. See Ticket 19914
+-- Implementations of the methods of 'NgcImpl'
+
 getJumpDestBlockId :: JumpDest -> Maybe BlockId
 getJumpDestBlockId (DestBlockId bid) = Just bid

-- TODO: document what this does. See Ticket 19914
 canShortcut :: Instr -> Maybe JumpDest
 canShortcut _ = Nothing

-- TODO: document what this does. See Ticket 19914
 shortcutStatics :: (BlockId -> Maybe JumpDest) -> RawCmmStatics -> RawCmmStatics
 shortcutStatics _ other_static = other_static

-- TODO: document what this does. See Ticket 19914
 shortcutJump :: (BlockId -> Maybe JumpDest) -> Instr -> Instr
 shortcutJump _ other = other
--- a/compiler/GHC/CmmToAsm/AArch64/Regs.hs
+++ b/compiler/GHC/CmmToAsm/AArch64/Regs.hs
@@ -5,7 +5,7 @@ import GHC.Prelude
 import GHC.Data.FastString

 import GHC.Platform.Reg
-import GHC.Platform.Reg.Class
+import GHC.Platform.Reg.Class.Unified
 import GHC.CmmToAsm.Format

 import GHC.Cmm
@@ -17,6 +17,7 @@ import GHC.Utils.Outputable
 import GHC.Utils.Panic
 import GHC.Platform

+-- TODO: Should this include the zero register?
 allMachRegNos   :: [RegNo]
 allMachRegNos   = [0..31] ++ [32..63]
 -- allocatableRegs is allMachRegNos with the fixed-use regs removed.
@@ -108,14 +109,12 @@ virtualRegSqueeze cls vr
                VirtualRegHi{}          -> 1
                _other                  -> 0

-        RcDouble
+        RcFloatOrVector
         -> case vr of
                VirtualRegD{}           -> 1
-                VirtualRegF{}           -> 0
+                VirtualRegV128{}        -> 1
                _other                  -> 0

-        _other -> 0
-
 {-# INLINE realRegSqueeze #-}
 realRegSqueeze :: RegClass -> RealReg -> Int
 realRegSqueeze cls rr
@@ -126,14 +125,12 @@ realRegSqueeze cls rr
                        | regNo < 32    -> 1     -- first fp reg is 32
                        | otherwise     -> 0

-        RcDouble
+        RcFloatOrVector
         -> case rr of
                RealRegSingle regNo
                        | regNo < 32    -> 0
                        | otherwise     -> 1

-        _other -> 0
-
 mkVirtualReg :: Unique -> Format -> VirtualReg
 mkVirtualReg u format
   | not (isFloatFormat format) = VirtualRegI u
@@ -147,18 +144,10 @@ mkVirtualReg u format
 classOfRealReg :: RealReg -> RegClass
 classOfRealReg (RealRegSingle i)
        | i < 32        = RcInteger
-        | otherwise     = RcDouble
-
-fmtOfRealReg :: RealReg -> Format
-fmtOfRealReg real_reg =
-  case classOfRealReg real_reg of
-            RcInteger -> II64
-            RcDouble  -> FF64
-            RcFloat   -> panic "No float regs on arm"
+        | otherwise     = RcFloatOrVector

 regDotColor :: RealReg -> SDoc
 regDotColor reg
 = case classOfRealReg reg of
        RcInteger       -> text "blue"
-        RcFloat         -> text "red"
-        RcDouble        -> text "green"
+        RcFloatOrVector -> text "red"
--- a/compiler/GHC/CmmToAsm/BlockLayout.hs
+++ b/compiler/GHC/CmmToAsm/BlockLayout.hs
@@ -26,7 +26,6 @@ import GHC.CmmToAsm.Config

 import GHC.Cmm
 import GHC.Cmm.BlockId
-import GHC.Cmm.Dataflow.Collections
 import GHC.Cmm.Dataflow.Label

 import GHC.Types.Unique.FM
@@ -49,7 +48,7 @@ import Data.STRef
 import Control.Monad.ST.Strict
 import Control.Monad (foldM, unless)
 import GHC.Data.UnionFind
-import GHC.Types.Unique.Supply (UniqSM)
+import GHC.Types.Unique.DSM (UniqDSM)

 {-
  Note [CFG based code layout]
@@ -509,8 +508,8 @@ mergeChains edges chains
              union cFrom new_point
            merge edges chains
          where
-            cFrom = expectJust "mergeChains:chainMap:from" $ mapLookup from chains
-            cTo = expectJust "mergeChains:chainMap:to"   $ mapLookup to   chains
+            cFrom = expectJust $ mapLookup from chains
+            cTo = expectJust $ mapLookup to   chains


 -- See Note [Chain based CFG serialization] for the general idea.
@@ -758,7 +757,7 @@ sequenceChain  info weights     blocks@((BasicBlock entry _):_) =
            --pprTraceIt "placedBlocks" $
            -- ++ [] is still kinda expensive
            if null unplaced then blockList else blockList ++ unplaced
-        getBlock bid = expectJust "Block placement" $ mapLookup bid blockMap
+        getBlock bid = expectJust $ mapLookup bid blockMap
    in
        --Assert we placed all blocks given as input
        assert (all (\bid -> mapMember bid blockMap) placedBlocks) $
@@ -772,10 +771,9 @@ dropJumps :: forall a i. Instruction i => LabelMap a -> [GenBasicBlock i]
 dropJumps _    [] = []
 dropJumps info (BasicBlock lbl ins:todo)
    | Just ins <- nonEmpty ins --This can happen because of shortcutting
-    , [dest] <- jumpDestsOfInstr (NE.last ins)
    , BasicBlock nextLbl _ : _ <- todo
-    , not (mapMember dest info)
-    , nextLbl == dest
+    , canFallthroughTo (NE.last ins) nextLbl
+    , not (mapMember nextLbl info)
    = BasicBlock lbl (NE.init ins) : dropJumps info todo
    | otherwise
    = BasicBlock lbl ins : dropJumps info todo
@@ -795,7 +793,7 @@ sequenceTop
    => NcgImpl statics instr jumpDest
    -> Maybe CFG -- ^ CFG if we have one.
    -> NatCmmDecl statics instr -- ^ Function to serialize
-    -> UniqSM (NatCmmDecl statics instr)
+    -> UniqDSM (NatCmmDecl statics instr)

 sequenceTop _       _           top@(CmmData _ _) = pure top
 sequenceTop ncgImpl edgeWeights (CmmProc info lbl live (ListGraph blocks)) = do

--- a/compiler/GHC/CmmToAsm/CFG.hs
+++ b/compiler/GHC/CmmToAsm/CFG.hs
@@ -48,7 +48,6 @@ import GHC.Cmm.BlockId
 import GHC.Cmm as Cmm

 import GHC.Cmm.Switch
-import GHC.Cmm.Dataflow.Collections
 import GHC.Cmm.Dataflow.Label
 import GHC.Cmm.Dataflow.Block
 import qualified GHC.Cmm.Dataflow.Graph as G
@@ -312,8 +311,8 @@ shortcutWeightMap cuts cfg
        cuts_vars <- traverse (\p -> (p,) <$> fresh (Just p)) (concatMap (\(a, b) -> [a] ++ maybe [] (:[]) b) cuts_list)
        let cuts_map = mapFromList cuts_vars :: LabelMap (Point s (Maybe BlockId))
        -- Then unify according to the rewrites in the cuts map
-        mapM_ (\(from, to) -> expectJust "shortcutWeightMap" (mapLookup from cuts_map)
-                              `union` expectJust "shortcutWeightMap" (maybe (Just null) (flip mapLookup cuts_map) to) ) cuts_list
+        mapM_ (\(from, to) -> expectJust (mapLookup from cuts_map)
+                              `union` expectJust (maybe (Just null) (flip mapLookup cuts_map) to) ) cuts_list
        -- Then recover the unique representative, which is the result of following
        -- the chain to the end.
        mapM find cuts_map
@@ -417,13 +416,10 @@ getEdgeInfo from to m
    = Nothing

 getEdgeWeight :: CFG -> BlockId -> BlockId -> EdgeWeight
-getEdgeWeight cfg from to =
-    edgeWeight $ expectJust "Edgeweight for nonexisting block" $
-                 getEdgeInfo from to cfg
+getEdgeWeight cfg from to = edgeWeight $ expectJust $ getEdgeInfo from to cfg

 getTransitionSource :: BlockId -> BlockId -> CFG -> TransitionSource
-getTransitionSource from to cfg = transitionSource $ expectJust "Source info for nonexisting block" $
-                        getEdgeInfo from to cfg
+getTransitionSource from to cfg = transitionSource $ expectJust $ getEdgeInfo from to cfg

 reverseEdges :: CFG -> CFG
 reverseEdges cfg = mapFoldlWithKey (\cfg from toMap -> go (addNode cfg from) from toMap) mapEmpty cfg
@@ -601,7 +597,7 @@ addNodesBetween weights m updates =

 -}
 -- | Generate weights for a Cmm proc based on some simple heuristics.
-getCfgProc :: Platform -> Weights -> RawCmmDecl -> CFG
+getCfgProc :: Platform -> Weights -> GenCmmDecl d h CmmGraph -> CFG
 getCfgProc _        _       (CmmData {}) = mapEmpty
 getCfgProc platform weights (CmmProc _info _lab _live graph) = getCfg platform weights graph

@@ -1005,7 +1001,7 @@ mkGlobalWeights root localCfg
    blockMapping = listArray (0,mapSize vertexMapping - 1) revOrder :: Array Int BlockId
    -- Map from blockId to indices starting at zero
    toVertex :: BlockId -> Int
-    toVertex   blockId  = expectJust "mkGlobalWeights" $ mapLookup blockId vertexMapping
+    toVertex   blockId  = expectJust $ mapLookup blockId vertexMapping
    -- Map from indices starting at zero to blockIds
    fromVertex :: Int -> BlockId
    fromVertex vertex   = blockMapping ! vertex

--- a/compiler/GHC/CmmToAsm/CFG/Dominators.hs
+++ b/compiler/GHC/CmmToAsm/CFG/Dominators.hs
-{-# LANGUAGE BangPatterns #-}
-{-# LANGUAGE DeriveFunctor #-}
-{-# LANGUAGE FlexibleContexts #-}
-{-# LANGUAGE RankNTypes #-}
 {-# LANGUAGE Strict #-}

 {- |

--- a/compiler/GHC/CmmToAsm/Config.hs
+++ b/compiler/GHC/CmmToAsm/Config.hs
@@ -30,6 +30,9 @@ data NCGConfig = NCGConfig
   , ncgAsmLinting            :: !Bool            -- ^ Perform ASM linting pass
   , ncgDoConstantFolding     :: !Bool            -- ^ Perform CMM constant folding
   , ncgSseVersion            :: Maybe SseVersion -- ^ (x86) SSE instructions
+   , ncgAvxEnabled            :: !Bool
+   , ncgAvx2Enabled           :: !Bool
+   , ncgAvx512fEnabled        :: !Bool
   , ncgBmiVersion            :: Maybe BmiVersion -- ^ (x86) BMI instructions
   , ncgDumpRegAllocStages    :: !Bool
   , ncgDumpAsmStats          :: !Bool

--- a/compiler/GHC/CmmToAsm/Dwarf.hs
+++ b/compiler/GHC/CmmToAsm/Dwarf.hs
@@ -14,7 +14,7 @@ import GHC.Unit.Module
 import GHC.Utils.Outputable
 import GHC.Platform
 import GHC.Types.Unique
-import GHC.Types.Unique.Supply
+import GHC.Types.Unique.DSM

 import GHC.CmmToAsm.Dwarf.Constants
 import GHC.CmmToAsm.Dwarf.Types
@@ -29,11 +29,9 @@ import qualified Data.Map as Map
 import System.FilePath

 import qualified GHC.Cmm.Dataflow.Label as H
-import qualified GHC.Cmm.Dataflow.Collections as H

 -- | Generate DWARF/debug information
-dwarfGen :: IsDoc doc => String -> NCGConfig -> ModLocation -> UniqSupply -> [DebugBlock]
-            -> (doc, UniqSupply)
+dwarfGen :: IsDoc doc => String -> NCGConfig -> ModLocation -> DUniqSupply -> [DebugBlock] -> (doc, DUniqSupply)
 dwarfGen _        _      _      us []     = (empty, us)
 dwarfGen compPath config modLoc us blocks =
  let platform = ncgPlatform config
@@ -66,7 +64,7 @@ dwarfGen compPath config modLoc us blocks =

  -- .debug_info section: Information records on procedures and blocks
      -- unique to identify start and end compilation unit .debug_inf
-      (unitU, us') = takeUniqFromSupply us
+      (unitU, us') = takeUniqueFromDSupply us
      infoSct = vcat [ line (dwarfInfoLabel <> colon)
                     , dwarfInfoSection platform
                     , compileUnitHeader platform unitU
@@ -80,7 +78,7 @@ dwarfGen compPath config modLoc us blocks =
                line (dwarfLineLabel <> colon)

  -- .debug_frame section: Information about the layout of the GHC stack
-      (framesU, us'') = takeUniqFromSupply us'
+      (framesU, us'') = takeUniqueFromDSupply us'
      frameSct = dwarfFrameSection platform $$
                 line (dwarfFrameLabel <> colon) $$
                 pprDwarfFrame platform (debugFrame platform framesU procs)
@@ -91,8 +89,8 @@ dwarfGen compPath config modLoc us blocks =
      aranges = dwarfARangesSection platform $$ pprDwarfARanges platform aranges' unitU

  in (infoSct $$ abbrevSct $$ lineSct $$ frameSct $$ aranges, us'')
-{-# SPECIALIZE dwarfGen :: String -> NCGConfig -> ModLocation -> UniqSupply -> [DebugBlock] -> (SDoc, UniqSupply) #-}
-{-# SPECIALIZE dwarfGen :: String -> NCGConfig -> ModLocation -> UniqSupply -> [DebugBlock] -> (HDoc, UniqSupply) #-} -- see Note [SPECIALIZE to HDoc] in GHC.Utils.Outputable
+{-# SPECIALIZE dwarfGen :: String -> NCGConfig -> ModLocation -> DUniqSupply -> [DebugBlock] -> (SDoc, DUniqSupply) #-}
+{-# SPECIALIZE dwarfGen :: String -> NCGConfig -> ModLocation -> DUniqSupply -> [DebugBlock] -> (HDoc, DUniqSupply) #-} -- see Note [SPECIALIZE to HDoc] in GHC.Utils.Outputable

 -- | Build an address range entry for one proc.
 -- With split sections, each proc needs its own entry, since they may get

--- a/compiler/GHC/CmmToAsm/Dwarf/Constants.hs
+++ b/compiler/GHC/CmmToAsm/Dwarf/Constants.hs
@@ -240,6 +240,7 @@ dwarfRegNo p r = case platformArch p of
    | r == xmm15 -> 32
  ArchPPC_64 _ -> fromIntegral $ toRegNo r
  ArchAArch64  -> fromIntegral $ toRegNo r
+  ArchRISCV64  -> fromIntegral $ toRegNo r
  _other -> error "dwarfRegNo: Unsupported platform or unknown register!"

 -- | Virtual register number to use for return address.
@@ -252,5 +253,6 @@ dwarfReturnRegNo p
    ArchX86    -> 8  -- eip
    ArchX86_64 -> 16 -- rip
    ArchPPC_64 ELF_V2 -> 65 -- lr (link register)
-    ArchAArch64-> 30
+    ArchAArch64 -> 30
+    ArchRISCV64 -> 1 -- ra (return address)
    _other     -> error "dwarfReturnRegNo: Unsupported platform!"
--- a/compiler/GHC/CmmToAsm/Dwarf/Types.hs
+++ b/compiler/GHC/CmmToAsm/Dwarf/Types.hs
@@ -150,14 +150,14 @@ pprAbbrevDecls platform haveDebugLine =
 pprDwarfInfo :: IsDoc doc => Platform -> Bool -> DwarfInfo -> doc
 pprDwarfInfo platform haveSrc d
  = case d of
-      DwarfCompileUnit {}  -> hasChildren
-      DwarfSubprogram {}   -> hasChildren
-      DwarfBlock {}        -> hasChildren
-      DwarfSrcNote {}      -> noChildren
+      DwarfCompileUnit {dwChildren = kids} -> hasChildren kids
+      DwarfSubprogram  {dwChildren = kids} -> hasChildren kids
+      DwarfBlock       {dwChildren = kids} -> hasChildren kids
+      DwarfSrcNote {}                      -> noChildren
  where
-    hasChildren =
+    hasChildren kids =
        pprDwarfInfoOpen platform haveSrc d $$
-        vcat (map (pprDwarfInfo platform haveSrc) (dwChildren d)) $$
+        vcat (map (pprDwarfInfo platform haveSrc) kids) $$
        pprDwarfInfoClose
    noChildren = pprDwarfInfoOpen platform haveSrc d
 {-# SPECIALIZE pprDwarfInfo :: Platform -> Bool -> DwarfInfo -> SDoc #-}

--- a/compiler/GHC/CmmToAsm/Format.hs
+++ b/compiler/GHC/CmmToAsm/Format.hs
+{-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE PatternSynonyms #-}
+{-# LANGUAGE ViewPatterns #-}
+
 -- | Formats on this architecture
 --      A Format is a combination of width and class
 --
@@ -9,14 +13,28 @@
 --              properly. eg SPARC doesn't care about FF80.
 --
 module GHC.CmmToAsm.Format (
-    Format(..),
+    Format(.., IntegerFormat),
+    ScalarFormat(..),
    intFormat,
    floatFormat,
    isIntFormat,
+    isIntScalarFormat,
+    intScalarFormat,
    isFloatFormat,
+    vecFormat,
+    isVecFormat,
    cmmTypeFormat,
    formatToWidth,
-    formatInBytes
+    scalarWidth,
+    formatInBytes,
+    isFloatScalarFormat,
+    isFloatOrFloatVecFormat,
+    floatScalarFormat,
+    scalarFormatFormat,
+    VirtualRegWithFormat(..),
+    RegWithFormat(..),
+    takeVirtualRegs,
+    takeRealRegs,
 )

 where
@@ -24,9 +42,33 @@ where
 import GHC.Prelude

 import GHC.Cmm
+import GHC.Platform.Reg ( Reg(..), RealReg, VirtualReg )
+import GHC.Types.Unique ( Uniquable(..) )
+import GHC.Types.Unique.Set
 import GHC.Utils.Outputable
 import GHC.Utils.Panic

+{- Note [GHC's data format representations]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+GHC has severals types that represent various aspects of data format.
+These include:
+
+ * 'CmmType.CmmType': The data classification used throughout the C--
+   pipeline. This is a pair of a CmmCat and a Width.
+
+ * 'CmmType.CmmCat': What the bits in a C-- value mean (e.g. a pointer, integer, or floating-point value)
+
+ * 'CmmType.Width': The width of a C-- value.
+
+ * 'CmmType.Length': The width (measured in number of scalars) of a vector value.
+
+ * 'Format.Format': The data format representation used by much of the backend.
+
+ * 'Format.ScalarFormat': The format of a 'Format.VecFormat'\'s scalar.
+
+ * 'RegClass.RegClass': Whether a register is an integer or a floating point/vector register.
+-}
+
 -- It looks very like the old MachRep, but it's now of purely local
 -- significance, here in the native code generator.  You can change it
 -- without global consequences.
@@ -49,8 +91,70 @@ data Format
        | II64
        | FF32
        | FF64
-        deriving (Show, Eq)
+        | VecFormat !Length       -- ^ number of elements (always at least 2)
+                    !ScalarFormat -- ^ format of each element
+        deriving (Show, Eq, Ord)
+
+pattern IntegerFormat :: Format
+pattern IntegerFormat <- ( isIntegerFormat -> True )
+{-# COMPLETE IntegerFormat, FF32, FF64, VecFormat #-}
+
+isIntegerFormat :: Format -> Bool
+isIntegerFormat = \case
+  II8  -> True
+  II16 -> True
+  II32 -> True
+  II64 -> True
+  _    -> False
+
+
+instance Outputable Format where
+  ppr fmt = text (show fmt)
+
+data ScalarFormat
+  = FmtInt8
+  | FmtInt16
+  | FmtInt32
+  | FmtInt64
+  | FmtFloat
+  | FmtDouble
+  deriving (Show, Eq, Ord)
+
+scalarFormatFormat :: ScalarFormat -> Format
+scalarFormatFormat = \case
+  FmtInt8 -> II8
+  FmtInt16 -> II16
+  FmtInt32 -> II32
+  FmtInt64 -> II64
+  FmtFloat -> FF32
+  FmtDouble -> FF64
+
+isFloatScalarFormat :: ScalarFormat -> Bool
+isFloatScalarFormat = \case
+  FmtFloat -> True
+  FmtDouble -> True
+  _ -> False
+
+isFloatOrFloatVecFormat :: Format -> Bool
+isFloatOrFloatVecFormat = \case
+  VecFormat _ sFmt -> isFloatScalarFormat sFmt
+  fmt -> isFloatFormat fmt
+
+floatScalarFormat :: Width -> ScalarFormat
+floatScalarFormat W32 = FmtFloat
+floatScalarFormat W64 = FmtDouble
+floatScalarFormat w = pprPanic "floatScalarFormat" (ppr w)

+isIntScalarFormat :: ScalarFormat -> Bool
+isIntScalarFormat = not . isFloatScalarFormat
+
+intScalarFormat :: Width -> ScalarFormat
+intScalarFormat = \case
+  W8  -> FmtInt8
+  W16 -> FmtInt16
+  W32 -> FmtInt32
+  W64 -> FmtInt64
+  w   -> pprPanic "intScalarFormat" (ppr w)

 -- | Get the integer format of this width.
 intFormat :: Width -> Format
@@ -64,6 +168,15 @@ intFormat width
            "produce code for Format.intFormat " ++ show other
            ++ "\n\tConsider using the llvm backend with -fllvm"

+-- | Check if a format represent an integer value.
+isIntFormat :: Format -> Bool
+isIntFormat format =
+  case format of
+    II8  -> True
+    II16 -> True
+    II32 -> True
+    II64 -> True
+    _    -> False

 -- | Get the float format of this width.
 floatFormat :: Width -> Format
@@ -71,13 +184,8 @@ floatFormat width
 = case width of
        W32     -> FF32
        W64     -> FF64
-
        other   -> pprPanic "Format.floatFormat" (ppr other)

-- | Check if a format represent an integer value.
-isIntFormat :: Format -> Bool
-isIntFormat = not . isFloatFormat
-
 -- | Check if a format represents a floating point value.
 isFloatFormat :: Format -> Bool
 isFloatFormat format
@@ -86,11 +194,33 @@ isFloatFormat format
        FF64    -> True
        _       -> False

+vecFormat :: CmmType -> Format
+vecFormat ty =
+  let l      = vecLength ty
+      elemTy = vecElemType ty
+   in if isFloatType elemTy
+      then case typeWidth elemTy of
+             W32 -> VecFormat l FmtFloat
+             W64 -> VecFormat l FmtDouble
+             _   -> pprPanic "Incorrect vector element width" (ppr elemTy)
+      else case typeWidth elemTy of
+             W8  -> VecFormat l FmtInt8
+             W16 -> VecFormat l FmtInt16
+             W32 -> VecFormat l FmtInt32
+             W64 -> VecFormat l FmtInt64
+             _   -> pprPanic "Incorrect vector element width" (ppr elemTy)
+
+-- | Check if a format represents a vector
+isVecFormat :: Format -> Bool
+isVecFormat (VecFormat {}) = True
+isVecFormat _              = False
+

 -- | Convert a Cmm type to a Format.
 cmmTypeFormat :: CmmType -> Format
 cmmTypeFormat ty
        | isFloatType ty        = floatFormat (typeWidth ty)
+        | isVecType ty          = vecFormat ty
        | otherwise             = intFormat (typeWidth ty)


@@ -98,13 +228,65 @@ cmmTypeFormat ty
 formatToWidth :: Format -> Width
 formatToWidth format
 = case format of
-        II8             -> W8
-        II16            -> W16
-        II32            -> W32
-        II64            -> W64
-        FF32            -> W32
-        FF64            -> W64
+        II8  -> W8
+        II16 -> W16
+        II32 -> W32
+        II64 -> W64
+        FF32 -> W32
+        FF64 -> W64
+        VecFormat l s ->
+          widthFromBytes (l * widthInBytes (scalarWidth s))

+scalarWidth :: ScalarFormat -> Width
+scalarWidth = \case
+  FmtInt8   -> W8
+  FmtInt16  -> W16
+  FmtInt32  -> W32
+  FmtInt64  -> W64
+  FmtFloat  -> W32
+  FmtDouble -> W64

 formatInBytes :: Format -> Int
 formatInBytes = widthInBytes . formatToWidth
+
+--------------------------------------------------------------------------------
+
+-- | A typed virtual register: a virtual register, together with the specific
+-- format we are using it at.
+data VirtualRegWithFormat
+    = VirtualRegWithFormat
+    { virtualRegWithFormat_reg :: {-# UNPACK #-} !VirtualReg
+    , virtualRegWithFormat_format :: !Format
+    }
+
+-- | A typed register: a register, together with the specific format we
+-- are using it at.
+data RegWithFormat
+    = RegWithFormat
+    { regWithFormat_reg :: {-# UNPACK #-} !Reg
+    , regWithFormat_format :: !Format
+    }
+
+instance Show RegWithFormat where
+  show (RegWithFormat reg fmt) = show reg ++ "::" ++ show fmt
+
+instance Uniquable RegWithFormat where
+  getUnique = getUnique . regWithFormat_reg
+
+instance Outputable VirtualRegWithFormat where
+  ppr (VirtualRegWithFormat reg fmt) = ppr reg <+> dcolon <+> ppr fmt
+
+instance Outputable RegWithFormat where
+  ppr (RegWithFormat reg fmt) = ppr reg <+> dcolon <+> ppr fmt
+
+-- | Take all the virtual registers from this set.
+takeVirtualRegs :: UniqSet RegWithFormat -> UniqSet VirtualReg
+takeVirtualRegs = mapMaybeUniqSet_sameUnique $
+  \ case { RegWithFormat { regWithFormat_reg = RegVirtual vr } -> Just vr; _ -> Nothing }
+  -- See Note [Unique Determinism and code generation]
+
+-- | Take all the real registers from this set.
+takeRealRegs :: UniqSet RegWithFormat -> UniqSet RealReg
+takeRealRegs = mapMaybeUniqSet_sameUnique $
+  \ case { RegWithFormat { regWithFormat_reg = RegReal rr } -> Just rr; _ -> Nothing }
+  -- See Note [Unique Determinism and code generation]
--- a/compiler/GHC/CmmToAsm/Instr.hs
+++ b/compiler/GHC/CmmToAsm/Instr.hs
@@ -16,6 +16,9 @@ import GHC.Cmm.BlockId

 import GHC.CmmToAsm.Config
 import GHC.Data.FastString
+import GHC.CmmToAsm.Format
+
+import GHC.Utils.Misc (HasDebugCallStack)

 -- | Holds a list of source and destination registers used by a
 --      particular instruction.
@@ -29,8 +32,8 @@ import GHC.Data.FastString
 --
 data RegUsage
        = RU    {
-                reads :: [Reg],
-                writes :: [Reg]
+                reads :: [RegWithFormat],
+                writes :: [RegWithFormat]
                }
        deriving Show

@@ -59,7 +62,9 @@ class Instruction instr where
        -- | Apply a given mapping to all the register references in this
        --      instruction.
        patchRegsOfInstr
-                :: instr
+                :: HasDebugCallStack
+                => Platform
+                -> instr
                -> (Reg -> Reg)
                -> instr

@@ -71,11 +76,17 @@ class Instruction instr where
                :: instr -> Bool


-        -- | Give the possible destinations of this jump instruction.
+        -- | Give the possible *local block* destinations of this jump instruction.
        --      Must be defined for all jumpish instructions.
        jumpDestsOfInstr
                :: instr -> [BlockId]

+        -- | Check if the instr always transfers control flow
+        -- to the given block. Used by code layout to eliminate
+        -- jumps that can be replaced by fall through.
+        canFallthroughTo
+                :: instr -> BlockId -> Bool
+

        -- | Change the destination of this jump instruction.
        --      Used in the linear allocator when adding fixup blocks for join
@@ -88,20 +99,22 @@ class Instruction instr where

        -- | An instruction to spill a register into a spill slot.
        mkSpillInstr
-                :: NCGConfig
-                -> Reg          -- ^ the reg to spill
+                :: HasDebugCallStack
+                => NCGConfig
+                -> RegWithFormat    -- ^ the reg to spill
                -> Int          -- ^ the current stack delta
-                -> Int          -- ^ spill slot to use
-                -> [instr]        -- ^ instructions
+                -> Int          -- ^ spill slots to use
+                -> [instr]      -- ^ instructions


        -- | An instruction to reload a register from a spill slot.
        mkLoadInstr
-                :: NCGConfig
-                -> Reg          -- ^ the reg to reload.
+                :: HasDebugCallStack
+                => NCGConfig
+                -> RegWithFormat    -- ^ the reg to reload.
                -> Int          -- ^ the current stack delta
                -> Int          -- ^ the spill slot to use
-                -> [instr]        -- ^ instructions
+                -> [instr]      -- ^ instructions

        -- | See if this instruction is telling us the current C stack delta
        takeDeltaInstr
@@ -124,15 +137,18 @@ class Instruction instr where
        -- | Copy the value in a register to another one.
        --      Must work for all register classes.
        mkRegRegMoveInstr
-                :: Platform
-                -> Reg          -- ^ source register
-                -> Reg          -- ^ destination register
+                :: HasDebugCallStack
+                => NCGConfig
+                -> Format
+                -> Reg -- ^ source register
+                -> Reg -- ^ destination register
                -> instr

        -- | Take the source and destination from this reg -> reg move instruction
        --      or Nothing if it's not one
        takeRegRegMoveInstr
-                :: instr
+                :: Platform
+                -> instr
                -> Maybe (Reg, Reg)

        -- | Make an unconditional jump instruction.

--- a/compiler/GHC/CmmToAsm/Monad.hs
+++ b/compiler/GHC/CmmToAsm/Monad.hs
@@ -55,7 +55,6 @@ import GHC.CmmToAsm.Config
 import GHC.CmmToAsm.Types

 import GHC.Cmm.BlockId
-import GHC.Cmm.Dataflow.Collections
 import GHC.Cmm.Dataflow.Label
 import GHC.Cmm.CLabel           ( CLabel )
 import GHC.Cmm.DebugBlock
@@ -63,7 +62,7 @@ import GHC.Cmm.Expr             (LocalReg (..), isWord64)

 import GHC.Data.FastString      ( FastString )
 import GHC.Types.Unique.FM
-import GHC.Types.Unique.Supply
+import GHC.Types.Unique.DSM
 import GHC.Types.Unique         ( Unique )
 import GHC.Unit.Module

@@ -74,13 +73,33 @@ import GHC.Utils.Misc
 import GHC.CmmToAsm.CFG
 import GHC.CmmToAsm.CFG.Weight

+-- | A Native Code Generator implementation is parametrised over
+-- * The type of static data (typically related to 'CmmStatics')
+-- * The type of instructions
+-- * The type of jump destinations
 data NcgImpl statics instr jumpDest = NcgImpl {
    ncgConfig                 :: !NCGConfig,
    cmmTopCodeGen             :: RawCmmDecl -> NatM [NatCmmDecl statics instr],
    generateJumpTableForInstr :: instr -> Maybe (NatCmmDecl statics instr),
+    -- | Given a jump destination, if it refers to a block, return the block id of the destination.
    getJumpDestBlockId        :: jumpDest -> Maybe BlockId,
+    -- | Does this jump always jump to a single destination and is shortcutable?
+    --
+    -- We use this to determine whether the given instruction is a shortcutable
+    -- jump to some destination - See Note [supporting shortcutting]
+    -- Note that if we return a destination here we *most* support the relevant shortcutting in
+    -- shortcutStatics for jump tables and shortcutJump for the instructions itself.
    canShortcut               :: instr -> Maybe jumpDest,
+    -- | Replace references to blockIds with other destinations - used to update jump tables.
    shortcutStatics           :: (BlockId -> Maybe jumpDest) -> statics -> statics,
+    -- | Change the jump destination(s) of an instruction.
+    --
+    -- Rewrites the destination of a jump instruction to another
+    -- destination, if the given function returns a new jump destination for
+    -- the 'BlockId' of the original destination.
+    --
+    -- For instance, for a mapping @block_a -> dest_b@ and a instruction @goto block_a@ we would
+    -- rewrite the instruction to @goto dest_b@
    shortcutJump              :: (BlockId -> Maybe jumpDest) -> instr -> instr,
    -- | 'Module' is only for printing internal labels. See Note [Internal proc
    -- labels] in CLabel.
@@ -90,11 +109,11 @@ data NcgImpl statics instr jumpDest = NcgImpl {
    maxSpillSlots             :: Int,
    allocatableRegs           :: [RealReg],
    ncgAllocMoreStack         :: Int -> NatCmmDecl statics instr
-                              -> UniqSM (NatCmmDecl statics instr, [(BlockId,BlockId)]),
+                              -> UniqDSM (NatCmmDecl statics instr, [(BlockId,BlockId)]),
    -- ^ The list of block ids records the redirected jumps to allow us to update
    -- the CFG.
    ncgMakeFarBranches        :: Platform -> LabelMap RawCmmStatics -> [NatBasicBlock instr]
-                              -> UniqSM [NatBasicBlock instr],
+                              -> UniqDSM [NatBasicBlock instr],
    extractUnwindPoints       :: [instr] -> [UnwindPoint],
    -- ^ given the instruction sequence of a block, produce a list of
    -- the block's 'UnwindPoint's
@@ -106,6 +125,25 @@ data NcgImpl statics instr jumpDest = NcgImpl {
    -- when possible.
    }

+{- Note [supporting shortcutting]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+For the concept of shortcutting see Note [What is shortcutting].
+
+In order to support shortcutting across multiple backends uniformly we
+use canShortcut, shortcutStatics and shortcutJump.
+
+canShortcut tells us if the backend support shortcutting of a instruction
+and if so what destination we should retarget instruction to instead.
+
+shortcutStatics exists to allow us to update jump destinations in jump tables.
+
+shortcutJump updates the instructions itself.
+
+A backend can opt out of those by always returning Nothing for canShortcut
+and implementing shortcutStatics/shortcutJump as \_ x -> x
+
+-}
+
 {- Note [pprNatCmmDeclS and pprNatCmmDeclH]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Each NcgImpl provides two implementations of its CmmDecl printer, pprNatCmmDeclS
@@ -140,7 +178,7 @@ mistake would readily show up in performance tests). -}

 data NatM_State
        = NatM_State {
-                natm_us          :: UniqSupply,
+                natm_us          :: DUniqSupply,
                natm_delta       :: Int, -- ^ Stack offset for unwinding information
                natm_imports     :: [(CLabel)],
                natm_pic         :: Maybe Reg,
@@ -167,7 +205,7 @@ pattern NatM f <- NatM' (runState -> f)
 unNat :: NatM a -> NatM_State -> (a, NatM_State)
 unNat (NatM a) = a

-mkNatM_State :: UniqSupply -> Int -> NCGConfig ->
+mkNatM_State :: DUniqSupply -> Int -> NCGConfig ->
                DwarfFiles -> LabelMap DebugBlock -> CFG -> NatM_State
 mkNatM_State us delta config
        = \dwf dbg cfg ->
@@ -185,19 +223,13 @@ mkNatM_State us delta config
 initNat :: NatM_State -> NatM a -> (a, NatM_State)
 initNat = flip unNat

-instance MonadUnique NatM where
-  getUniqueSupplyM = NatM $ \st ->
-      case splitUniqSupply (natm_us st) of
-          (us1, us2) -> (us1, st {natm_us = us2})
-
+instance MonadGetUnique NatM where
  getUniqueM = NatM $ \st ->
-      case takeUniqFromSupply (natm_us st) of
-          (uniq, us') -> (uniq, st {natm_us = us'})
+      case takeUniqueFromDSupply (natm_us st) of
+        (uniq, us') -> (uniq, st {natm_us = us'})

 getUniqueNat :: NatM Unique
-getUniqueNat = NatM $ \ st ->
-    case takeUniqFromSupply $ natm_us st of
-    (uniq, us') -> (uniq, st {natm_us = us'})
+getUniqueNat = getUniqueM

 getDeltaNat :: NatM Int
 getDeltaNat = NatM $ \ st -> (natm_delta st, st)
No results found