diff --git a/compiler/cmm/CmmExpr.hs b/compiler/cmm/CmmExpr.hs
index d129d601f4fccf1fe1de565ce4a75416c054992b..601b1d9b858b6e656683ac6d7b2a21352d98e984 100644
--- a/compiler/cmm/CmmExpr.hs
+++ b/compiler/cmm/CmmExpr.hs
@@ -6,7 +6,7 @@
 
 module CmmExpr
     ( CmmExpr(..), cmmExprType, cmmExprWidth, maybeInvertCmmExpr
-    , CmmReg(..), cmmRegType
+    , CmmReg(..), cmmRegType, cmmRegWidth
     , CmmLit(..), cmmLitType
     , LocalReg(..), localRegType
     , GlobalReg(..), isArgReg, globalRegType
@@ -273,6 +273,9 @@ cmmRegType :: DynFlags -> CmmReg -> CmmType
 cmmRegType _      (CmmLocal  reg) = localRegType reg
 cmmRegType dflags (CmmGlobal reg) = globalRegType dflags reg
 
+cmmRegWidth :: DynFlags -> CmmReg -> Width
+cmmRegWidth dflags = typeWidth . cmmRegType dflags
+
 localRegType :: LocalReg -> CmmType
 localRegType (LocalReg _ rep) = rep
 
diff --git a/compiler/cmm/CmmMachOp.hs b/compiler/cmm/CmmMachOp.hs
index c5e9d9bf27ba69ac7a145b489bf7f353863e3646..70e53d2325d9c38da690b820eae7cf192f95a61d 100644
--- a/compiler/cmm/CmmMachOp.hs
+++ b/compiler/cmm/CmmMachOp.hs
@@ -107,6 +107,14 @@ data MachOp
   | MO_FS_Conv Width Width      -- Float -> Signed int
   | MO_SS_Conv Width Width      -- Signed int -> Signed int
   | MO_UU_Conv Width Width      -- unsigned int -> unsigned int
+  | MO_XX_Conv Width Width      -- int -> int; puts no requirements on the
+                                -- contents of upper bits when extending;
+                                -- narrowing is simply truncation; the only
+                                -- expectation is that we can recover the
+                                -- original value by applying the opposite
+                                -- MO_XX_Conv, e.g.,
+                                --   MO_XX_CONV W64 W8 (MO_XX_CONV W8 W64 x)
+                                -- is equivalent to just x.
   | MO_FF_Conv Width Width      -- Float -> Float
 
   -- Vector element insertion and extraction operations
@@ -392,6 +400,7 @@ machOpResultType dflags mop tys =
 
     MO_SS_Conv _ to     -> cmmBits to
     MO_UU_Conv _ to     -> cmmBits to
+    MO_XX_Conv _ to     -> cmmBits to
     MO_FS_Conv _ to     -> cmmBits to
     MO_SF_Conv _ to     -> cmmFloat to
     MO_FF_Conv _ to     -> cmmFloat to
@@ -483,6 +492,7 @@ machOpArgReps dflags op =
 
     MO_SS_Conv from _   -> [from]
     MO_UU_Conv from _   -> [from]
+    MO_XX_Conv from _   -> [from]
     MO_SF_Conv from _   -> [from]
     MO_FS_Conv from _   -> [from]
     MO_FF_Conv from _   -> [from]
diff --git a/compiler/cmm/CmmUtils.hs b/compiler/cmm/CmmUtils.hs
index 42d64842e2d9f6004f6c49d6fd7ae7c2dddb3e59..11e4df5bf4dfe8d96809ad6d757b755f76d15362 100644
--- a/compiler/cmm/CmmUtils.hs
+++ b/compiler/cmm/CmmUtils.hs
@@ -97,6 +97,8 @@ primRepCmmType dflags LiftedRep        = gcWord dflags
 primRepCmmType dflags UnliftedRep      = gcWord dflags
 primRepCmmType dflags IntRep           = bWord dflags
 primRepCmmType dflags WordRep          = bWord dflags
+primRepCmmType _      Int8Rep          = b8
+primRepCmmType _      Word8Rep         = b8
 primRepCmmType _      Int64Rep         = b64
 primRepCmmType _      Word64Rep        = b64
 primRepCmmType dflags AddrRep          = bWord dflags
@@ -131,8 +133,10 @@ primRepForeignHint VoidRep      = panic "primRepForeignHint:VoidRep"
 primRepForeignHint LiftedRep    = AddrHint
 primRepForeignHint UnliftedRep  = AddrHint
 primRepForeignHint IntRep       = SignedHint
-primRepForeignHint WordRep      = NoHint
+primRepForeignHint Int8Rep      = SignedHint
 primRepForeignHint Int64Rep     = SignedHint
+primRepForeignHint WordRep      = NoHint
+primRepForeignHint Word8Rep     = NoHint
 primRepForeignHint Word64Rep    = NoHint
 primRepForeignHint AddrRep      = AddrHint -- NB! AddrHint, but NonPtrArg
 primRepForeignHint FloatRep     = NoHint
diff --git a/compiler/cmm/MkGraph.hs b/compiler/cmm/MkGraph.hs
index 70229d067d76aa944cca3befdbc5a93e1228c332..bcd03bfa67401bb849dae56865a119daffbfa7d2 100644
--- a/compiler/cmm/MkGraph.hs
+++ b/compiler/cmm/MkGraph.hs
@@ -38,6 +38,7 @@ import OrdList
 import SMRep (ByteOff)
 import UniqSupply
 import Util
+import Panic
 
 
 -----------------------------------------------------------------------------
@@ -309,18 +310,33 @@ copyIn :: DynFlags -> Convention -> Area
 copyIn dflags conv area formals extra_stk
   = (stk_size, [r | (_, RegisterParam r) <- args], map ci (stk_args ++ args))
   where
-     ci (reg, RegisterParam r) =
-          CmmAssign (CmmLocal reg) (CmmReg (CmmGlobal r))
-     ci (reg, StackParam off) =
-          CmmAssign (CmmLocal reg) (CmmLoad (CmmStackSlot area off) ty)
-          where ty = localRegType reg
+    -- See Note [Width of parameters]
+    ci (reg, RegisterParam r@(VanillaReg {})) =
+        let local = CmmLocal reg
+            global = CmmReg (CmmGlobal r)
+            width = cmmRegWidth dflags local
+            expr
+                | width == wordWidth dflags = global
+                | width < wordWidth dflags =
+                    CmmMachOp (MO_XX_Conv (wordWidth dflags) width) [global]
+                | otherwise = panic "Parameter width greater than word width"
 
-     init_offset = widthInBytes (wordWidth dflags) -- infotable
+        in CmmAssign local expr
 
-     (stk_off, stk_args) = assignStack dflags init_offset localRegType extra_stk
+    -- Non VanillaRegs
+    ci (reg, RegisterParam r) =
+        CmmAssign (CmmLocal reg) (CmmReg (CmmGlobal r))
 
-     (stk_size, args) = assignArgumentsPos dflags stk_off conv
-                                           localRegType formals
+    ci (reg, StackParam off) =
+         CmmAssign (CmmLocal reg) (CmmLoad (CmmStackSlot area off) ty)
+         where ty = localRegType reg
+
+    init_offset = widthInBytes (wordWidth dflags) -- infotable
+
+    (stk_off, stk_args) = assignStack dflags init_offset localRegType extra_stk
+
+    (stk_size, args) = assignArgumentsPos dflags stk_off conv
+                                          localRegType formals
 
 -- Factoring out the common parts of the copyout functions yielded something
 -- more complicated:
@@ -346,8 +362,21 @@ copyOutOflow dflags conv transfer area actuals updfr_off extra_stack_stuff
   where
     (regs, graph) = foldr co ([], mkNop) (setRA ++ args ++ stack_params)
 
-    co (v, RegisterParam r) (rs, ms)
-       = (r:rs, mkAssign (CmmGlobal r) v <*> ms)
+    -- See Note [Width of parameters]
+    co (v, RegisterParam r@(VanillaReg {})) (rs, ms) =
+        let width = cmmExprWidth dflags v
+            value
+                | width == wordWidth dflags = v
+                | width < wordWidth dflags =
+                    CmmMachOp (MO_XX_Conv width (wordWidth dflags)) [v]
+                | otherwise = panic "Parameter width greater than word width"
+
+        in (r:rs, mkAssign (CmmGlobal r) value <*> ms)
+
+    -- Non VanillaRegs
+    co (v, RegisterParam r) (rs, ms) =
+        (r:rs, mkAssign (CmmGlobal r) v <*> ms)
+
     co (v, StackParam off)  (rs, ms)
        = (rs, mkStore (CmmStackSlot area off) v <*> ms)
 
@@ -374,6 +403,28 @@ copyOutOflow dflags conv transfer area actuals updfr_off extra_stack_stuff
                                           (cmmExprType dflags) actuals
 
 
+-- Note [Width of parameters]
+--
+-- Consider passing a small (< word width) primitive like Int8# to a function
+-- through a register. It's actually non-trivial to do this without
+-- extending/narrowing:
+-- * Global registers are considered to have native word width (i.e., 64-bits on
+--   x86-64), so CmmLint would complain if we assigne an 8-bit parameter to a
+--   global register.
+-- * Same problem exists with LLVM IR.
+-- * Lowering gets harder since on x86-32 not every register exposes its lower
+--   8 bits (e.g., for %eax we can use %al, but there isn't a corresponding
+--   8-bit register for %edi). So we would either need to extend/narrow anyway,
+--   or complicate the calling convention.
+-- So instead, we always extend every parameter smaller than native word width
+-- in copyOutOflow and then truncate it back to the expected width in copyIn.
+-- Note that we do this in cmm using MO_XX_Conv to avoid requiring
+-- zero-/sign-extending - it's up to a backend to handle this in a most
+-- efficient way (e.g., a simple register move)
+--
+-- There was some discussion about this on this PR:
+-- https://github.com/ghc-proposals/ghc-proposals/pull/74
+
 
 mkCallEntry :: DynFlags -> Convention -> [CmmFormal] -> [CmmFormal]
             -> (Int, [GlobalReg], CmmAGraph)
diff --git a/compiler/cmm/PprC.hs b/compiler/cmm/PprC.hs
index a979d495011af7ebe528951e10a6a4d723f337b9..17fef7fc9707d875d40fbf8118140aa1fd3f5e40 100644
--- a/compiler/cmm/PprC.hs
+++ b/compiler/cmm/PprC.hs
@@ -646,6 +646,9 @@ pprMachOp_for_C mop = case mop of
         MO_SS_Conv from to | from == to -> empty
         MO_SS_Conv _from to -> parens (machRep_S_CType to)
 
+        MO_XX_Conv from to | from == to -> empty
+        MO_XX_Conv _from to -> parens (machRep_U_CType to)
+
         MO_FF_Conv from to | from == to -> empty
         MO_FF_Conv _from to -> parens (machRep_F_CType to)
 
diff --git a/compiler/codeGen/StgCmmArgRep.hs b/compiler/codeGen/StgCmmArgRep.hs
index 2ea04079d0e85c066b0e5a3ecf07d1eba0341bfc..95f96dc16faaff36f0e426ecd3b02bc4e8d631a8 100644
--- a/compiler/codeGen/StgCmmArgRep.hs
+++ b/compiler/codeGen/StgCmmArgRep.hs
@@ -70,6 +70,8 @@ toArgRep LiftedRep         = P
 toArgRep UnliftedRep       = P
 toArgRep IntRep            = N
 toArgRep WordRep           = N
+toArgRep Int8Rep           = N  -- Gets widened to native word width for calls
+toArgRep Word8Rep          = N  -- Gets widened to native word width for calls
 toArgRep AddrRep           = N
 toArgRep Int64Rep          = L
 toArgRep Word64Rep         = L
diff --git a/compiler/codeGen/StgCmmPrim.hs b/compiler/codeGen/StgCmmPrim.hs
index c90264f14fccd46ee62e7966fe01776e83a813ec..9da472e768d3cf708826c0163d44469e7225e007 100644
--- a/compiler/codeGen/StgCmmPrim.hs
+++ b/compiler/codeGen/StgCmmPrim.hs
@@ -875,19 +875,29 @@ type GenericOp = [CmmFormal] -> [CmmActual] -> FCode ()
 callishPrimOpSupported :: DynFlags -> PrimOp -> Either CallishMachOp GenericOp
 callishPrimOpSupported dflags op
   = case op of
-      IntQuotRemOp   | ncg && (x86ish
-                              || ppc) -> Left (MO_S_QuotRem  (wordWidth dflags))
-                     | otherwise      -> Right (genericIntQuotRemOp dflags)
+      IntQuotRemOp   | ncg && (x86ish || ppc) ->
+                         Left (MO_S_QuotRem  (wordWidth dflags))
+                     | otherwise              ->
+                         Right (genericIntQuotRemOp (wordWidth dflags))
 
-      WordQuotRemOp  | ncg && (x86ish
-                              || ppc) -> Left (MO_U_QuotRem  (wordWidth dflags))
-                     | otherwise      -> Right (genericWordQuotRemOp dflags)
+      Int8QuotRemOp  | (ncg && x86ish)
+                        || llvm      -> Left (MO_S_QuotRem W8)
+                     | otherwise     -> Right (genericIntQuotRemOp W8)
+
+      WordQuotRemOp  | ncg && (x86ish || ppc) ->
+                         Left (MO_U_QuotRem  (wordWidth dflags))
+                     | otherwise      ->
+                         Right (genericWordQuotRemOp (wordWidth dflags))
 
       WordQuotRem2Op | (ncg && (x86ish
                                 || ppc))
                           || llvm     -> Left (MO_U_QuotRem2 (wordWidth dflags))
                      | otherwise      -> Right (genericWordQuotRem2Op dflags)
 
+      Word8QuotRemOp | (ncg && x86ish)
+                        || llvm      -> Left (MO_U_QuotRem W8)
+                     | otherwise     -> Right (genericWordQuotRemOp W8)
+
       WordAdd2Op     | (ncg && (x86ish
                                 || ppc))
                          || llvm      -> Left (MO_Add2       (wordWidth dflags))
@@ -943,20 +953,20 @@ callishPrimOpSupported dflags op
           ArchPPC_64 _ -> True
           _            -> False
 
-genericIntQuotRemOp :: DynFlags -> GenericOp
-genericIntQuotRemOp dflags [res_q, res_r] [arg_x, arg_y]
+genericIntQuotRemOp :: Width -> GenericOp
+genericIntQuotRemOp width [res_q, res_r] [arg_x, arg_y]
    = emit $ mkAssign (CmmLocal res_q)
-              (CmmMachOp (MO_S_Quot (wordWidth dflags)) [arg_x, arg_y]) <*>
+              (CmmMachOp (MO_S_Quot width) [arg_x, arg_y]) <*>
             mkAssign (CmmLocal res_r)
-              (CmmMachOp (MO_S_Rem  (wordWidth dflags)) [arg_x, arg_y])
+              (CmmMachOp (MO_S_Rem  width) [arg_x, arg_y])
 genericIntQuotRemOp _ _ _ = panic "genericIntQuotRemOp"
 
-genericWordQuotRemOp :: DynFlags -> GenericOp
-genericWordQuotRemOp dflags [res_q, res_r] [arg_x, arg_y]
+genericWordQuotRemOp :: Width -> GenericOp
+genericWordQuotRemOp width [res_q, res_r] [arg_x, arg_y]
     = emit $ mkAssign (CmmLocal res_q)
-               (CmmMachOp (MO_U_Quot (wordWidth dflags)) [arg_x, arg_y]) <*>
+               (CmmMachOp (MO_U_Quot width) [arg_x, arg_y]) <*>
              mkAssign (CmmLocal res_r)
-               (CmmMachOp (MO_U_Rem  (wordWidth dflags)) [arg_x, arg_y])
+               (CmmMachOp (MO_U_Rem  width) [arg_x, arg_y])
 genericWordQuotRemOp _ _ _ = panic "genericWordQuotRemOp"
 
 genericWordQuotRem2Op :: DynFlags -> GenericOp
@@ -1310,6 +1320,42 @@ translateOp dflags AddrLeOp       = Just (mo_wordULe dflags)
 translateOp dflags AddrGtOp       = Just (mo_wordUGt dflags)
 translateOp dflags AddrLtOp       = Just (mo_wordULt dflags)
 
+-- Int8# signed ops
+
+translateOp dflags Int8Extend     = Just (MO_SS_Conv W8 (wordWidth dflags))
+translateOp dflags Int8Narrow     = Just (MO_SS_Conv (wordWidth dflags) W8)
+translateOp _      Int8NegOp      = Just (MO_S_Neg W8)
+translateOp _      Int8AddOp      = Just (MO_Add W8)
+translateOp _      Int8SubOp      = Just (MO_Sub W8)
+translateOp _      Int8MulOp      = Just (MO_Mul W8)
+translateOp _      Int8QuotOp     = Just (MO_S_Quot W8)
+translateOp _      Int8RemOp      = Just (MO_S_Rem W8)
+
+translateOp _      Int8EqOp       = Just (MO_Eq W8)
+translateOp _      Int8GeOp       = Just (MO_S_Ge W8)
+translateOp _      Int8GtOp       = Just (MO_S_Gt W8)
+translateOp _      Int8LeOp       = Just (MO_S_Le W8)
+translateOp _      Int8LtOp       = Just (MO_S_Lt W8)
+translateOp _      Int8NeOp       = Just (MO_Ne W8)
+
+-- Word8# unsigned ops
+
+translateOp dflags Word8Extend     = Just (MO_UU_Conv W8 (wordWidth dflags))
+translateOp dflags Word8Narrow     = Just (MO_UU_Conv (wordWidth dflags) W8)
+translateOp _      Word8NotOp      = Just (MO_Not W8)
+translateOp _      Word8AddOp      = Just (MO_Add W8)
+translateOp _      Word8SubOp      = Just (MO_Sub W8)
+translateOp _      Word8MulOp      = Just (MO_Mul W8)
+translateOp _      Word8QuotOp     = Just (MO_U_Quot W8)
+translateOp _      Word8RemOp      = Just (MO_U_Rem W8)
+
+translateOp _      Word8EqOp       = Just (MO_Eq W8)
+translateOp _      Word8GeOp       = Just (MO_U_Ge W8)
+translateOp _      Word8GtOp       = Just (MO_U_Gt W8)
+translateOp _      Word8LeOp       = Just (MO_U_Le W8)
+translateOp _      Word8LtOp       = Just (MO_U_Lt W8)
+translateOp _      Word8NeOp       = Just (MO_Ne W8)
+
 -- Char# ops
 
 translateOp dflags CharEqOp       = Just (MO_Eq (wordWidth dflags))
diff --git a/compiler/ghci/ByteCodeGen.hs b/compiler/ghci/ByteCodeGen.hs
index 9aaaa7db6442d620ec9165d57069a4d23c5eaa84..e72325891048a884ba7c08c4150ca27dac8413d5 100644
--- a/compiler/ghci/ByteCodeGen.hs
+++ b/compiler/ghci/ByteCodeGen.hs
@@ -805,7 +805,7 @@ mkConAppCode orig_d _ p con args_r_to_l =
 
             do_pushery !d (arg : args) = do
                 (push, arg_bytes) <- case arg of
-                    (Padding l _) -> pushPadding l
+                    (Padding l _) -> return $! pushPadding l
                     (FieldOff a _) -> pushConstrAtom d p (fromNonVoid a)
                 more_push_code <- do_pushery (d + arg_bytes) args
                 return (push `appOL` more_push_code)
@@ -1570,11 +1570,16 @@ pushConstrAtom d p (AnnVar v)
 
 pushConstrAtom d p expr = pushAtom d p expr
 
-pushPadding :: Int -> BcM (BCInstrList, ByteOff)
-pushPadding 1 = return (unitOL (PUSH_PAD8), 1)
-pushPadding 2 = return (unitOL (PUSH_PAD16), 2)
-pushPadding 4 = return (unitOL (PUSH_PAD32), 4)
-pushPadding x = panic $ "pushPadding x=" ++ show x
+pushPadding :: Int -> (BCInstrList, ByteOff)
+pushPadding !n = go n (nilOL, 0)
+  where
+    go n acc@(!instrs, !off) = case n of
+        0 -> acc
+        1 -> (instrs `mappend` unitOL PUSH_PAD8, off + 1)
+        2 -> (instrs `mappend` unitOL PUSH_PAD16, off + 2)
+        3 -> go 1 (go 2 acc)
+        4 -> (instrs `mappend` unitOL PUSH_PAD32, off + 4)
+        _ -> go (n - 4) (go 4 acc)
 
 -- -----------------------------------------------------------------------------
 -- Given a bunch of alts code and their discrs, do the donkey work
diff --git a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
index 21abc65e5eec2c4f7901dacd87fb9ea387efc760..efc870993c00973284ba25f6b9281b3e35a0defa 100644
--- a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
+++ b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
@@ -1193,6 +1193,9 @@ genMachOp _ op [x] = case op of
     MO_UU_Conv from to
         -> sameConv from (widthToLlvmInt to) LM_Trunc LM_Zext
 
+    MO_XX_Conv from to
+        -> sameConv from (widthToLlvmInt to) LM_Bitcast LM_Bitcast
+
     MO_FF_Conv from to
         -> sameConv from (widthToLlvmFloat to) LM_Fptrunc LM_Fpext
 
@@ -1454,6 +1457,7 @@ genMachOp_slow opt op [x, y] = case op of
     MO_FS_Conv _ _ -> panicOp
     MO_SS_Conv _ _ -> panicOp
     MO_UU_Conv _ _ -> panicOp
+    MO_XX_Conv _ _ -> panicOp
     MO_FF_Conv _ _ -> panicOp
 
     MO_V_Insert  {} -> panicOp
diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs
index a2e26bd68b374f8c8b770afdafc4e98e095765a2..38dc76090d369b03989f6d945aa31963c7083b5c 100644
--- a/compiler/nativeGen/X86/CodeGen.hs
+++ b/compiler/nativeGen/X86/CodeGen.hs
@@ -644,20 +644,27 @@ getRegister' dflags is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
       -- Nop conversions
       MO_UU_Conv W32 W8  -> toI8Reg  W32 x
       MO_SS_Conv W32 W8  -> toI8Reg  W32 x
+      MO_XX_Conv W32 W8  -> toI8Reg  W32 x
       MO_UU_Conv W16 W8  -> toI8Reg  W16 x
       MO_SS_Conv W16 W8  -> toI8Reg  W16 x
+      MO_XX_Conv W16 W8  -> toI8Reg  W16 x
       MO_UU_Conv W32 W16 -> toI16Reg W32 x
       MO_SS_Conv W32 W16 -> toI16Reg W32 x
+      MO_XX_Conv W32 W16 -> toI16Reg W32 x
 
       MO_UU_Conv W64 W32 | not is32Bit -> conversionNop II64 x
       MO_SS_Conv W64 W32 | not is32Bit -> conversionNop II64 x
+      MO_XX_Conv W64 W32 | not is32Bit -> conversionNop II64 x
       MO_UU_Conv W64 W16 | not is32Bit -> toI16Reg W64 x
       MO_SS_Conv W64 W16 | not is32Bit -> toI16Reg W64 x
+      MO_XX_Conv W64 W16 | not is32Bit -> toI16Reg W64 x
       MO_UU_Conv W64 W8  | not is32Bit -> toI8Reg  W64 x
       MO_SS_Conv W64 W8  | not is32Bit -> toI8Reg  W64 x
+      MO_XX_Conv W64 W8  | not is32Bit -> toI8Reg  W64 x
 
       MO_UU_Conv rep1 rep2 | rep1 == rep2 -> conversionNop (intFormat rep1) x
       MO_SS_Conv rep1 rep2 | rep1 == rep2 -> conversionNop (intFormat rep1) x
+      MO_XX_Conv rep1 rep2 | rep1 == rep2 -> conversionNop (intFormat rep1) x
 
       -- widenings
       MO_UU_Conv W8  W32 -> integerExtend W8  W32 MOVZxL x
@@ -668,16 +675,32 @@ getRegister' dflags is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
       MO_SS_Conv W16 W32 -> integerExtend W16 W32 MOVSxL x
       MO_SS_Conv W8  W16 -> integerExtend W8  W16 MOVSxL x
 
+      -- We don't care about the upper bits for MO_XX_Conv, so MOV is enough. However, on 32-bit we
+      -- have 8-bit registers only for a few registers (as opposed to x86-64 where every register
+      -- has 8-bit version). So for 32-bit code, we'll just zero-extend.
+      MO_XX_Conv W8  W32
+          | is32Bit   -> integerExtend W8 W32 MOVZxL x
+          | otherwise -> integerExtend W8 W32 MOV x
+      MO_XX_Conv W8  W16
+          | is32Bit   -> integerExtend W8 W16 MOVZxL x
+          | otherwise -> integerExtend W8 W16 MOV x
+      MO_XX_Conv W16 W32 -> integerExtend W16 W32 MOV x
+
       MO_UU_Conv W8  W64 | not is32Bit -> integerExtend W8  W64 MOVZxL x
       MO_UU_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOVZxL x
       MO_UU_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOVZxL x
       MO_SS_Conv W8  W64 | not is32Bit -> integerExtend W8  W64 MOVSxL x
       MO_SS_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOVSxL x
       MO_SS_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOVSxL x
-        -- for 32-to-64 bit zero extension, amd64 uses an ordinary movl.
-        -- However, we don't want the register allocator to throw it
-        -- away as an unnecessary reg-to-reg move, so we keep it in
-        -- the form of a movzl and print it as a movl later.
+      -- For 32-to-64 bit zero extension, amd64 uses an ordinary movl.
+      -- However, we don't want the register allocator to throw it
+      -- away as an unnecessary reg-to-reg move, so we keep it in
+      -- the form of a movzl and print it as a movl later.
+      -- This doesn't apply to MO_XX_Conv since in this case we don't care about
+      -- the upper bits. So we can just use MOV.
+      MO_XX_Conv W8  W64 | not is32Bit -> integerExtend W8  W64 MOV x
+      MO_XX_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOV x
+      MO_XX_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOV x
 
       MO_FF_Conv W32 W64
         | sse2      -> coerceFP2FP W64 x
@@ -787,6 +810,7 @@ getRegister' _ is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
 
       MO_S_MulMayOflo rep -> imulMayOflo rep x y
 
+      MO_Mul W8  -> imulW8 x y
       MO_Mul rep -> triv_op rep IMUL
       MO_And rep -> triv_op rep AND
       MO_Or  rep -> triv_op rep OR
@@ -822,6 +846,21 @@ getRegister' _ is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
     triv_op width instr = trivialCode width op (Just op) x y
                         where op   = instr (intFormat width)
 
+    -- Special case for IMUL for bytes, since the result of IMULB will be in
+    -- %ax, the split to %dx/%edx/%rdx and %ax/%eax/%rax happens only for wider
+    -- values.
+    imulW8 :: CmmExpr -> CmmExpr -> NatM Register
+    imulW8 arg_a arg_b = do
+        (a_reg, a_code) <- getNonClobberedReg arg_a
+        b_code <- getAnyReg arg_b
+
+        let code = a_code `appOL` b_code eax `appOL`
+                   toOL [ IMUL2 format (OpReg a_reg) ]
+            format = intFormat W8
+
+        return (Fixed format eax code)
+
+
     imulMayOflo :: Width -> CmmExpr -> CmmExpr -> NatM Register
     imulMayOflo rep a b = do
          (a_reg, a_code) <- getNonClobberedReg a
@@ -916,6 +955,18 @@ getRegister' _ is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
         return (Any format code)
 
     ----------------------
+
+    -- See Note [DIV/IDIV for bytes]
+    div_code W8 signed quotient x y = do
+        let widen | signed    = MO_SS_Conv W8 W16
+                  | otherwise = MO_UU_Conv W8 W16
+        div_code
+            W16
+            signed
+            quotient
+            (CmmMachOp widen [x])
+            (CmmMachOp widen [y])
+
     div_code width signed quotient x y = do
            (y_op, y_code) <- getRegOrMem y -- cannot be clobbered
            x_code <- getAnyReg x
@@ -2277,6 +2328,18 @@ genCCall _ is32Bit target dest_regs args = do
             = divOp platform signed width results (Just arg_x_high) arg_x_low arg_y
         divOp2 _ _ _ _ _
             = panic "genCCall: Wrong number of arguments for divOp2"
+
+        -- See Note [DIV/IDIV for bytes]
+        divOp platform signed W8 [res_q, res_r] m_arg_x_high arg_x_low arg_y =
+            let widen | signed = MO_SS_Conv W8 W16
+                      | otherwise = MO_UU_Conv W8 W16
+                arg_x_low_16 = CmmMachOp widen [arg_x_low]
+                arg_y_16 = CmmMachOp widen [arg_y]
+                m_arg_x_high_16 = (\p -> CmmMachOp widen [p]) <$> m_arg_x_high
+            in divOp
+                  platform signed W16 [res_q, res_r]
+                  m_arg_x_high_16 arg_x_low_16 arg_y_16
+
         divOp platform signed width [res_q, res_r]
               m_arg_x_high arg_x_low arg_y
             = do let format = intFormat width
@@ -2318,6 +2381,22 @@ genCCall _ is32Bit target dest_regs args = do
         addSubIntC _ _ _ _ _ _ _ _
             = panic "genCCall: Wrong number of arguments/results for addSubIntC"
 
+-- Note [DIV/IDIV for bytes]
+--
+-- IDIV reminder:
+--   Size    Dividend   Divisor   Quotient    Remainder
+--   byte    %ax         r/m8      %al          %ah
+--   word    %dx:%ax     r/m16     %ax          %dx
+--   dword   %edx:%eax   r/m32     %eax         %edx
+--   qword   %rdx:%rax   r/m64     %rax         %rdx
+--
+-- We do a special case for the byte division because the current
+-- codegen doesn't deal well with accessing %ah register (also,
+-- accessing %ah in 64-bit mode is complicated because it cannot be an
+-- operand of many instructions). So we just widen operands to 16 bits
+-- and get the results from %al, %dl. This is not optimal, but a few
+-- register moves are probably not a huge deal when doing division.
+
 genCCall32' :: DynFlags
             -> ForeignTarget            -- function to call
             -> [CmmFormal]        -- where to put the result
@@ -2330,7 +2409,7 @@ genCCall32' dflags target dest_regs args = do
             -- Align stack to 16n for calls, assuming a starting stack
             -- alignment of 16n - word_size on procedure entry. Which we
             -- maintiain. See Note [rts/StgCRun.c : Stack Alignment on X86]
-            sizes               = map (arg_size . cmmExprType dflags) (reverse args)
+            sizes               = map (arg_size_bytes . cmmExprType dflags) (reverse args)
             raw_arg_size        = sum sizes + wORD_SIZE dflags
             arg_pad_size        = (roundTo 16 $ raw_arg_size) - raw_arg_size
             tot_arg_size        = raw_arg_size + arg_pad_size - wORD_SIZE dflags
@@ -2421,8 +2500,9 @@ genCCall32' dflags target dest_regs args = do
                 assign_code dest_regs)
 
       where
-        arg_size :: CmmType -> Int  -- Width in bytes
-        arg_size ty = widthInBytes (typeWidth ty)
+        -- If the size is smaller than the word, we widen things (see maybePromoteCArg)
+        arg_size_bytes :: CmmType -> Int
+        arg_size_bytes ty = max (widthInBytes (typeWidth ty)) (widthInBytes (wordWidth dflags))
 
         roundTo a x | x `mod` a == 0 = x
                     | otherwise = x + a - (x `mod` a)
@@ -2461,6 +2541,10 @@ genCCall32' dflags target dest_regs args = do
                            )
 
           | otherwise = do
+            -- Arguments can be smaller than 32-bit, but we still use @PUSH
+            -- II32@ - the usual calling conventions expect integers to be
+            -- 4-byte aligned.
+            ASSERT((typeWidth arg_ty) <= W32) return ()
             (operand, code) <- getOperand arg
             delta <- getDeltaNat
             setDeltaNat (delta-size)
@@ -2470,7 +2554,7 @@ genCCall32' dflags target dest_regs args = do
 
           where
              arg_ty = cmmExprType dflags arg
-             size = arg_size arg_ty -- Byte size
+             size = arg_size_bytes arg_ty -- Byte size
 
 genCCall64' :: DynFlags
             -> ForeignTarget      -- function to call
@@ -2700,7 +2784,10 @@ genCCall64' dflags target dest_regs args = do
              push_args rest code'
 
            | otherwise = do
-             ASSERT(width == W64) return ()
+             -- Arguments can be smaller than 64-bit, but we still use @PUSH
+             -- II64@ - the usual calling conventions expect integers to be
+             -- 8-byte aligned.
+             ASSERT(width <= W64) return ()
              (arg_op, arg_code) <- getOperand arg
              delta <- getDeltaNat
              setDeltaNat (delta-arg_size)
diff --git a/compiler/nativeGen/X86/Instr.hs b/compiler/nativeGen/X86/Instr.hs
index c7000c9f4b46091987d1eb1b29018c5e9605b094..8cc61ed78936511b92f39589a4d4e57f02fc0710 100644
--- a/compiler/nativeGen/X86/Instr.hs
+++ b/compiler/nativeGen/X86/Instr.hs
@@ -383,7 +383,13 @@ x86_regUsageOfInstr platform instr
     SUB    _ src dst    -> usageRM src dst
     SBB    _ src dst    -> usageRM src dst
     IMUL   _ src dst    -> usageRM src dst
-    IMUL2  _ src       -> mkRU (eax:use_R src []) [eax,edx]
+
+    -- Result of IMULB will be in just in %ax
+    IMUL2  II8 src       -> mkRU (eax:use_R src []) [eax]
+    -- Result of IMUL for wider values, will be split between %dx/%edx/%rdx and
+    -- %ax/%eax/%rax.
+    IMUL2  _ src        -> mkRU (eax:use_R src []) [eax,edx]
+
     MUL    _ src dst    -> usageRM src dst
     MUL2   _ src        -> mkRU (eax:use_R src []) [eax,edx]
     DIV    _ op -> mkRU (eax:edx:use_R op []) [eax,edx]
diff --git a/compiler/nativeGen/X86/Ppr.hs b/compiler/nativeGen/X86/Ppr.hs
index 808d22a672c60534bfba8439988ae5dc2fede40d..acfae71b17e542f8880af1b5ccecff64e5b141ab 100644
--- a/compiler/nativeGen/X86/Ppr.hs
+++ b/compiler/nativeGen/X86/Ppr.hs
@@ -328,7 +328,7 @@ pprReg f r
       (case i of {
          0 -> sLit "%al";     1 -> sLit "%bl";
          2 -> sLit "%cl";     3 -> sLit "%dl";
-        _  -> sLit "very naughty I386 byte register"
+        _  -> sLit $ "very naughty I386 byte register: " ++ show i
       })
 
     ppr32_reg_word i = ptext
@@ -365,7 +365,7 @@ pprReg f r
         10 -> sLit "%r10b";   11 -> sLit "%r11b";
         12 -> sLit "%r12b";   13 -> sLit "%r13b";
         14 -> sLit "%r14b";   15 -> sLit "%r15b";
-        _  -> sLit "very naughty x86_64 byte register"
+        _  -> sLit $ "very naughty x86_64 byte register: " ++ show i
       })
 
     ppr64_reg_word i = ptext
@@ -790,8 +790,11 @@ pprInstr (POP format op) = pprFormatOp (sLit "pop") format op
 -- pprInstr POPA = text "\tpopal"
 
 pprInstr NOP = text "\tnop"
+pprInstr (CLTD II8) = text "\tcbtw"
+pprInstr (CLTD II16) = text "\tcwtd"
 pprInstr (CLTD II32) = text "\tcltd"
 pprInstr (CLTD II64) = text "\tcqto"
+pprInstr (CLTD x) = panic $ "pprInstr: " ++ show x
 
 pprInstr (SETCC cond op) = pprCondInstr (sLit "set") cond (pprOperand II8 op)
 
@@ -1077,9 +1080,6 @@ pprInstr (XADD format src dst) = pprFormatOpOp (sLit "xadd") format src dst
 pprInstr (CMPXCHG format src dst)
    = pprFormatOpOp (sLit "cmpxchg") format src dst
 
-pprInstr _
-        = panic "X86.Ppr.pprInstr: no match"
-
 
 pprTrigOp :: String -> Bool -> CLabel -> CLabel
           -> Reg -> Reg -> Format -> SDoc
diff --git a/compiler/prelude/PrelNames.hs b/compiler/prelude/PrelNames.hs
index d69eaebdcb3214a4738e7dce81bcfde5b0b0c828..46d4484e4769d9ac2492cae1cce7ec42b532c66b 100644
--- a/compiler/prelude/PrelNames.hs
+++ b/compiler/prelude/PrelNames.hs
@@ -1682,7 +1682,7 @@ addrPrimTyConKey, arrayPrimTyConKey, arrayArrayPrimTyConKey, boolTyConKey,
     byteArrayPrimTyConKey, charPrimTyConKey, charTyConKey, doublePrimTyConKey,
     doubleTyConKey, floatPrimTyConKey, floatTyConKey, funTyConKey,
     intPrimTyConKey, intTyConKey, int8TyConKey, int16TyConKey,
-    int32PrimTyConKey, int32TyConKey, int64PrimTyConKey, int64TyConKey,
+    int8PrimTyConKey, int32PrimTyConKey, int32TyConKey, int64PrimTyConKey, int64TyConKey,
     integerTyConKey, naturalTyConKey,
     listTyConKey, foreignObjPrimTyConKey, maybeTyConKey,
     weakPrimTyConKey, mutableArrayPrimTyConKey, mutableArrayArrayPrimTyConKey,
@@ -1703,37 +1703,39 @@ floatTyConKey                           = mkPreludeTyConUnique 12
 funTyConKey                             = mkPreludeTyConUnique 13
 intPrimTyConKey                         = mkPreludeTyConUnique 14
 intTyConKey                             = mkPreludeTyConUnique 15
-int8TyConKey                            = mkPreludeTyConUnique 16
-int16TyConKey                           = mkPreludeTyConUnique 17
-int32PrimTyConKey                       = mkPreludeTyConUnique 18
-int32TyConKey                           = mkPreludeTyConUnique 19
-int64PrimTyConKey                       = mkPreludeTyConUnique 20
-int64TyConKey                           = mkPreludeTyConUnique 21
-integerTyConKey                         = mkPreludeTyConUnique 22
-naturalTyConKey                         = mkPreludeTyConUnique 23
-
-listTyConKey                            = mkPreludeTyConUnique 24
-foreignObjPrimTyConKey                  = mkPreludeTyConUnique 25
-maybeTyConKey                           = mkPreludeTyConUnique 26
-weakPrimTyConKey                        = mkPreludeTyConUnique 27
-mutableArrayPrimTyConKey                = mkPreludeTyConUnique 28
-mutableByteArrayPrimTyConKey            = mkPreludeTyConUnique 29
-orderingTyConKey                        = mkPreludeTyConUnique 30
-mVarPrimTyConKey                        = mkPreludeTyConUnique 31
-ratioTyConKey                           = mkPreludeTyConUnique 32
-rationalTyConKey                        = mkPreludeTyConUnique 33
-realWorldTyConKey                       = mkPreludeTyConUnique 34
-stablePtrPrimTyConKey                   = mkPreludeTyConUnique 35
-stablePtrTyConKey                       = mkPreludeTyConUnique 36
-eqTyConKey                              = mkPreludeTyConUnique 38
-heqTyConKey                             = mkPreludeTyConUnique 39
-arrayArrayPrimTyConKey                  = mkPreludeTyConUnique 40
-mutableArrayArrayPrimTyConKey           = mkPreludeTyConUnique 41
+int8PrimTyConKey                        = mkPreludeTyConUnique 16
+int8TyConKey                            = mkPreludeTyConUnique 17
+int16TyConKey                           = mkPreludeTyConUnique 18
+int32PrimTyConKey                       = mkPreludeTyConUnique 19
+int32TyConKey                           = mkPreludeTyConUnique 20
+int64PrimTyConKey                       = mkPreludeTyConUnique 21
+int64TyConKey                           = mkPreludeTyConUnique 22
+integerTyConKey                         = mkPreludeTyConUnique 23
+naturalTyConKey                         = mkPreludeTyConUnique 24
+
+listTyConKey                            = mkPreludeTyConUnique 25
+foreignObjPrimTyConKey                  = mkPreludeTyConUnique 26
+maybeTyConKey                           = mkPreludeTyConUnique 27
+weakPrimTyConKey                        = mkPreludeTyConUnique 28
+mutableArrayPrimTyConKey                = mkPreludeTyConUnique 29
+mutableByteArrayPrimTyConKey            = mkPreludeTyConUnique 30
+orderingTyConKey                        = mkPreludeTyConUnique 31
+mVarPrimTyConKey                        = mkPreludeTyConUnique 32
+ratioTyConKey                           = mkPreludeTyConUnique 33
+rationalTyConKey                        = mkPreludeTyConUnique 34
+realWorldTyConKey                       = mkPreludeTyConUnique 35
+stablePtrPrimTyConKey                   = mkPreludeTyConUnique 36
+stablePtrTyConKey                       = mkPreludeTyConUnique 37
+eqTyConKey                              = mkPreludeTyConUnique 39
+heqTyConKey                             = mkPreludeTyConUnique 40
+arrayArrayPrimTyConKey                  = mkPreludeTyConUnique 41
+mutableArrayArrayPrimTyConKey           = mkPreludeTyConUnique 42
 
 statePrimTyConKey, stableNamePrimTyConKey, stableNameTyConKey,
     mutVarPrimTyConKey, ioTyConKey,
-    wordPrimTyConKey, wordTyConKey, word8TyConKey, word16TyConKey,
-    word32PrimTyConKey, word32TyConKey, word64PrimTyConKey, word64TyConKey,
+    wordPrimTyConKey, wordTyConKey, word8PrimTyConKey, word8TyConKey,
+    word16TyConKey, word32PrimTyConKey, word32TyConKey,
+    word64PrimTyConKey, word64TyConKey,
     liftedConKey, unliftedConKey, anyBoxConKey, kindConKey, boxityConKey,
     typeConKey, threadIdPrimTyConKey, bcoPrimTyConKey, ptrTyConKey,
     funPtrTyConKey, tVarPrimTyConKey, eqPrimTyConKey,
@@ -1750,24 +1752,25 @@ ioTyConKey                              = mkPreludeTyConUnique 57
 voidPrimTyConKey                        = mkPreludeTyConUnique 58
 wordPrimTyConKey                        = mkPreludeTyConUnique 59
 wordTyConKey                            = mkPreludeTyConUnique 60
-word8TyConKey                           = mkPreludeTyConUnique 61
-word16TyConKey                          = mkPreludeTyConUnique 62
-word32PrimTyConKey                      = mkPreludeTyConUnique 63
-word32TyConKey                          = mkPreludeTyConUnique 64
-word64PrimTyConKey                      = mkPreludeTyConUnique 65
-word64TyConKey                          = mkPreludeTyConUnique 66
-liftedConKey                            = mkPreludeTyConUnique 67
-unliftedConKey                          = mkPreludeTyConUnique 68
-anyBoxConKey                            = mkPreludeTyConUnique 69
-kindConKey                              = mkPreludeTyConUnique 70
-boxityConKey                            = mkPreludeTyConUnique 71
-typeConKey                              = mkPreludeTyConUnique 72
-threadIdPrimTyConKey                    = mkPreludeTyConUnique 73
-bcoPrimTyConKey                         = mkPreludeTyConUnique 74
-ptrTyConKey                             = mkPreludeTyConUnique 75
-funPtrTyConKey                          = mkPreludeTyConUnique 76
-tVarPrimTyConKey                        = mkPreludeTyConUnique 77
-compactPrimTyConKey                     = mkPreludeTyConUnique 78
+word8PrimTyConKey                       = mkPreludeTyConUnique 61
+word8TyConKey                           = mkPreludeTyConUnique 62
+word16TyConKey                          = mkPreludeTyConUnique 63
+word32PrimTyConKey                      = mkPreludeTyConUnique 64
+word32TyConKey                          = mkPreludeTyConUnique 65
+word64PrimTyConKey                      = mkPreludeTyConUnique 66
+word64TyConKey                          = mkPreludeTyConUnique 67
+liftedConKey                            = mkPreludeTyConUnique 68
+unliftedConKey                          = mkPreludeTyConUnique 69
+anyBoxConKey                            = mkPreludeTyConUnique 70
+kindConKey                              = mkPreludeTyConUnique 71
+boxityConKey                            = mkPreludeTyConUnique 72
+typeConKey                              = mkPreludeTyConUnique 73
+threadIdPrimTyConKey                    = mkPreludeTyConUnique 74
+bcoPrimTyConKey                         = mkPreludeTyConUnique 75
+ptrTyConKey                             = mkPreludeTyConUnique 76
+funPtrTyConKey                          = mkPreludeTyConUnique 77
+tVarPrimTyConKey                        = mkPreludeTyConUnique 78
+compactPrimTyConKey                     = mkPreludeTyConUnique 79
 
 -- dotnet interop
 objectTyConKey :: Unique
@@ -2041,7 +2044,7 @@ sumRepDataConKey                        = mkPreludeDataConUnique 73
 runtimeRepSimpleDataConKeys, unliftedSimpleRepDataConKeys, unliftedRepDataConKeys :: [Unique]
 liftedRepDataConKey :: Unique
 runtimeRepSimpleDataConKeys@(liftedRepDataConKey : unliftedSimpleRepDataConKeys)
-  = map mkPreludeDataConUnique [74..82]
+  = map mkPreludeDataConUnique [74..84]
 
 unliftedRepDataConKeys = vecRepDataConKey :
                          tupleRepDataConKey :
@@ -2051,29 +2054,29 @@ unliftedRepDataConKeys = vecRepDataConKey :
 -- See Note [Wiring in RuntimeRep] in TysWiredIn
 -- VecCount
 vecCountDataConKeys :: [Unique]
-vecCountDataConKeys = map mkPreludeDataConUnique [83..88]
+vecCountDataConKeys = map mkPreludeDataConUnique [85..90]
 
 -- See Note [Wiring in RuntimeRep] in TysWiredIn
 -- VecElem
 vecElemDataConKeys :: [Unique]
-vecElemDataConKeys = map mkPreludeDataConUnique [89..98]
+vecElemDataConKeys = map mkPreludeDataConUnique [91..100]
 
 -- Typeable things
 kindRepTyConAppDataConKey, kindRepVarDataConKey, kindRepAppDataConKey,
     kindRepFunDataConKey, kindRepTYPEDataConKey,
     kindRepTypeLitSDataConKey, kindRepTypeLitDDataConKey
     :: Unique
-kindRepTyConAppDataConKey = mkPreludeDataConUnique 100
-kindRepVarDataConKey      = mkPreludeDataConUnique 101
-kindRepAppDataConKey      = mkPreludeDataConUnique 102
-kindRepFunDataConKey      = mkPreludeDataConUnique 103
-kindRepTYPEDataConKey     = mkPreludeDataConUnique 104
-kindRepTypeLitSDataConKey = mkPreludeDataConUnique 105
-kindRepTypeLitDDataConKey = mkPreludeDataConUnique 106
+kindRepTyConAppDataConKey = mkPreludeDataConUnique 101
+kindRepVarDataConKey      = mkPreludeDataConUnique 102
+kindRepAppDataConKey      = mkPreludeDataConUnique 103
+kindRepFunDataConKey      = mkPreludeDataConUnique 104
+kindRepTYPEDataConKey     = mkPreludeDataConUnique 105
+kindRepTypeLitSDataConKey = mkPreludeDataConUnique 106
+kindRepTypeLitDDataConKey = mkPreludeDataConUnique 107
 
 typeLitSymbolDataConKey, typeLitNatDataConKey :: Unique
-typeLitSymbolDataConKey   = mkPreludeDataConUnique 107
-typeLitNatDataConKey      = mkPreludeDataConUnique 108
+typeLitSymbolDataConKey   = mkPreludeDataConUnique 108
+typeLitNatDataConKey      = mkPreludeDataConUnique 109
 
 
 ---------------- Template Haskell -------------------
diff --git a/compiler/prelude/TysPrim.hs b/compiler/prelude/TysPrim.hs
index 339913b683b2d5553111ac57258abccf69f4f664..7d04788d51efdcb240442030c8096becea9a662e 100644
--- a/compiler/prelude/TysPrim.hs
+++ b/compiler/prelude/TysPrim.hs
@@ -66,6 +66,9 @@ module TysPrim(
         weakPrimTyCon,                  mkWeakPrimTy,
         threadIdPrimTyCon,              threadIdPrimTy,
 
+        int8PrimTyCon,          int8PrimTy,
+        word8PrimTyCon,         word8PrimTy,
+
         int32PrimTyCon,         int32PrimTy,
         word32PrimTyCon,        word32PrimTy,
 
@@ -87,8 +90,9 @@ import GhcPrelude
 import {-# SOURCE #-} TysWiredIn
   ( runtimeRepTy, unboxedTupleKind, liftedTypeKind
   , vecRepDataConTyCon, tupleRepDataConTyCon
-  , liftedRepDataConTy, unliftedRepDataConTy, intRepDataConTy
-  , wordRepDataConTy, int64RepDataConTy, word64RepDataConTy, addrRepDataConTy
+  , liftedRepDataConTy, unliftedRepDataConTy, intRepDataConTy, int8RepDataConTy
+  , wordRepDataConTy, int64RepDataConTy, word8RepDataConTy, word64RepDataConTy
+  , addrRepDataConTy
   , floatRepDataConTy, doubleRepDataConTy
   , vec2DataConTy, vec4DataConTy, vec8DataConTy, vec16DataConTy, vec32DataConTy
   , vec64DataConTy
@@ -145,6 +149,7 @@ exposedPrimTyCons
     , doublePrimTyCon
     , floatPrimTyCon
     , intPrimTyCon
+    , int8PrimTyCon
     , int32PrimTyCon
     , int64PrimTyCon
     , bcoPrimTyCon
@@ -165,6 +170,7 @@ exposedPrimTyCons
     , proxyPrimTyCon
     , threadIdPrimTyCon
     , wordPrimTyCon
+    , word8PrimTyCon
     , word32PrimTyCon
     , word64PrimTyCon
 
@@ -188,12 +194,14 @@ mkBuiltInPrimTc fs unique tycon
                   BuiltInSyntax
 
 
-charPrimTyConName, intPrimTyConName, int32PrimTyConName, int64PrimTyConName, wordPrimTyConName, word32PrimTyConName, word64PrimTyConName, addrPrimTyConName, floatPrimTyConName, doublePrimTyConName, statePrimTyConName, proxyPrimTyConName, realWorldTyConName, arrayPrimTyConName, arrayArrayPrimTyConName, smallArrayPrimTyConName, byteArrayPrimTyConName, mutableArrayPrimTyConName, mutableByteArrayPrimTyConName, mutableArrayArrayPrimTyConName, smallMutableArrayPrimTyConName, mutVarPrimTyConName, mVarPrimTyConName, tVarPrimTyConName, stablePtrPrimTyConName, stableNamePrimTyConName, compactPrimTyConName, bcoPrimTyConName, weakPrimTyConName, threadIdPrimTyConName, eqPrimTyConName, eqReprPrimTyConName, eqPhantPrimTyConName, voidPrimTyConName :: Name
+charPrimTyConName, intPrimTyConName, int8PrimTyConName, int32PrimTyConName, int64PrimTyConName, wordPrimTyConName, word32PrimTyConName, word8PrimTyConName, word64PrimTyConName, addrPrimTyConName, floatPrimTyConName, doublePrimTyConName, statePrimTyConName, proxyPrimTyConName, realWorldTyConName, arrayPrimTyConName, arrayArrayPrimTyConName, smallArrayPrimTyConName, byteArrayPrimTyConName, mutableArrayPrimTyConName, mutableByteArrayPrimTyConName, mutableArrayArrayPrimTyConName, smallMutableArrayPrimTyConName, mutVarPrimTyConName, mVarPrimTyConName, tVarPrimTyConName, stablePtrPrimTyConName, stableNamePrimTyConName, compactPrimTyConName, bcoPrimTyConName, weakPrimTyConName, threadIdPrimTyConName, eqPrimTyConName, eqReprPrimTyConName, eqPhantPrimTyConName, voidPrimTyConName :: Name
 charPrimTyConName             = mkPrimTc (fsLit "Char#") charPrimTyConKey charPrimTyCon
 intPrimTyConName              = mkPrimTc (fsLit "Int#") intPrimTyConKey  intPrimTyCon
+int8PrimTyConName             = mkPrimTc (fsLit "Int8#") int8PrimTyConKey int8PrimTyCon
 int32PrimTyConName            = mkPrimTc (fsLit "Int32#") int32PrimTyConKey int32PrimTyCon
 int64PrimTyConName            = mkPrimTc (fsLit "Int64#") int64PrimTyConKey int64PrimTyCon
 wordPrimTyConName             = mkPrimTc (fsLit "Word#") wordPrimTyConKey wordPrimTyCon
+word8PrimTyConName            = mkPrimTc (fsLit "Word8#") word8PrimTyConKey word8PrimTyCon
 word32PrimTyConName           = mkPrimTc (fsLit "Word32#") word32PrimTyConKey word32PrimTyCon
 word64PrimTyConName           = mkPrimTc (fsLit "Word64#") word64PrimTyConKey word64PrimTyCon
 addrPrimTyConName             = mkPrimTc (fsLit "Addr#") addrPrimTyConKey addrPrimTyCon
@@ -513,8 +521,10 @@ primRepToRuntimeRep rep = case rep of
   LiftedRep     -> liftedRepDataConTy
   UnliftedRep   -> unliftedRepDataConTy
   IntRep        -> intRepDataConTy
+  Int8Rep       -> int8RepDataConTy
   WordRep       -> wordRepDataConTy
   Int64Rep      -> int64RepDataConTy
+  Word8Rep      -> word8RepDataConTy
   Word64Rep     -> word64RepDataConTy
   AddrRep       -> addrRepDataConTy
   FloatRep      -> floatRepDataConTy
@@ -556,6 +566,11 @@ intPrimTy       = mkTyConTy intPrimTyCon
 intPrimTyCon :: TyCon
 intPrimTyCon    = pcPrimTyCon0 intPrimTyConName IntRep
 
+int8PrimTy :: Type
+int8PrimTy     = mkTyConTy int8PrimTyCon
+int8PrimTyCon :: TyCon
+int8PrimTyCon  = pcPrimTyCon0 int8PrimTyConName Int8Rep
+
 int32PrimTy :: Type
 int32PrimTy     = mkTyConTy int32PrimTyCon
 int32PrimTyCon :: TyCon
@@ -571,6 +586,11 @@ wordPrimTy      = mkTyConTy wordPrimTyCon
 wordPrimTyCon :: TyCon
 wordPrimTyCon   = pcPrimTyCon0 wordPrimTyConName WordRep
 
+word8PrimTy :: Type
+word8PrimTy     = mkTyConTy word8PrimTyCon
+word8PrimTyCon :: TyCon
+word8PrimTyCon  = pcPrimTyCon0 word8PrimTyConName Word8Rep
+
 word32PrimTy :: Type
 word32PrimTy    = mkTyConTy word32PrimTyCon
 word32PrimTyCon :: TyCon
diff --git a/compiler/prelude/TysWiredIn.hs b/compiler/prelude/TysWiredIn.hs
index 78a8d8c25f75c9f70d28cca1e4fdd57777dfd828..7ceeeffd46efcd00346df2edf06630da7ccb1971 100644
--- a/compiler/prelude/TysWiredIn.hs
+++ b/compiler/prelude/TysWiredIn.hs
@@ -107,8 +107,9 @@ module TysWiredIn (
 
         vecRepDataConTyCon, tupleRepDataConTyCon, sumRepDataConTyCon,
 
-        liftedRepDataConTy, unliftedRepDataConTy, intRepDataConTy,
-        wordRepDataConTy, int64RepDataConTy, word64RepDataConTy, addrRepDataConTy,
+        liftedRepDataConTy, unliftedRepDataConTy, intRepDataConTy, int8RepDataConTy,
+        wordRepDataConTy, int64RepDataConTy, word8RepDataConTy, word64RepDataConTy,
+        addrRepDataConTy,
         floatRepDataConTy, doubleRepDataConTy,
 
         vec2DataConTy, vec4DataConTy, vec8DataConTy, vec16DataConTy, vec32DataConTy,
@@ -414,10 +415,18 @@ sumRepDataConName = mkWiredInDataConName UserSyntax gHC_TYPES (fsLit "SumRep") s
 runtimeRepSimpleDataConNames :: [Name]
 runtimeRepSimpleDataConNames
   = zipWith3Lazy mk_special_dc_name
-      [ fsLit "LiftedRep", fsLit "UnliftedRep"
+      [ fsLit "LiftedRep"
+      , fsLit "UnliftedRep"
       , fsLit "IntRep"
-      , fsLit "WordRep", fsLit "Int64Rep", fsLit "Word64Rep"
-      , fsLit "AddrRep", fsLit "FloatRep", fsLit "DoubleRep" ]
+      , fsLit "WordRep"
+      , fsLit "Int8Rep"
+      , fsLit "Int64Rep"
+      , fsLit "Word8Rep"
+      , fsLit "Word64Rep"
+      , fsLit "AddrRep"
+      , fsLit "FloatRep"
+      , fsLit "DoubleRep"
+      ]
       runtimeRepSimpleDataConKeys
       runtimeRepSimpleDataCons
 
@@ -1170,8 +1179,8 @@ runtimeRepSimpleDataCons :: [DataCon]
 liftedRepDataCon :: DataCon
 runtimeRepSimpleDataCons@(liftedRepDataCon : _)
   = zipWithLazy mk_runtime_rep_dc
-    [ LiftedRep, UnliftedRep, IntRep, WordRep, Int64Rep
-    , Word64Rep, AddrRep, FloatRep, DoubleRep ]
+    [ LiftedRep, UnliftedRep, IntRep, WordRep, Int8Rep, Int64Rep
+    , Word8Rep, Word64Rep, AddrRep, FloatRep, DoubleRep ]
     runtimeRepSimpleDataConNames
   where
     mk_runtime_rep_dc primrep name
@@ -1179,11 +1188,13 @@ runtimeRepSimpleDataCons@(liftedRepDataCon : _)
 
 -- See Note [Wiring in RuntimeRep]
 liftedRepDataConTy, unliftedRepDataConTy,
-  intRepDataConTy, wordRepDataConTy, int64RepDataConTy,
-  word64RepDataConTy, addrRepDataConTy, floatRepDataConTy, doubleRepDataConTy :: Type
+  intRepDataConTy, int8RepDataConTy, wordRepDataConTy, int64RepDataConTy,
+  word8RepDataConTy, word64RepDataConTy, addrRepDataConTy,
+  floatRepDataConTy, doubleRepDataConTy :: Type
 [liftedRepDataConTy, unliftedRepDataConTy,
-   intRepDataConTy, wordRepDataConTy, int64RepDataConTy,
-   word64RepDataConTy, addrRepDataConTy, floatRepDataConTy, doubleRepDataConTy]
+   intRepDataConTy, wordRepDataConTy, int8RepDataConTy, int64RepDataConTy,
+   word8RepDataConTy, word64RepDataConTy,
+   addrRepDataConTy, floatRepDataConTy, doubleRepDataConTy]
   = map (mkTyConTy . promoteDataCon) runtimeRepSimpleDataCons
 
 vecCountTyCon :: TyCon
diff --git a/compiler/prelude/TysWiredIn.hs-boot b/compiler/prelude/TysWiredIn.hs-boot
index 8cc83d7ac778fb1d10651122e3697dd705a356bf..b853290da3ee7a16a7d946397662f01f831689fe 100644
--- a/compiler/prelude/TysWiredIn.hs-boot
+++ b/compiler/prelude/TysWiredIn.hs-boot
@@ -24,9 +24,9 @@ runtimeRepTy :: Type
 
 liftedRepDataConTyCon, vecRepDataConTyCon, tupleRepDataConTyCon :: TyCon
 
-liftedRepDataConTy, unliftedRepDataConTy, intRepDataConTy,
-  wordRepDataConTy, int64RepDataConTy, word64RepDataConTy, addrRepDataConTy,
-  floatRepDataConTy, doubleRepDataConTy :: Type
+liftedRepDataConTy, unliftedRepDataConTy, intRepDataConTy, int8RepDataConTy,
+  wordRepDataConTy, int64RepDataConTy, word8RepDataConTy, word64RepDataConTy,
+  addrRepDataConTy, floatRepDataConTy, doubleRepDataConTy :: Type
 
 vec2DataConTy, vec4DataConTy, vec8DataConTy, vec16DataConTy, vec32DataConTy,
   vec64DataConTy :: Type
diff --git a/compiler/prelude/primops.txt.pp b/compiler/prelude/primops.txt.pp
index 1d879c438ebd4e477bdc3530d279370e5581a300..8fceec0107790fdcac16d7d3c12e052b946062cf 100644
--- a/compiler/prelude/primops.txt.pp
+++ b/compiler/prelude/primops.txt.pp
@@ -343,6 +343,88 @@ primop   ISrlOp   "uncheckedIShiftRL#" GenPrimOp Int# -> Int# -> Int#
          {Shift right logical.  Result undefined if shift amount is not
           in the range 0 to word size - 1 inclusive.}
 
+------------------------------------------------------------------------
+section "Int8#"
+        {Operations on 8-bit integers.}
+------------------------------------------------------------------------
+
+primtype Int8#
+
+primop Int8Extend "extendInt8#" GenPrimOp Int8# -> Int#
+primop Int8Narrow "narrowInt8#" GenPrimOp Int# -> Int8#
+
+primop Int8NegOp "negateInt8#" Monadic Int8# -> Int8#
+
+primop Int8AddOp "plusInt8#" Dyadic Int8# -> Int8# -> Int8#
+  with
+    commutable = True
+
+primop Int8SubOp "subInt8#" Dyadic Int8# -> Int8# -> Int8#
+
+primop Int8MulOp "timesInt8#" Dyadic Int8# -> Int8# -> Int8#
+  with
+    commutable = True
+
+primop Int8QuotOp "quotInt8#" Dyadic Int8# -> Int8# -> Int8#
+  with
+    can_fail = True
+
+primop Int8RemOp "remInt8#" Dyadic Int8# -> Int8# -> Int8#
+  with
+    can_fail = True
+
+primop Int8QuotRemOp "quotRemInt8#" GenPrimOp Int8# -> Int8# -> (# Int8#, Int8# #)
+  with
+    can_fail = True
+
+primop Int8EqOp "eqInt8#" Compare Int8# -> Int8# -> Int#
+primop Int8GeOp "geInt8#" Compare Int8# -> Int8# -> Int#
+primop Int8GtOp "gtInt8#" Compare Int8# -> Int8# -> Int#
+primop Int8LeOp "leInt8#" Compare Int8# -> Int8# -> Int#
+primop Int8LtOp "ltInt8#" Compare Int8# -> Int8# -> Int#
+primop Int8NeOp "neInt8#" Compare Int8# -> Int8# -> Int#
+
+------------------------------------------------------------------------
+section "Word8#"
+        {Operations on 8-bit unsigned integers.}
+------------------------------------------------------------------------
+
+primtype Word8#
+
+primop Word8Extend "extendWord8#" GenPrimOp Word8# -> Word#
+primop Word8Narrow "narrowWord8#" GenPrimOp Word# -> Word8#
+
+primop Word8NotOp "notWord8#" Monadic Word8# -> Word8#
+
+primop Word8AddOp "plusWord8#" Dyadic Word8# -> Word8# -> Word8#
+  with
+    commutable = True
+
+primop Word8SubOp "subWord8#" Dyadic Word8# -> Word8# -> Word8#
+
+primop Word8MulOp "timesWord8#" Dyadic Word8# -> Word8# -> Word8#
+  with
+    commutable = True
+
+primop Word8QuotOp "quotWord8#" Dyadic Word8# -> Word8# -> Word8#
+  with
+    can_fail = True
+
+primop Word8RemOp "remWord8#" Dyadic Word8# -> Word8# -> Word8#
+  with
+    can_fail = True
+
+primop Word8QuotRemOp "quotRemWord8#" GenPrimOp Word8# -> Word8# -> (# Word8#, Word8# #)
+  with
+    can_fail = True
+
+primop Word8EqOp "eqWord8#" Compare Word8# -> Word8# -> Int#
+primop Word8GeOp "geWord8#" Compare Word8# -> Word8# -> Int#
+primop Word8GtOp "gtWord8#" Compare Word8# -> Word8# -> Int#
+primop Word8LeOp "leWord8#" Compare Word8# -> Word8# -> Int#
+primop Word8LtOp "ltWord8#" Compare Word8# -> Word8# -> Int#
+primop Word8NeOp "neWord8#" Compare Word8# -> Word8# -> Int#
+
 ------------------------------------------------------------------------
 section "Word#"
         {Operations on native-sized unsigned words (32+ bits).}
diff --git a/compiler/simplStg/RepType.hs b/compiler/simplStg/RepType.hs
index 694aa4ebf7b249828aaf79d10300595223dee510..a5b8ea67dbb1f7d726da0b42b0329be0caf1a60b 100644
--- a/compiler/simplStg/RepType.hs
+++ b/compiler/simplStg/RepType.hs
@@ -228,6 +228,9 @@ layoutUbxSum sum_slots0 arg_slots0 =
 --   - Float slots: Shared between floating point types.
 --
 --   - Void slots: Shared between void types. Not used in sums.
+--
+-- TODO(michalt): We should probably introduce `SlotTy`s for 8-/16-/32-bit
+-- values, so that we can pack things more tightly.
 data SlotTy = PtrSlot | WordSlot | Word64Slot | FloatSlot | DoubleSlot
   deriving (Eq, Ord)
     -- Constructor order is important! If slot A could fit into slot B
@@ -255,8 +258,10 @@ primRepSlot VoidRep     = pprPanic "primRepSlot" (text "No slot for VoidRep")
 primRepSlot LiftedRep   = PtrSlot
 primRepSlot UnliftedRep = PtrSlot
 primRepSlot IntRep      = WordSlot
-primRepSlot WordRep     = WordSlot
+primRepSlot Int8Rep     = WordSlot
 primRepSlot Int64Rep    = Word64Slot
+primRepSlot WordRep     = WordSlot
+primRepSlot Word8Rep    = WordSlot
 primRepSlot Word64Rep   = Word64Slot
 primRepSlot AddrRep     = WordSlot
 primRepSlot FloatRep    = FloatSlot
diff --git a/compiler/typecheck/TcGenDeriv.hs b/compiler/typecheck/TcGenDeriv.hs
index 84147c6b64d919a5543d73ec30533f301cc2ff90..f4a23851dc75a1b5cf5f44a2c1c385a05d4bd1ce 100644
--- a/compiler/typecheck/TcGenDeriv.hs
+++ b/compiler/typecheck/TcGenDeriv.hs
@@ -77,7 +77,7 @@ import FastString
 import Pair
 import Bag
 
-import Data.List  ( partition, intersperse )
+import Data.List  ( find, partition, intersperse )
 
 type BagDerivStuff = Bag DerivStuff
 
@@ -218,7 +218,7 @@ gen_Eq_binds loc tycon = do
           -- Using 'foldr1' here ensures that the derived code is correctly
           -- associated. See Trac #10859.
           where
-            nested_eq ty a b = nlHsPar (eq_Expr tycon ty (nlHsVar a) (nlHsVar b))
+            nested_eq ty a b = nlHsPar (eq_Expr ty (nlHsVar a) (nlHsVar b))
 
 {-
 ************************************************************************
@@ -456,7 +456,7 @@ gen_Ord_binds loc tycon = do
     -- Returns a case alternative  Ki b1 b2 ... bv -> compare (a1,a2,...) with (b1,b2,...)
     mkInnerEqAlt op data_con
       = mkHsCaseAlt (nlConVarPat data_con_RDR bs_needed) $
-        mkCompareFields tycon op (dataConOrigArgTys data_con)
+        mkCompareFields op (dataConOrigArgTys data_con)
       where
         data_con_RDR = getRdrName data_con
         bs_needed    = take (dataConSourceArity data_con) bs_RDRs
@@ -466,17 +466,17 @@ gen_Ord_binds loc tycon = do
     -- generates (case data2Tag a of a# -> case data2Tag b of b# -> a# `op` b#
     mkTagCmp dflags op =
       untag_Expr dflags tycon[(a_RDR, ah_RDR),(b_RDR, bh_RDR)] $
-        unliftedOrdOp tycon intPrimTy op ah_RDR bh_RDR
+        unliftedOrdOp intPrimTy op ah_RDR bh_RDR
 
-mkCompareFields :: TyCon -> OrdOp -> [Type] -> LHsExpr GhcPs
+mkCompareFields :: OrdOp -> [Type] -> LHsExpr GhcPs
 -- Generates nested comparisons for (a1,a2...) against (b1,b2,...)
 -- where the ai,bi have the given types
-mkCompareFields tycon op tys
+mkCompareFields op tys
   = go tys as_RDRs bs_RDRs
   where
     go []   _      _          = eqResult op
     go [ty] (a:_)  (b:_)
-      | isUnliftedType ty     = unliftedOrdOp tycon ty op a b
+      | isUnliftedType ty     = unliftedOrdOp ty op a b
       | otherwise             = genOpApp (nlHsVar a) (ordMethRdr op) (nlHsVar b)
     go (ty:tys) (a:as) (b:bs) = mk_compare ty a b
                                   (ltResult op)
@@ -498,10 +498,10 @@ mkCompareFields tycon op tys
       where
         a_expr = nlHsVar a
         b_expr = nlHsVar b
-        (lt_op, _, eq_op, _, _) = primOrdOps "Ord" tycon ty
+        (lt_op, _, eq_op, _, _) = primOrdOps "Ord" ty
 
-unliftedOrdOp :: TyCon -> Type -> OrdOp -> RdrName -> RdrName -> LHsExpr GhcPs
-unliftedOrdOp tycon ty op a b
+unliftedOrdOp :: Type -> OrdOp -> RdrName -> RdrName -> LHsExpr GhcPs
+unliftedOrdOp ty op a b
   = case op of
        OrdCompare -> unliftedCompare lt_op eq_op a_expr b_expr
                                      ltTag_Expr eqTag_Expr gtTag_Expr
@@ -510,7 +510,7 @@ unliftedOrdOp tycon ty op a b
        OrdGE      -> wrap ge_op
        OrdGT      -> wrap gt_op
   where
-   (lt_op, le_op, eq_op, ge_op, gt_op) = primOrdOps "Ord" tycon ty
+   (lt_op, le_op, eq_op, ge_op, gt_op) = primOrdOps "Ord" ty
    wrap prim_op = genPrimOpApp a_expr prim_op b_expr
    a_expr = nlHsVar a
    b_expr = nlHsVar b
@@ -1197,16 +1197,25 @@ gen_Show_binds get_fixity loc tycon
 
              show_arg :: RdrName -> Type -> LHsExpr GhcPs
              show_arg b arg_ty
-               | isUnliftedType arg_ty
-               -- See Note [Deriving and unboxed types] in TcDeriv
-               = nlHsApps compose_RDR [mk_shows_app boxed_arg,
-                                       mk_showString_app postfixMod]
-               | otherwise
-               = mk_showsPrec_app arg_prec arg
-                 where
-                   arg        = nlHsVar b
-                   boxed_arg  = box "Show" tycon arg arg_ty
-                   postfixMod = assoc_ty_id "Show" tycon postfixModTbl arg_ty
+                 | isUnliftedType arg_ty
+                 -- See Note [Deriving and unboxed types] in TcDerivInfer
+                 = with_conv $
+                    nlHsApps compose_RDR
+                        [mk_shows_app boxed_arg, mk_showString_app postfixMod]
+                 | otherwise
+                 = mk_showsPrec_app arg_prec arg
+               where
+                 arg        = nlHsVar b
+                 boxed_arg  = box "Show" arg arg_ty
+                 postfixMod = assoc_ty_id "Show" postfixModTbl arg_ty
+                 with_conv expr
+                    | (Just conv) <- assoc_ty_id_maybe primConvTbl arg_ty =
+                        nested_compose_Expr
+                            [ mk_showString_app ("(" ++ conv ++ " ")
+                            , expr
+                            , mk_showString_app ")"
+                            ]
+                    | otherwise = expr
 
                 -- Fixity stuff
              is_infix = dataConIsInfix data_con
@@ -1442,10 +1451,13 @@ gfoldl_RDR, gunfold_RDR, toConstr_RDR, dataTypeOf_RDR, mkConstr_RDR,
     constr_RDR, dataType_RDR,
     eqChar_RDR  , ltChar_RDR  , geChar_RDR  , gtChar_RDR  , leChar_RDR  ,
     eqInt_RDR   , ltInt_RDR   , geInt_RDR   , gtInt_RDR   , leInt_RDR   ,
+    eqInt8_RDR  , ltInt8_RDR  , geInt8_RDR  , gtInt8_RDR  , leInt8_RDR  ,
     eqWord_RDR  , ltWord_RDR  , geWord_RDR  , gtWord_RDR  , leWord_RDR  ,
+    eqWord8_RDR , ltWord8_RDR , geWord8_RDR , gtWord8_RDR , leWord8_RDR ,
     eqAddr_RDR  , ltAddr_RDR  , geAddr_RDR  , gtAddr_RDR  , leAddr_RDR  ,
     eqFloat_RDR , ltFloat_RDR , geFloat_RDR , gtFloat_RDR , leFloat_RDR ,
-    eqDouble_RDR, ltDouble_RDR, geDouble_RDR, gtDouble_RDR, leDouble_RDR :: RdrName
+    eqDouble_RDR, ltDouble_RDR, geDouble_RDR, gtDouble_RDR, leDouble_RDR,
+    extendWord8_RDR, extendInt8_RDR :: RdrName
 gfoldl_RDR     = varQual_RDR  gENERICS (fsLit "gfoldl")
 gunfold_RDR    = varQual_RDR  gENERICS (fsLit "gunfold")
 toConstr_RDR   = varQual_RDR  gENERICS (fsLit "toConstr")
@@ -1474,12 +1486,24 @@ leInt_RDR      = varQual_RDR  gHC_PRIM (fsLit "<=#")
 gtInt_RDR      = varQual_RDR  gHC_PRIM (fsLit ">#" )
 geInt_RDR      = varQual_RDR  gHC_PRIM (fsLit ">=#")
 
+eqInt8_RDR     = varQual_RDR  gHC_PRIM (fsLit "eqInt8#")
+ltInt8_RDR     = varQual_RDR  gHC_PRIM (fsLit "ltInt8#" )
+leInt8_RDR     = varQual_RDR  gHC_PRIM (fsLit "leInt8#")
+gtInt8_RDR     = varQual_RDR  gHC_PRIM (fsLit "gtInt8#" )
+geInt8_RDR     = varQual_RDR  gHC_PRIM (fsLit "geInt8#")
+
 eqWord_RDR     = varQual_RDR  gHC_PRIM (fsLit "eqWord#")
 ltWord_RDR     = varQual_RDR  gHC_PRIM (fsLit "ltWord#")
 leWord_RDR     = varQual_RDR  gHC_PRIM (fsLit "leWord#")
 gtWord_RDR     = varQual_RDR  gHC_PRIM (fsLit "gtWord#")
 geWord_RDR     = varQual_RDR  gHC_PRIM (fsLit "geWord#")
 
+eqWord8_RDR     = varQual_RDR  gHC_PRIM (fsLit "eqWord8#")
+ltWord8_RDR     = varQual_RDR  gHC_PRIM (fsLit "ltWord8#" )
+leWord8_RDR     = varQual_RDR  gHC_PRIM (fsLit "leWord8#")
+gtWord8_RDR     = varQual_RDR  gHC_PRIM (fsLit "gtWord8#" )
+geWord8_RDR     = varQual_RDR  gHC_PRIM (fsLit "geWord8#")
+
 eqAddr_RDR     = varQual_RDR  gHC_PRIM (fsLit "eqAddr#")
 ltAddr_RDR     = varQual_RDR  gHC_PRIM (fsLit "ltAddr#")
 leAddr_RDR     = varQual_RDR  gHC_PRIM (fsLit "leAddr#")
@@ -1498,6 +1522,10 @@ leDouble_RDR   = varQual_RDR  gHC_PRIM (fsLit "<=##")
 gtDouble_RDR   = varQual_RDR  gHC_PRIM (fsLit ">##" )
 geDouble_RDR   = varQual_RDR  gHC_PRIM (fsLit ">=##")
 
+extendWord8_RDR = varQual_RDR  gHC_PRIM (fsLit "extendWord8#")
+extendInt8_RDR  = varQual_RDR  gHC_PRIM (fsLit "extendInt8#")
+
+
 {-
 ************************************************************************
 *                                                                      *
@@ -1555,7 +1583,7 @@ gen_Lift_binds loc tycon = (unitBag lift_bind, emptyBag)
                                                   (nlHsVar a)
               | otherwise = nlHsApp (nlHsVar litE_RDR)
                               (primLitOp (mkBoxExp (nlHsVar a)))
-              where (primLitOp, mkBoxExp) = primLitOps "Lift" tycon ty
+              where (primLitOp, mkBoxExp) = primLitOps "Lift" ty
 
             pkg_name = unitIdString . moduleUnitId
                      . nameModule $ tycon_name
@@ -2076,55 +2104,60 @@ mkRdrFunBindSE arity
 
 
 box ::         String           -- The class involved
-            -> TyCon            -- The tycon involved
             -> LHsExpr GhcPs    -- The argument
             -> Type             -- The argument type
             -> LHsExpr GhcPs    -- Boxed version of the arg
--- See Note [Deriving and unboxed types] in TcDeriv
-box cls_str tycon arg arg_ty = nlHsApp (nlHsVar box_con) arg
-  where
-    box_con = assoc_ty_id cls_str tycon boxConTbl arg_ty
+-- See Note [Deriving and unboxed types] in TcDerivInfer
+box cls_str arg arg_ty = assoc_ty_id cls_str boxConTbl arg_ty arg
 
 ---------------------
 primOrdOps :: String    -- The class involved
-           -> TyCon     -- The tycon involved
            -> Type      -- The type
            -> (RdrName, RdrName, RdrName, RdrName, RdrName)  -- (lt,le,eq,ge,gt)
--- See Note [Deriving and unboxed types] in TcDeriv
-primOrdOps str tycon ty = assoc_ty_id str tycon ordOpTbl ty
+-- See Note [Deriving and unboxed types] in TcDerivInfer
+primOrdOps str ty = assoc_ty_id str ordOpTbl ty
 
 primLitOps :: String -- The class involved
-           -> TyCon  -- The tycon involved
            -> Type   -- The type
            -> ( LHsExpr GhcPs -> LHsExpr GhcPs -- Constructs a Q Exp value
               , LHsExpr GhcPs -> LHsExpr GhcPs -- Constructs a boxed value
               )
-primLitOps str tycon ty = ( assoc_ty_id str tycon litConTbl ty
-                          , \v -> nlHsVar boxRDR `nlHsApp` v
-                          )
+primLitOps str ty = (assoc_ty_id str litConTbl ty, \v -> boxed v)
   where
-    boxRDR
-      | ty `eqType` addrPrimTy = unpackCString_RDR
-      | otherwise = assoc_ty_id str tycon boxConTbl ty
+    boxed v
+      | ty `eqType` addrPrimTy = nlHsVar unpackCString_RDR `nlHsApp` v
+      | otherwise = assoc_ty_id str boxConTbl ty v
 
 ordOpTbl :: [(Type, (RdrName, RdrName, RdrName, RdrName, RdrName))]
 ordOpTbl
  =  [(charPrimTy  , (ltChar_RDR  , leChar_RDR  , eqChar_RDR  , geChar_RDR  , gtChar_RDR  ))
     ,(intPrimTy   , (ltInt_RDR   , leInt_RDR   , eqInt_RDR   , geInt_RDR   , gtInt_RDR   ))
+    ,(int8PrimTy  , (ltInt8_RDR  , leInt8_RDR  , eqInt8_RDR  , geInt8_RDR  , gtInt8_RDR   ))
     ,(wordPrimTy  , (ltWord_RDR  , leWord_RDR  , eqWord_RDR  , geWord_RDR  , gtWord_RDR  ))
+    ,(word8PrimTy , (ltWord8_RDR , leWord8_RDR , eqWord8_RDR , geWord8_RDR , gtWord8_RDR   ))
     ,(addrPrimTy  , (ltAddr_RDR  , leAddr_RDR  , eqAddr_RDR  , geAddr_RDR  , gtAddr_RDR  ))
     ,(floatPrimTy , (ltFloat_RDR , leFloat_RDR , eqFloat_RDR , geFloat_RDR , gtFloat_RDR ))
     ,(doublePrimTy, (ltDouble_RDR, leDouble_RDR, eqDouble_RDR, geDouble_RDR, gtDouble_RDR)) ]
 
-boxConTbl :: [(Type, RdrName)]
-boxConTbl
-  = [(charPrimTy  , getRdrName charDataCon  )
-    ,(intPrimTy   , getRdrName intDataCon   )
-    ,(wordPrimTy  , getRdrName wordDataCon  )
-    ,(floatPrimTy , getRdrName floatDataCon )
-    ,(doublePrimTy, getRdrName doubleDataCon)
+-- A mapping from a primitive type to a function that constructs its boxed
+-- version.
+-- NOTE: Int8#/Word8# will become Int/Word.
+boxConTbl :: [(Type, LHsExpr GhcPs -> LHsExpr GhcPs)]
+boxConTbl =
+    [ (charPrimTy  , nlHsApp (nlHsVar $ getRdrName charDataCon))
+    , (intPrimTy   , nlHsApp (nlHsVar $ getRdrName intDataCon))
+    , (wordPrimTy  , nlHsApp (nlHsVar $ getRdrName wordDataCon ))
+    , (floatPrimTy , nlHsApp (nlHsVar $ getRdrName floatDataCon ))
+    , (doublePrimTy, nlHsApp (nlHsVar $ getRdrName doubleDataCon))
+    , (int8PrimTy,
+        nlHsApp (nlHsVar $ getRdrName intDataCon)
+        . nlHsApp (nlHsVar extendInt8_RDR))
+    , (word8PrimTy,
+        nlHsApp (nlHsVar $ getRdrName wordDataCon)
+        .  nlHsApp (nlHsVar extendWord8_RDR))
     ]
 
+
 -- | A table of postfix modifiers for unboxed values.
 postfixModTbl :: [(Type, String)]
 postfixModTbl
@@ -2133,6 +2166,14 @@ postfixModTbl
     ,(wordPrimTy  , "##")
     ,(floatPrimTy , "#" )
     ,(doublePrimTy, "##")
+    ,(int8PrimTy, "#")
+    ,(word8PrimTy, "##")
+    ]
+
+primConvTbl :: [(Type, String)]
+primConvTbl =
+    [ (int8PrimTy, "narrowInt8#")
+    , (word8PrimTy, "narrowWord8#")
     ]
 
 litConTbl :: [(Type, LHsExpr GhcPs -> LHsExpr GhcPs)]
@@ -2156,17 +2197,20 @@ litConTbl
     ]
 
 -- | Lookup `Type` in an association list.
-assoc_ty_id :: String           -- The class involved
-            -> TyCon            -- The tycon involved
+assoc_ty_id :: HasCallStack => String           -- The class involved
             -> [(Type,a)]       -- The table
             -> Type             -- The type
             -> a                -- The result of the lookup
-assoc_ty_id cls_str _ tbl ty
-  | null res = pprPanic "Error in deriving:" (text "Can't derive" <+> text cls_str <+>
-                                              text "for primitive type" <+> ppr ty)
-  | otherwise = head res
-  where
-    res = [id | (ty',id) <- tbl, ty `eqType` ty']
+assoc_ty_id cls_str tbl ty
+  | Just a <- assoc_ty_id_maybe tbl ty = a
+  | otherwise =
+      pprPanic "Error in deriving:"
+          (text "Can't derive" <+> text cls_str <+>
+           text "for primitive type" <+> ppr ty)
+
+-- | Lookup `Type` in an association list.
+assoc_ty_id_maybe :: [(Type, a)] -> Type -> Maybe a
+assoc_ty_id_maybe tbl ty = snd <$> find (\(t, _) -> t `eqType` ty) tbl
 
 -----------------------------------------------------------------------
 
@@ -2175,12 +2219,12 @@ and_Expr a b = genOpApp a and_RDR    b
 
 -----------------------------------------------------------------------
 
-eq_Expr :: TyCon -> Type -> LHsExpr GhcPs -> LHsExpr GhcPs -> LHsExpr GhcPs
-eq_Expr tycon ty a b
+eq_Expr :: Type -> LHsExpr GhcPs -> LHsExpr GhcPs -> LHsExpr GhcPs
+eq_Expr ty a b
     | not (isUnliftedType ty) = genOpApp a eq_RDR b
     | otherwise               = genPrimOpApp a prim_eq b
  where
-   (_, _, prim_eq, _, _) = primOrdOps "Eq" tycon ty
+   (_, _, prim_eq, _, _) = primOrdOps "Eq" ty
 
 untag_Expr :: DynFlags -> TyCon -> [( RdrName,  RdrName)]
               -> LHsExpr GhcPs -> LHsExpr GhcPs
diff --git a/compiler/types/TyCon.hs b/compiler/types/TyCon.hs
index eeebf8b9cfd21f9ad23e96ae4554f806752b5051..7322a168ab172f297824b9ef6e3de5ef3cccf5c7 100644
--- a/compiler/types/TyCon.hs
+++ b/compiler/types/TyCon.hs
@@ -1325,9 +1325,11 @@ data PrimRep
   = VoidRep
   | LiftedRep
   | UnliftedRep   -- ^ Unlifted pointer
+  | Int8Rep       -- ^ Signed, 8-bit value
   | IntRep        -- ^ Signed, word-sized value
   | WordRep       -- ^ Unsigned, word-sized value
   | Int64Rep      -- ^ Signed, 64 bit value (with 32-bit words only)
+  | Word8Rep      -- ^ Unsigned, 8 bit value
   | Word64Rep     -- ^ Unsigned, 64 bit value (with 32-bit words only)
   | AddrRep       -- ^ A pointer, but /not/ to a Haskell value (use '(Un)liftedRep')
   | FloatRep
@@ -1373,7 +1375,9 @@ isGcPtrRep _           = False
 primRepSizeB :: DynFlags -> PrimRep -> Int
 primRepSizeB dflags IntRep           = wORD_SIZE dflags
 primRepSizeB dflags WordRep          = wORD_SIZE dflags
+primRepSizeB _      Int8Rep          = 1
 primRepSizeB _      Int64Rep         = wORD64_SIZE
+primRepSizeB _      Word8Rep         = 1
 primRepSizeB _      Word64Rep        = wORD64_SIZE
 primRepSizeB _      FloatRep         = fLOAT_SIZE
 primRepSizeB dflags DoubleRep        = dOUBLE_SIZE dflags
diff --git a/compiler/utils/Binary.hs b/compiler/utils/Binary.hs
index 447317ca476c6757e93c817d5c1684c82e553688..a38af74efe0270967aa065aa763bdb83b21e3185 100644
--- a/compiler/utils/Binary.hs
+++ b/compiler/utils/Binary.hs
@@ -637,6 +637,10 @@ instance Binary RuntimeRep where
     put_ bh AddrRep         = putByte bh 9
     put_ bh FloatRep        = putByte bh 10
     put_ bh DoubleRep       = putByte bh 11
+#if __GLASGOW_HASKELL__ >= 807
+    put_ bh Int8Rep         = putByte bh 12
+    put_ bh Word8Rep        = putByte bh 13
+#endif
 
     get bh = do
         tag <- getByte bh
@@ -653,6 +657,10 @@ instance Binary RuntimeRep where
           9  -> pure AddrRep
           10 -> pure FloatRep
           11 -> pure DoubleRep
+#if __GLASGOW_HASKELL__ >= 807
+          12 -> pure Int8Rep
+          13 -> pure Word8Rep
+#endif
           _  -> fail "Binary.putRuntimeRep: invalid tag"
 
 instance Binary KindRep where
diff --git a/includes/CodeGen.Platform.hs b/includes/CodeGen.Platform.hs
index 8e936900386b9cc8a8726a4063567cde5e8a05cd..664942d8781acc7d2c3a49777f44d8ac338a61ab 100644
--- a/includes/CodeGen.Platform.hs
+++ b/includes/CodeGen.Platform.hs
@@ -837,15 +837,16 @@ freeReg :: RegNo -> Bool
 
 # if defined(MACHREGS_i386)
 freeReg esp = False -- %esp is the C stack pointer
-freeReg esi = False -- Note [esi/edi not allocatable]
+freeReg esi = False -- Note [esi/edi/ebp not allocatable]
 freeReg edi = False
+freeReg ebp = False
 # endif
 # if defined(MACHREGS_x86_64)
 freeReg rsp = False  --        %rsp is the C stack pointer
 # endif
 
 {-
-Note [esi/edi not allocatable]
+Note [esi/edi/ebp not allocatable]
 
 %esi is mapped to R1, so %esi would normally be allocatable while it
 is not being used for R1.  However, %esi has no 8-bit version on x86,
@@ -855,7 +856,7 @@ graph-colouring allocator also cannot handle this - it was designed
 with more flexibility in mind, but the current implementation is
 restricted to the same set of classes as the linear allocator.
 
-Hence, on x86 esi and edi are treated as not allocatable.
+Hence, on x86 esi, edi and ebp are treated as not allocatable.
 -}
 
 -- split patterns in two functions to prevent overlaps
diff --git a/libraries/base/Data/Typeable/Internal.hs b/libraries/base/Data/Typeable/Internal.hs
index 06d225adb76044ad7705064adcac1601e4e9c935..cc295b31b8bec0fd0f6a0d49e3c1779c3ebf11df 100644
--- a/libraries/base/Data/Typeable/Internal.hs
+++ b/libraries/base/Data/Typeable/Internal.hs
@@ -664,8 +664,10 @@ runtimeRepTypeRep r =
       SumRep rs   -> kindedTypeRep @_ @'SumRep
                      `kApp` buildList (map runtimeRepTypeRep rs)
       IntRep      -> rep @'IntRep
-      WordRep     -> rep @'WordRep
+      Int8Rep     -> rep @'Int8Rep
       Int64Rep    -> rep @'Int64Rep
+      WordRep     -> rep @'WordRep
+      Word8Rep    -> rep @'Word8Rep
       Word64Rep   -> rep @'Word64Rep
       AddrRep     -> rep @'AddrRep
       FloatRep    -> rep @'FloatRep
diff --git a/libraries/binary b/libraries/binary
index 38adf7ce1ad6a497fba61de500c3f35b186303a9..0318374b832ebe52a8d01bff2dd7bab8e747fbd4 160000
--- a/libraries/binary
+++ b/libraries/binary
@@ -1 +1 @@
-Subproject commit 38adf7ce1ad6a497fba61de500c3f35b186303a9
+Subproject commit 0318374b832ebe52a8d01bff2dd7bab8e747fbd4
diff --git a/libraries/ghc-prim/GHC/Types.hs b/libraries/ghc-prim/GHC/Types.hs
index d06c0be307711f27da9d4866ecbd67601d9d1eb0..7ab870684de226a87ec01e6663b2176865736944 100644
--- a/libraries/ghc-prim/GHC/Types.hs
+++ b/libraries/ghc-prim/GHC/Types.hs
@@ -394,8 +394,10 @@ data RuntimeRep = VecRep VecCount VecElem   -- ^ a SIMD vector type
                 | LiftedRep       -- ^ lifted; represented by a pointer
                 | UnliftedRep     -- ^ unlifted; represented by a pointer
                 | IntRep          -- ^ signed, word-sized value
-                | WordRep         -- ^ unsigned, word-sized value
+                | Int8Rep         -- ^ signed, 8-bit value
                 | Int64Rep        -- ^ signed, 64-bit value (on 32-bit only)
+                | WordRep         -- ^ unsigned, word-sized value
+                | Word8Rep        -- ^ unsigned, 8-bit value
                 | Word64Rep       -- ^ unsigned, 64-bit value (on 32-bit only)
                 | AddrRep         -- ^ A pointer, but /not/ to a Haskell value
                 | FloatRep        -- ^ a 32-bit floating point number
diff --git a/testsuite/tests/ffi/should_run/PrimFFIInt8.hs b/testsuite/tests/ffi/should_run/PrimFFIInt8.hs
new file mode 100644
index 0000000000000000000000000000000000000000..4124e074aa4d34a3781a3f5d7432bb44d66d8398
--- /dev/null
+++ b/testsuite/tests/ffi/should_run/PrimFFIInt8.hs
@@ -0,0 +1,28 @@
+{-# LANGUAGE ForeignFunctionInterface #-}
+{-# LANGUAGE MagicHash #-}
+{-# LANGUAGE UnliftedFFITypes #-}
+
+module Main where
+
+import GHC.Exts
+
+foreign import ccall "add_all_int8"
+    add_all_int8
+        :: Int8# -> Int8# -> Int8# -> Int8# -> Int8#
+        -> Int8# -> Int8# -> Int8# -> Int8# -> Int8#
+        -> Int8#
+
+main :: IO ()
+main = do
+    let a = narrowInt8# 0#
+        b = narrowInt8# 1#
+        c = narrowInt8# 2#
+        d = narrowInt8# 3#
+        e = narrowInt8# 4#
+        f = narrowInt8# 5#
+        g = narrowInt8# 6#
+        h = narrowInt8# 7#
+        i = narrowInt8# 8#
+        j = narrowInt8# 9#
+        x = I# (extendInt8# (add_all_int8 a b c d e f g h i j))
+    print x
diff --git a/testsuite/tests/ffi/should_run/PrimFFIInt8.stdout b/testsuite/tests/ffi/should_run/PrimFFIInt8.stdout
new file mode 100644
index 0000000000000000000000000000000000000000..ea90ee31980757b2e469741512bcb39e73494e78
--- /dev/null
+++ b/testsuite/tests/ffi/should_run/PrimFFIInt8.stdout
@@ -0,0 +1 @@
+45
diff --git a/testsuite/tests/ffi/should_run/PrimFFIInt8_c.c b/testsuite/tests/ffi/should_run/PrimFFIInt8_c.c
new file mode 100644
index 0000000000000000000000000000000000000000..dc51687530765a4636cdb71db3a2c6166313a65e
--- /dev/null
+++ b/testsuite/tests/ffi/should_run/PrimFFIInt8_c.c
@@ -0,0 +1,7 @@
+#include <stdint.h>
+
+int8_t add_all_int8(
+        int8_t a, int8_t b, int8_t c, int8_t d, int8_t e,
+        int8_t f, int8_t g, int8_t h, int8_t i, int8_t j) {
+    return a + b + c + d + e + f + g + h + i + j;
+}
diff --git a/testsuite/tests/ffi/should_run/PrimFFIWord8.hs b/testsuite/tests/ffi/should_run/PrimFFIWord8.hs
new file mode 100644
index 0000000000000000000000000000000000000000..87e46636d17ffc2ed6c05a7c2203ab84584b6d29
--- /dev/null
+++ b/testsuite/tests/ffi/should_run/PrimFFIWord8.hs
@@ -0,0 +1,28 @@
+{-# LANGUAGE ForeignFunctionInterface #-}
+{-# LANGUAGE MagicHash #-}
+{-# LANGUAGE UnliftedFFITypes #-}
+
+module Main where
+
+import GHC.Exts
+
+foreign import ccall "add_all_word8"
+    add_all_word8
+        :: Word8# -> Word8# -> Word8# -> Word8# -> Word8#
+        -> Word8# -> Word8# -> Word8# -> Word8# -> Word8#
+        -> Word8#
+
+main :: IO ()
+main = do
+    let a = narrowWord8# 0##
+        b = narrowWord8# 1##
+        c = narrowWord8# 2##
+        d = narrowWord8# 3##
+        e = narrowWord8# 4##
+        f = narrowWord8# 5##
+        g = narrowWord8# 6##
+        h = narrowWord8# 7##
+        i = narrowWord8# 8##
+        j = narrowWord8# 9##
+        x = W# (extendWord8# (add_all_word8 a b c d e f g h i j))
+    print x
diff --git a/testsuite/tests/ffi/should_run/PrimFFIWord8.stdout b/testsuite/tests/ffi/should_run/PrimFFIWord8.stdout
new file mode 100644
index 0000000000000000000000000000000000000000..ea90ee31980757b2e469741512bcb39e73494e78
--- /dev/null
+++ b/testsuite/tests/ffi/should_run/PrimFFIWord8.stdout
@@ -0,0 +1 @@
+45
diff --git a/testsuite/tests/ffi/should_run/PrimFFIWord8_c.c b/testsuite/tests/ffi/should_run/PrimFFIWord8_c.c
new file mode 100644
index 0000000000000000000000000000000000000000..535ed4185cba9ec5c4a93c6961df00ea9306bc3f
--- /dev/null
+++ b/testsuite/tests/ffi/should_run/PrimFFIWord8_c.c
@@ -0,0 +1,7 @@
+#include <stdint.h>
+
+uint8_t add_all_word8(
+        uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e,
+        uint8_t f, uint8_t g, uint8_t h, uint8_t i, uint8_t j) {
+    return a + b + c + d + e + f + g + h + i + j;
+}
diff --git a/testsuite/tests/ffi/should_run/all.T b/testsuite/tests/ffi/should_run/all.T
index fd0af7ebc31630a8a877f110a43612ff1dab88c6..9223b3d1b346a9580faa792c9312b8c95784e4ce 100644
--- a/testsuite/tests/ffi/should_run/all.T
+++ b/testsuite/tests/ffi/should_run/all.T
@@ -188,3 +188,7 @@ test('ffi023', [ omit_ways(['ghci']),
 test('T12134', [omit_ways(['ghci'])], compile_and_run, ['T12134_c.c'])
 
 test('T12614', [omit_ways(['ghci'])], compile_and_run, ['T12614_c.c'])
+
+test('PrimFFIInt8', [omit_ways(['ghci'])], compile_and_run, ['PrimFFIInt8_c.c'])
+
+test('PrimFFIWord8', [omit_ways(['ghci'])], compile_and_run, ['PrimFFIWord8_c.c'])
diff --git a/testsuite/tests/primops/should_run/ArithInt8.hs b/testsuite/tests/primops/should_run/ArithInt8.hs
new file mode 100644
index 0000000000000000000000000000000000000000..77f4cea21ad8c938e06b296e0a5da43bab399404
--- /dev/null
+++ b/testsuite/tests/primops/should_run/ArithInt8.hs
@@ -0,0 +1,201 @@
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE MagicHash #-}
+{-# LANGUAGE UnboxedTuples #-}
+
+module Main where
+
+import Data.Int
+import Data.List
+import GHC.Prim
+import GHC.Exts
+
+main :: IO ()
+main = do
+
+    --
+    -- Check if passing Int8# on the stack works (16 parameter function will
+    -- need to use stack for some of the them)
+    --
+    let input =
+            [ ( (a + 0), (a + 1), (a + 2), (a + 3),
+                (a + 4), (a + 5), (a + 6), (a + 7),
+                (a + 8), (a + 9), (a + 10), (a + 11),
+                (a + 12), (a + 13), (a + 14), (a + 15) )
+            | a <- allInt8
+            ]
+        expected =
+            [ toInt8
+                  (a + b + c + d + e + f + g + h +
+                   i + j + k + l + m + n + o + p)
+            | (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) <- input
+            ]
+        actual =
+            [ addMany a b c d e f g h i j k l m n o p
+            | (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) <- input
+            ]
+    checkResults "passing Int8# on the stack" input expected actual
+
+    --
+    -- negateInt8#
+    --
+    let input = allInt8
+        expected = [ toInt8 (negate a) | a <- input ]
+        actual = [ apply1 negateInt8# a | a <- input ]
+    checkResults "negateInt8#" input expected actual
+
+    --
+    -- plusInt8#
+    --
+    let input = [ (a, b) | a <- allInt8, b <- allInt8 ]
+        expected = [ toInt8 (a + b) | (a, b) <- input ]
+        actual = [ apply2 plusInt8# a b | (a, b) <- input ]
+    checkResults "plusInt8#" input expected actual
+
+    --
+    -- subInt8#
+    --
+    let input = [ (a, b) | a <- allInt8, b <- allInt8 ]
+        expected = [ toInt8 (a - b) | (a, b) <- input ]
+        actual = [ apply2 subInt8# a b | (a, b) <- input ]
+    checkResults "subInt8#" input expected actual
+
+    --
+    -- timesInt8#
+    --
+    let input = [ (a, b) | a <- allInt8, b <- allInt8 ]
+        expected = [ toInt8 (a * b) | (a, b) <- input ]
+        actual = [ apply2 timesInt8# a b | (a, b) <- input ]
+    checkResults "timesInt8#" input expected actual
+
+    --
+    -- remInt8#
+    --
+    let input =
+            [ (a, b) | a <- allInt8, b <- allInt8
+            -- Don't divide by 0 or cause overflow
+            , b /= 0, not (a == -128 && b == -1)
+            ]
+        expected = [ toInt8 (a `rem` b) | (a, b) <- input ]
+        actual = [ apply2 remInt8# a b | (a, b) <- input ]
+    checkResults "remInt8#" input expected actual
+
+    --
+    -- quotInt8#
+    --
+    let input =
+            [ (a, b) | a <- allInt8, b <- allInt8
+            , b /= 0, not (a == -128 && b == -1)
+            ]
+        expected = [ toInt8 (a `quot` b) | (a, b) <- input ]
+        actual = [ apply2 quotInt8# a b | (a, b) <- input ]
+    checkResults "quotInt8#" input expected actual
+
+    --
+    -- quotRemInt8#
+    --
+    let input =
+            [ (a, b) | a <- allInt8, b <- allInt8
+            , b /= 0, not (a == -128 && b == -1)
+            ]
+        expected =
+            [ (toInt8 q, toInt8 r)  | (a, b) <- input
+            , let (q, r) = a `quotRem` b
+            ]
+        actual = [ apply3 quotRemInt8# a b | (a, b) <- input ]
+    checkResults "quotRemInt8#" input expected actual
+
+
+checkResults
+    :: (Eq a, Eq b, Show a, Show b) => String -> [a] -> [b] -> [b] -> IO ()
+checkResults test inputs expected actual =
+    case findIndex (\(e, a) -> e /= a) (zip expected actual) of
+        Nothing -> putStrLn $ "Pass: " ++ test
+        Just i -> error $
+            "FAILED: " ++ test ++ " for input: " ++ show (inputs !! i)
+              ++ " expected: " ++ show (expected !! i)
+              ++ " but got: " ++ show (actual !! i)
+
+allInt8 :: [Int]
+allInt8 = [ minInt8 .. maxInt8 ]
+
+minInt8 :: Int
+minInt8 = fromIntegral (minBound :: Int8)
+
+maxInt8 :: Int
+maxInt8 = fromIntegral (maxBound :: Int8)
+
+toInt8 :: Int -> Int
+toInt8 a = fromIntegral (fromIntegral a :: Int8)
+
+addMany#
+    :: Int8# -> Int8# -> Int8# -> Int8#
+    -> Int8# -> Int8# -> Int8# -> Int8#
+    -> Int8# -> Int8# -> Int8# -> Int8#
+    -> Int8# -> Int8# -> Int8# -> Int8#
+    -> Int8#
+addMany# a b c d e f g h i j k l m n o p =
+    a `plusInt8#` b `plusInt8#` c `plusInt8#` d `plusInt8#`
+    e `plusInt8#` f `plusInt8#` g `plusInt8#` h `plusInt8#`
+    i `plusInt8#` j `plusInt8#` k `plusInt8#` l `plusInt8#`
+    m `plusInt8#` n `plusInt8#` o `plusInt8#` p
+{-# NOINLINE addMany# #-}
+
+addMany
+    :: Int -> Int -> Int -> Int
+    -> Int -> Int -> Int -> Int
+    -> Int -> Int -> Int -> Int
+    -> Int -> Int -> Int -> Int
+    -> Int
+addMany (I# a) (I# b) (I# c) (I# d)
+        (I# e) (I# f) (I# g) (I# h)
+        (I# i) (I# j) (I# k) (I# l)
+        (I# m) (I# n) (I# o) (I# p)
+            = I# (extendInt8# int8)
+  where
+    !int8 = addMany#
+                (narrowInt8# a) (narrowInt8# b) (narrowInt8# c) (narrowInt8# d)
+                (narrowInt8# e) (narrowInt8# f) (narrowInt8# g) (narrowInt8# h)
+                (narrowInt8# i) (narrowInt8# j) (narrowInt8# k) (narrowInt8# l)
+                (narrowInt8# m) (narrowInt8# n) (narrowInt8# o) (narrowInt8# p)
+{-# NOINLINE addMany #-}
+
+-- Convenient and also tests higher order functions on Int8#
+apply1 :: (Int8# -> Int8#) -> Int -> Int
+apply1 opToTest (I# a) = I# (extendInt8# (opToTest (narrowInt8# a)))
+{-# NOINLINE apply1 #-}
+
+apply2 :: (Int8# -> Int8# -> Int8#) -> Int -> Int -> Int
+apply2 opToTest (I# a) (I# b) =
+    let (# sa, sb #) = (# narrowInt8# a, narrowInt8# b #)
+        r = opToTest sa sb
+    in I# (extendInt8# r)
+{-# NOINLINE apply2 #-}
+
+apply3 :: (Int8# -> Int8# -> (# Int8#, Int8# #)) -> Int -> Int -> (Int, Int)
+apply3 opToTest (I# a) (I# b) =
+    let (# sa, sb #) = (# narrowInt8# a, narrowInt8# b #)
+        (# ra, rb #) = opToTest sa sb
+    in (I# (extendInt8# ra), I# (extendInt8# rb))
+{-# NOINLINE apply3 #-}
+
+instance
+        (Eq a, Eq b, Eq c, Eq d, Eq e, Eq f, Eq g, Eq h,
+         Eq i, Eq j, Eq k, Eq l, Eq m, Eq n, Eq o, Eq p)
+      => Eq (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) where
+    (a1, b1, c1, d1, e1, f1, g1, h1, i1, j1, k1, l1, m1, n1, o1, p1) ==
+        (a2, b2, c2, d2, e2, f2, g2, h2, i2, j2, k2, l2, m2, n2, o2, p2) =
+            a1 == a2 && b1 == b2 && c1 == c2 && d1 == d2 &&
+            e1 == e2 && f1 == f2 && g1 == g2 && h1 == h2 &&
+            i1 == i2 && j1 == j2 && k1 == k2 && l1 == l2 &&
+            m1 == m2 && n1 == n2 && o1 == o2 && p1 == p2
+
+instance
+        (Show a, Show b, Show c, Show d, Show e, Show f, Show g, Show h,
+         Show i, Show j, Show k, Show l, Show m, Show n, Show o, Show p)
+      => Show (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) where
+    show (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) =
+        "(" ++ show a ++ "," ++ show b ++ "," ++ show c ++ "," ++ show d ++
+        "," ++ show e ++ "," ++ show f ++ "," ++ show g ++ "," ++ show h ++
+        "," ++ show i ++ "," ++ show j ++ "," ++ show k ++ "," ++ show l ++
+        "," ++ show m ++ "," ++ show n ++ "," ++ show o ++ "," ++ show p ++
+        ")"
diff --git a/testsuite/tests/primops/should_run/ArithInt8.stdout b/testsuite/tests/primops/should_run/ArithInt8.stdout
new file mode 100644
index 0000000000000000000000000000000000000000..16990fb3c510e851a4fd8f686b2e8c32ad778c5e
--- /dev/null
+++ b/testsuite/tests/primops/should_run/ArithInt8.stdout
@@ -0,0 +1,8 @@
+Pass: passing Int8# on the stack
+Pass: negateInt8#
+Pass: plusInt8#
+Pass: subInt8#
+Pass: timesInt8#
+Pass: remInt8#
+Pass: quotInt8#
+Pass: quotRemInt8#
diff --git a/testsuite/tests/primops/should_run/ArithWord8.hs b/testsuite/tests/primops/should_run/ArithWord8.hs
new file mode 100644
index 0000000000000000000000000000000000000000..ceac789878fc36102480fd1320df0c263f0e7199
--- /dev/null
+++ b/testsuite/tests/primops/should_run/ArithWord8.hs
@@ -0,0 +1,198 @@
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE MagicHash #-}
+{-# LANGUAGE UnboxedTuples #-}
+
+module Main where
+
+import Data.Word
+import Data.Bits
+import Data.List
+import GHC.Prim
+import GHC.Exts
+
+main :: IO ()
+main = do
+
+    --
+    -- Check if passing Word8# on the stack works (16 parameter function will
+    -- need to use stack for some of the them)
+    --
+    let input =
+            [ ( (a + 0), (a + 1), (a + 2), (a + 3),
+                (a + 4), (a + 5), (a + 6), (a + 7),
+                (a + 8), (a + 9), (a + 10), (a + 11),
+                (a + 12), (a + 13), (a + 14), (a + 15) )
+            | a <- allWord8
+            ]
+        expected =
+            [ toWord8
+                  (a + b + c + d + e + f + g + h +
+                   i + j + k + l + m + n + o + p)
+            | (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) <- input
+            ]
+        actual =
+            [ addMany a b c d e f g h i j k l m n o p
+            | (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) <- input
+            ]
+    checkResults "passing Word8# on the stack" input expected actual
+
+    --
+    -- notWord8#
+    --
+    let input = allWord8
+        expected = [ toWord8 (complement a) | a <- input ]
+        actual = [ apply1 notWord8# a | a <- input ]
+    checkResults "notWord8#" input expected actual
+
+    --
+    -- plusWord8#
+    --
+    let input = [ (a, b) | a <- allWord8, b <- allWord8 ]
+        expected = [ toWord8 (a + b) | (a, b) <- input ]
+        actual = [ apply2 plusWord8# a b | (a, b) <- input ]
+    checkResults "plusWord8#" input expected actual
+
+    --
+    -- subWord8#
+    --
+    let input = [ (a, b) | a <- allWord8, b <- allWord8 ]
+        expected = [ toWord8 (a - b) | (a, b) <- input ]
+        actual = [ apply2 subWord8# a b | (a, b) <- input ]
+    checkResults "subWord8#" input expected actual
+
+    --
+    -- timesWord8#
+    --
+    let input = [ (a, b) | a <- allWord8, b <- allWord8 ]
+        expected = [ toWord8 (a * b) | (a, b) <- input ]
+        actual = [ apply2 timesWord8# a b | (a, b) <- input ]
+    checkResults "timesWord8#" input expected actual
+
+    --
+    -- remWord8#
+    --
+    let input =
+            -- Don't divide by 0.
+            [ (a, b) | a <- allWord8, b <- allWord8 , b /= 0 ]
+        expected = [ toWord8 (a `rem` b) | (a, b) <- input ]
+        actual = [ apply2 remWord8# a b | (a, b) <- input ]
+    checkResults "remWord8#" input expected actual
+
+    --
+    -- quotWord8#
+    --
+    let input =
+            [ (a, b) | a <- allWord8, b <- allWord8, b /= 0 ]
+        expected = [ toWord8 (a `quot` b) | (a, b) <- input ]
+        actual = [ apply2 quotWord8# a b | (a, b) <- input ]
+    checkResults "quotWord8#" input expected actual
+
+    --
+    -- quotRemWord8#
+    --
+    let input =
+            [ (a, b) | a <- allWord8, b <- allWord8, b /= 0 ]
+        expected =
+            [ (toWord8 q, toWord8 r)  | (a, b) <- input
+            , let (q, r) = a `quotRem` b
+            ]
+        actual = [ apply3 quotRemWord8# a b | (a, b) <- input ]
+    checkResults "quotRemWord8#" input expected actual
+
+
+checkResults
+    :: (Eq a, Eq b, Show a, Show b) => String -> [a] -> [b] -> [b] -> IO ()
+checkResults test inputs expected actual =
+    case findIndex (\(e, a) -> e /= a) (zip expected actual) of
+        Nothing -> putStrLn $ "Pass: " ++ test
+        Just i -> error $
+            "FAILED: " ++ test ++ " for input: " ++ show (inputs !! i)
+              ++ " expected: " ++ show (expected !! i)
+              ++ " but got: " ++ show (actual !! i)
+
+allWord8 :: [Word]
+allWord8 = [ minWord8 .. maxWord8 ]
+
+minWord8 :: Word
+minWord8 = fromIntegral (minBound :: Word8)
+
+maxWord8 :: Word
+maxWord8 = fromIntegral (maxBound :: Word8)
+
+toWord8 :: Word -> Word
+toWord8 a = fromIntegral (fromIntegral a :: Word8)
+
+addMany#
+    :: Word8# -> Word8# -> Word8# -> Word8#
+    -> Word8# -> Word8# -> Word8# -> Word8#
+    -> Word8# -> Word8# -> Word8# -> Word8#
+    -> Word8# -> Word8# -> Word8# -> Word8#
+    -> Word8#
+addMany# a b c d e f g h i j k l m n o p =
+    a `plusWord8#` b `plusWord8#` c `plusWord8#` d `plusWord8#`
+    e `plusWord8#` f `plusWord8#` g `plusWord8#` h `plusWord8#`
+    i `plusWord8#` j `plusWord8#` k `plusWord8#` l `plusWord8#`
+    m `plusWord8#` n `plusWord8#` o `plusWord8#` p
+{-# NOINLINE addMany# #-}
+
+addMany
+    :: Word -> Word -> Word -> Word
+    -> Word -> Word -> Word -> Word
+    -> Word -> Word -> Word -> Word
+    -> Word -> Word -> Word -> Word
+    -> Word
+addMany (W# a) (W# b) (W# c) (W# d)
+        (W# e) (W# f) (W# g) (W# h)
+        (W# i) (W# j) (W# k) (W# l)
+        (W# m) (W# n) (W# o) (W# p)
+            = W# (extendWord8# word8)
+  where
+    !word8 =
+        addMany#
+            (narrowWord8# a) (narrowWord8# b) (narrowWord8# c) (narrowWord8# d)
+            (narrowWord8# e) (narrowWord8# f) (narrowWord8# g) (narrowWord8# h)
+            (narrowWord8# i) (narrowWord8# j) (narrowWord8# k) (narrowWord8# l)
+            (narrowWord8# m) (narrowWord8# n) (narrowWord8# o) (narrowWord8# p)
+{-# NOINLINE addMany #-}
+
+-- Convenient and also tests higher order functions on Word8#
+apply1 :: (Word8# -> Word8#) -> Word -> Word
+apply1 opToTest (W# a) = W# (extendWord8# (opToTest (narrowWord8# a)))
+{-# NOINLINE apply1 #-}
+
+apply2 :: (Word8# -> Word8# -> Word8#) -> Word -> Word -> Word
+apply2 opToTest (W# a) (W# b) =
+    let (# sa, sb #) = (# narrowWord8# a, narrowWord8# b #)
+        r = opToTest sa sb
+    in W# (extendWord8# r)
+{-# NOINLINE apply2 #-}
+
+apply3
+  :: (Word8# -> Word8# -> (# Word8#, Word8# #)) -> Word -> Word -> (Word, Word)
+apply3 opToTest (W# a) (W# b) =
+    let (# sa, sb #) = (# narrowWord8# a, narrowWord8# b #)
+        (# ra, rb #) = opToTest sa sb
+    in (W# (extendWord8# ra), W# (extendWord8# rb))
+{-# NOINLINE apply3 #-}
+
+instance
+        (Eq a, Eq b, Eq c, Eq d, Eq e, Eq f, Eq g, Eq h,
+         Eq i, Eq j, Eq k, Eq l, Eq m, Eq n, Eq o, Eq p)
+      => Eq (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) where
+    (a1, b1, c1, d1, e1, f1, g1, h1, i1, j1, k1, l1, m1, n1, o1, p1) ==
+        (a2, b2, c2, d2, e2, f2, g2, h2, i2, j2, k2, l2, m2, n2, o2, p2) =
+            a1 == a2 && b1 == b2 && c1 == c2 && d1 == d2 &&
+            e1 == e2 && f1 == f2 && g1 == g2 && h1 == h2 &&
+            i1 == i2 && j1 == j2 && k1 == k2 && l1 == l2 &&
+            m1 == m2 && n1 == n2 && o1 == o2 && p1 == p2
+
+instance
+        (Show a, Show b, Show c, Show d, Show e, Show f, Show g, Show h,
+         Show i, Show j, Show k, Show l, Show m, Show n, Show o, Show p)
+      => Show (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) where
+    show (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) =
+        "(" ++ show a ++ "," ++ show b ++ "," ++ show c ++ "," ++ show d ++
+        "," ++ show e ++ "," ++ show f ++ "," ++ show g ++ "," ++ show h ++
+        "," ++ show i ++ "," ++ show j ++ "," ++ show k ++ "," ++ show l ++
+        "," ++ show m ++ "," ++ show n ++ "," ++ show o ++ "," ++ show p ++
+        ")"
diff --git a/testsuite/tests/primops/should_run/ArithWord8.stdout b/testsuite/tests/primops/should_run/ArithWord8.stdout
new file mode 100644
index 0000000000000000000000000000000000000000..b745ea0a48ae8ba7620fb4c0343e60a7998779c7
--- /dev/null
+++ b/testsuite/tests/primops/should_run/ArithWord8.stdout
@@ -0,0 +1,8 @@
+Pass: passing Word8# on the stack
+Pass: notWord8#
+Pass: plusWord8#
+Pass: subWord8#
+Pass: timesWord8#
+Pass: remWord8#
+Pass: quotWord8#
+Pass: quotRemWord8#
diff --git a/testsuite/tests/primops/should_run/CmpInt8.hs b/testsuite/tests/primops/should_run/CmpInt8.hs
new file mode 100644
index 0000000000000000000000000000000000000000..daea22701d2735edad78ca46d574cafa9f078cdb
--- /dev/null
+++ b/testsuite/tests/primops/should_run/CmpInt8.hs
@@ -0,0 +1,84 @@
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE MagicHash #-}
+
+module Main where
+
+import Data.Int
+import Data.List
+import GHC.Prim
+import GHC.Exts
+
+
+-- Having a wrapper gives us two things:
+-- * it's easier to test everything (no need for code using raw primops)
+-- * we test the deriving mechanism for Int8#
+data TestInt8 = T8 Int8#
+    deriving (Eq, Ord)
+
+mkT8 :: Int -> TestInt8
+mkT8 (I# a) = T8 (narrowInt8# a)
+
+main :: IO ()
+main = do
+    let input = [ (a, b) | a <- allInt8, b <- allInt8 ]
+
+    --
+    -- (==)
+    --
+    let expected = [ a == b | (a, b) <- input ]
+        actual = [ mkT8 a == mkT8 b | (a, b) <- input ]
+    checkResults "(==)" input expected actual
+
+    --
+    -- (/=)
+    --
+    let expected = [ a /= b | (a, b) <- input ]
+        actual = [ mkT8 a /= mkT8 b | (a, b) <- input ]
+    checkResults "(/=)" input expected actual
+
+    --
+    -- (<)
+    --
+    let expected = [ a < b | (a, b) <- input ]
+        actual = [ mkT8 a < mkT8 b | (a, b) <- input ]
+    checkResults "(<)" input expected actual
+
+    --
+    -- (>)
+    --
+    let expected = [ a > b | (a, b) <- input ]
+        actual = [ mkT8 a > mkT8 b | (a, b) <- input ]
+    checkResults "(>)" input expected actual
+
+    --
+    -- (<=)
+    --
+    let expected = [ a <= b | (a, b) <- input ]
+        actual = [ mkT8 a <= mkT8 b | (a, b) <- input ]
+    checkResults "(<=)" input expected actual
+
+    --
+    -- (>=)
+    --
+    let expected = [ a >= b | (a, b) <- input ]
+        actual = [ mkT8 a >= mkT8 b | (a, b) <- input ]
+    checkResults "(>=)" input expected actual
+
+checkResults
+    :: (Eq a, Eq b, Show a, Show b) => String -> [a] -> [b] -> [b] -> IO ()
+checkResults test inputs expected actual =
+    case findIndex (\(e, a) -> e /= a) (zip expected actual) of
+        Nothing -> putStrLn $ "Pass: " ++ test
+        Just i -> error $
+            "FAILED: " ++ test ++ " for input: " ++ show (inputs !! i)
+              ++ " expected: " ++ show (expected !! i)
+              ++ " but got: " ++ show (actual !! i)
+
+allInt8 :: [Int]
+allInt8 = [ minInt8 .. maxInt8 ]
+
+minInt8 :: Int
+minInt8 = fromIntegral (minBound :: Int8)
+
+maxInt8 :: Int
+maxInt8 = fromIntegral (maxBound :: Int8)
diff --git a/testsuite/tests/primops/should_run/CmpInt8.stdout b/testsuite/tests/primops/should_run/CmpInt8.stdout
new file mode 100644
index 0000000000000000000000000000000000000000..191d2b4b2634567932879619fb1b85e48ed6d314
--- /dev/null
+++ b/testsuite/tests/primops/should_run/CmpInt8.stdout
@@ -0,0 +1,6 @@
+Pass: (==)
+Pass: (/=)
+Pass: (<)
+Pass: (>)
+Pass: (<=)
+Pass: (>=)
diff --git a/testsuite/tests/primops/should_run/CmpWord8.hs b/testsuite/tests/primops/should_run/CmpWord8.hs
new file mode 100644
index 0000000000000000000000000000000000000000..101f7837b59e5c7d772ddd1f5c0a64ccd74379ff
--- /dev/null
+++ b/testsuite/tests/primops/should_run/CmpWord8.hs
@@ -0,0 +1,84 @@
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE MagicHash #-}
+
+module Main where
+
+import Data.Word
+import Data.List
+import GHC.Prim
+import GHC.Exts
+
+
+-- Having a wrapper gives us two things:
+-- * it's easier to test everything (no need for code using raw primops)
+-- * we test the deriving mechanism for Word8#
+data TestWord8 = T8 Word8#
+    deriving (Eq, Ord)
+
+mkT8 :: Word -> TestWord8
+mkT8 (W# a) = T8 (narrowWord8# a)
+
+main :: IO ()
+main = do
+    let input = [ (a, b) | a <- allWord8, b <- allWord8 ]
+
+    --
+    -- (==)
+    --
+    let expected = [ a == b | (a, b) <- input ]
+        actual = [ mkT8 a == mkT8 b | (a, b) <- input ]
+    checkResults "(==)" input expected actual
+
+    --
+    -- (/=)
+    --
+    let expected = [ a /= b | (a, b) <- input ]
+        actual = [ mkT8 a /= mkT8 b | (a, b) <- input ]
+    checkResults "(/=)" input expected actual
+
+    --
+    -- (<)
+    --
+    let expected = [ a < b | (a, b) <- input ]
+        actual = [ mkT8 a < mkT8 b | (a, b) <- input ]
+    checkResults "(<)" input expected actual
+
+    --
+    -- (>)
+    --
+    let expected = [ a > b | (a, b) <- input ]
+        actual = [ mkT8 a > mkT8 b | (a, b) <- input ]
+    checkResults "(>)" input expected actual
+
+    --
+    -- (<=)
+    --
+    let expected = [ a <= b | (a, b) <- input ]
+        actual = [ mkT8 a <= mkT8 b | (a, b) <- input ]
+    checkResults "(<=)" input expected actual
+
+    --
+    -- (>=)
+    --
+    let expected = [ a >= b | (a, b) <- input ]
+        actual = [ mkT8 a >= mkT8 b | (a, b) <- input ]
+    checkResults "(>=)" input expected actual
+
+checkResults
+    :: (Eq a, Eq b, Show a, Show b) => String -> [a] -> [b] -> [b] -> IO ()
+checkResults test inputs expected actual =
+    case findIndex (\(e, a) -> e /= a) (zip expected actual) of
+        Nothing -> putStrLn $ "Pass: " ++ test
+        Just i -> error $
+            "FAILED: " ++ test ++ " for input: " ++ show (inputs !! i)
+              ++ " expected: " ++ show (expected !! i)
+              ++ " but got: " ++ show (actual !! i)
+
+allWord8 :: [Word]
+allWord8 = [ minWord8 .. maxWord8 ]
+
+minWord8 :: Word
+minWord8 = fromIntegral (minBound :: Word8)
+
+maxWord8 :: Word
+maxWord8 = fromIntegral (maxBound :: Word8)
diff --git a/testsuite/tests/primops/should_run/CmpWord8.stdout b/testsuite/tests/primops/should_run/CmpWord8.stdout
new file mode 100644
index 0000000000000000000000000000000000000000..191d2b4b2634567932879619fb1b85e48ed6d314
--- /dev/null
+++ b/testsuite/tests/primops/should_run/CmpWord8.stdout
@@ -0,0 +1,6 @@
+Pass: (==)
+Pass: (/=)
+Pass: (<)
+Pass: (>)
+Pass: (<=)
+Pass: (>=)
diff --git a/testsuite/tests/primops/should_run/ShowPrim.hs b/testsuite/tests/primops/should_run/ShowPrim.hs
new file mode 100644
index 0000000000000000000000000000000000000000..5670032f4ae209a7e31fe99c9345cf21995f15e5
--- /dev/null
+++ b/testsuite/tests/primops/should_run/ShowPrim.hs
@@ -0,0 +1,14 @@
+{-# LANGUAGE MagicHash #-}
+
+module Main where
+
+import GHC.Exts
+
+data Test = Test Int8# Word8#
+    deriving (Show)
+
+test1 :: Test
+test1 = Test (narrowInt8# 1#) (narrowWord8# 2##)
+
+main :: IO ()
+main = print test1
diff --git a/testsuite/tests/primops/should_run/ShowPrim.stdout b/testsuite/tests/primops/should_run/ShowPrim.stdout
new file mode 100644
index 0000000000000000000000000000000000000000..5720effb8bb4d71d16a175bbb5cc5855ab26fffe
--- /dev/null
+++ b/testsuite/tests/primops/should_run/ShowPrim.stdout
@@ -0,0 +1 @@
+Test (narrowInt8# 1#) (narrowWord8# 2##)
diff --git a/testsuite/tests/primops/should_run/all.T b/testsuite/tests/primops/should_run/all.T
index 742206d93d82238bd12593e20a46acd8db5e5921..ecf995bea8d26f026f7c9d0c9b10103a7f84aaf6 100644
--- a/testsuite/tests/primops/should_run/all.T
+++ b/testsuite/tests/primops/should_run/all.T
@@ -17,3 +17,8 @@ test('T10678',
      compile_and_run, ['-O'])
 test('T11296', normal, compile_and_run, [''])
 test('T13825-compile', normal, compile_and_run, [''])
+test('ArithInt8', omit_ways(['ghci']), compile_and_run, [''])
+test('ArithWord8', omit_ways(['ghci']), compile_and_run, [''])
+test('CmpInt8', normal, compile_and_run, [''])
+test('CmpWord8', normal, compile_and_run, [''])
+test('ShowPrim', normal, compile_and_run, [''])
diff --git a/utils/genprimopcode/Main.hs b/utils/genprimopcode/Main.hs
index e4779bf9163d9a528842f08a9e700d9d6b11ac32..e422c1fa5852fb9be06270edbdc282aee1077d9c 100644
--- a/utils/genprimopcode/Main.hs
+++ b/utils/genprimopcode/Main.hs
@@ -834,6 +834,8 @@ ppType (TyApp (TyCon "Any")         []) = "anyTy"
 ppType (TyApp (TyCon "Bool")        []) = "boolTy"
 
 ppType (TyApp (TyCon "Int#")        []) = "intPrimTy"
+ppType (TyApp (TyCon "Int8#")       []) = "int8PrimTy"
+ppType (TyApp (TyCon "Word8#")      []) = "word8PrimTy"
 ppType (TyApp (TyCon "Int32#")      []) = "int32PrimTy"
 ppType (TyApp (TyCon "Int64#")      []) = "int64PrimTy"
 ppType (TyApp (TyCon "Char#")       []) = "charPrimTy"