From 4b3667e04ca14d0b3ecf70d0a8135e482314b633 Mon Sep 17 00:00:00 2001
From: Bodigrim <andrew.lelechenko@gmail.com>
Date: Fri, 21 May 2021 01:54:41 +0100
Subject: [PATCH] Switch internal representation to UTF-8

---
 README.markdown                               |   2 +-
 cbits/cbits.c                                 | 232 +++---------------
 cbits/utils.c                                 |  11 +
 src/Data/Text.hs                              |  50 ++--
 src/Data/Text/Array.hs                        |  38 +--
 src/Data/Text/Encoding.hs                     | 139 ++++-------
 src/Data/Text/Foreign.hs                      | 105 ++++----
 src/Data/Text/Internal.hs                     |  32 +--
 src/Data/Text/Internal/Builder.hs             |   4 +-
 src/Data/Text/Internal/Encoding/Fusion.hs     |   6 +-
 src/Data/Text/Internal/Encoding/Utf8.hs       |  23 +-
 src/Data/Text/Internal/Fusion.hs              |  81 +++---
 src/Data/Text/Internal/Fusion/Common.hs       |  35 ++-
 src/Data/Text/Internal/Fusion/Size.hs         |  12 +-
 src/Data/Text/Internal/Fusion/Types.hs        |   2 +-
 .../Text/Internal/Lazy/Encoding/Fusion.hs     |  10 +-
 src/Data/Text/Internal/Lazy/Fusion.hs         |   4 +-
 src/Data/Text/Internal/Lazy/Search.hs         |  18 +-
 src/Data/Text/Internal/Search.hs              |  10 +-
 src/Data/Text/Internal/Unsafe/Char.hs         |  47 ++--
 src/Data/Text/Lazy.hs                         |  30 +--
 src/Data/Text/Lazy/Builder/Int.hs             |   9 +-
 src/Data/Text/Show.hs                         |   4 +-
 src/Data/Text/Unsafe.hs                       | 101 ++++----
 tests/Tests/Properties/LowLevel.hs            |  14 +-
 tests/Tests/QuickCheckUtils.hs                |   4 +-
 tests/Tests/Regressions.hs                    |   6 +-
 text.cabal                                    |  16 +-
 28 files changed, 428 insertions(+), 617 deletions(-)
 create mode 100644 cbits/utils.c

diff --git a/README.markdown b/README.markdown
index cf355297..8c27cb0e 100644
--- a/README.markdown
+++ b/README.markdown
@@ -29,4 +29,4 @@ based on the stream fusion framework developed by Roman Leshchinskiy,
 Duncan Coutts, and Don Stewart.
 
 The core library was fleshed out, debugged, and tested by Bryan
-O'Sullivan <bos@serpentine.com>, and he is the current maintainer.
+O'Sullivan. Transition from UTF-16 to UTF-8 is by Andrew Lelechenko.
diff --git a/cbits/cbits.c b/cbits/cbits.c
index 1d8322ae..33bab908 100644
--- a/cbits/cbits.c
+++ b/cbits/cbits.c
@@ -16,12 +16,6 @@
 
 #include "text_cbits.h"
 
-int _hs_text_memcmp(const void *a, size_t aoff, const void *b, size_t boff,
-                   size_t n)
-{
-  return memcmp(a + (aoff<<1), b + (boff<<1), n<<1);
-}
-
 #define UTF8_ACCEPT 0
 #define UTF8_REJECT 12
 
@@ -61,60 +55,24 @@ decode(uint32_t *state, uint32_t* codep, uint32_t byte) {
   return *state = utf8d[256 + *state + type];
 }
 
-/*
- * The ISO 8859-1 (aka latin-1) code points correspond exactly to the first 256 unicode
- * code-points, therefore we can trivially convert from a latin-1 encoded bytestring to
- * an UTF16 array
- */
-void
-_hs_text_decode_latin1(uint16_t *dest, const uint8_t *src,
+size_t
+_hs_text_decode_latin1(uint8_t *dest, const uint8_t *src,
                        const uint8_t *srcend)
 {
+  const uint8_t *dest0 = dest;
   const uint8_t *p = src;
 
-#if defined(__i386__) || defined(__x86_64__)
-  /* This optimization works on a little-endian systems by using
-     (aligned) 32-bit loads instead of 8-bit loads
-   */
-
-  /* consume unaligned prefix */
-  while (p != srcend && (uintptr_t)p & 0x3)
-    *dest++ = *p++;
-
-#if defined(__x86_64__)
-  /* All the intrinsics used here are from SSE2,
-   * so every x86_64 CPU supports them.
-   */
-  const __m128i zeros = _mm_set1_epi32(0);
-  while (p < srcend - 7) {
-    /* Load 8 bytes of ASCII data */
-    const __m128i ascii = _mm_cvtsi64_si128(*((const uint64_t *)p));
-    /* Interleave with zeros */
-    const __m128i utf16 = _mm_unpacklo_epi8(ascii, zeros);
-    /* Store the resulting 16 bytes into destination */
-    _mm_storeu_si128((__m128i *)dest, utf16);
-
-    dest += 8;
-    p += 8;
-  }
-#else
-  /* iterate over 32-bit aligned loads */
-  while (p < srcend - 3) {
-    const uint32_t w = *((const uint32_t *)p);
-
-    *dest++ =  w        & 0xff;
-    *dest++ = (w >> 8)  & 0xff;
-    *dest++ = (w >> 16) & 0xff;
-    *dest++ = (w >> 24) & 0xff;
-
-    p += 4;
+  while (p != srcend){
+    uint8_t codepoint = *p++;
+    if(codepoint < 0x80){
+      *dest++ = (uint8_t)codepoint;
+    } else {
+      *dest++ = (uint8_t) (0xC0 + (codepoint >> 6));
+      *dest++ = (uint8_t) (0x80 + (codepoint & 0x3F));
+    }
   }
-#endif
-#endif
 
-  /* handle unaligned suffix */
-  while (p != srcend)
-    *dest++ = *p++;
+  return (dest - dest0);
 }
 
 /*
@@ -146,82 +104,45 @@ _hs_text_decode_latin1(uint16_t *dest, const uint8_t *src,
  */
 #if defined(__GNUC__) || defined(__clang__)
 static inline uint8_t const *
-_hs_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
+_hs_text_decode_utf8_int(uint8_t *const dest, size_t *destoff,
 			 const uint8_t **src, const uint8_t *srcend,
 			 uint32_t *codepoint0, uint32_t *state0)
   __attribute((always_inline));
 #endif
 
 static inline uint8_t const *
-_hs_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
+_hs_text_decode_utf8_int(uint8_t *const dest, size_t *destoff,
 			 const uint8_t **src, const uint8_t *srcend,
 			 uint32_t *codepoint0, uint32_t *state0)
 {
-  uint16_t *d = dest + *destoff;
+  uint8_t *d = dest + *destoff;
   const uint8_t *s = *src, *last = *src;
   uint32_t state = *state0;
   uint32_t codepoint = *codepoint0;
 
   while (s < srcend) {
-#if defined(__i386__) || defined(__x86_64__)
-    /*
-     * This code will only work on a little-endian system that
-     * supports unaligned loads.
-     *
-     * It gives a substantial speed win on data that is purely or
-     * partly ASCII (e.g. HTML), at only a slight cost on purely
-     * non-ASCII text.
-     */
-
-    if (state == UTF8_ACCEPT) {
-#if defined(__x86_64__)
-      const __m128i zeros = _mm_set1_epi32(0);
-      while (s < srcend - 8) {
-        const uint64_t hopefully_eight_ascii_chars = *((uint64_t *) s);
-        if ((hopefully_eight_ascii_chars & 0x8080808080808080LL) != 0LL)
-          break;
-        s += 8;
-
-        /* Load 8 bytes of ASCII data */
-        const __m128i eight_ascii_chars = _mm_cvtsi64_si128(hopefully_eight_ascii_chars);
-        /* Interleave with zeros */
-        const __m128i eight_utf16_chars = _mm_unpacklo_epi8(eight_ascii_chars, zeros);
-        /* Store the resulting 16 bytes into destination */
-        _mm_storeu_si128((__m128i *)d, eight_utf16_chars);
-        d += 8;
-      }
-#else
-      while (s < srcend - 4) {
-        codepoint = *((uint32_t *) s);
-        if ((codepoint & 0x80808080) != 0)
-          break;
-        s += 4;
-        /*
-         * Tried 32-bit stores here, but the extra bit-twiddling
-         * slowed the code down.
-         */
-        *d++ = (uint16_t) (codepoint & 0xff);
-        *d++ = (uint16_t) ((codepoint >> 8) & 0xff);
-        *d++ = (uint16_t) ((codepoint >> 16) & 0xff);
-        *d++ = (uint16_t) ((codepoint >> 24) & 0xff);
-      }
-#endif
-      last = s;
-    } /* end if (state == UTF8_ACCEPT) */
-#endif
-
     if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
       if (state != UTF8_REJECT)
-	continue;
+	      continue;
       break;
     }
 
-    if (codepoint <= 0xffff)
-      *d++ = (uint16_t) codepoint;
-    else {
-      *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));
-      *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));
+    if(codepoint < 0x80){
+      *d++ = (uint8_t) codepoint;
+    } else if(codepoint < 0x800){
+      *d++ = (uint8_t) (0xC0 + (codepoint >> 6));
+      *d++ = (uint8_t) (0x80 + (codepoint & 0x3F));
+    } else if(codepoint < 0x10000){
+      *d++ = (uint8_t) (0xE0 + (codepoint >> 12));
+      *d++ = (uint8_t) (0x80 + ((codepoint >> 6) & 0x3F));
+      *d++ = (uint8_t) (0x80 + (codepoint & 0x3F));
+    } else {
+      *d++ = (uint8_t) (0xF0 + (codepoint >> 18));
+      *d++ = (uint8_t) (0x80 + ((codepoint >> 12) & 0x3F));
+      *d++ = (uint8_t) (0x80 + ((codepoint >> 6) & 0x3F));
+      *d++ = (uint8_t) (0x80 + (codepoint & 0x3F));
     }
+
     last = s;
   }
 
@@ -234,7 +155,7 @@ _hs_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
 }
 
 uint8_t const *
-_hs_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
+_hs_text_decode_utf8_state(uint8_t *const dest, size_t *destoff,
                            const uint8_t **src,
                            const uint8_t *srcend,
                            uint32_t *codepoint0, uint32_t *state0)
@@ -248,7 +169,7 @@ _hs_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
  * Helper to decode buffer and discard final decoder state
  */
 const uint8_t *
-_hs_text_decode_utf8(uint16_t *const dest, size_t *destoff,
+_hs_text_decode_utf8(uint8_t *const dest, size_t *destoff,
                      const uint8_t *src, const uint8_t *const srcend)
 {
   uint32_t codepoint;
@@ -257,90 +178,3 @@ _hs_text_decode_utf8(uint16_t *const dest, size_t *destoff,
                           &codepoint, &state);
   return src;
 }
-
-void
-_hs_text_encode_utf8(uint8_t **destp, const uint16_t *src, size_t srcoff,
-		     size_t srclen)
-{
-  const uint16_t *srcend;
-  uint8_t *dest = *destp;
-
-  src += srcoff;
-  srcend = src + srclen;
-
- ascii:
-#if defined(__x86_64__)
-  while (srcend - src >= 8) {
-    union { uint64_t halves[2]; __m128i whole; } eight_chars;
-    eight_chars.whole = _mm_loadu_si128((__m128i *) src);
-
-    const uint64_t w = eight_chars.halves[0];
-    if (w & 0xFF80FF80FF80FF80ULL) {
-      if (!(w & 0x000000000000FF80ULL)) {
-        *dest++ = w & 0xFFFF;
-        src++;
-        if (!(w & 0x00000000FF800000ULL)) {
-          *dest++ = (w >> 16) & 0xFFFF;
-          src++;
-          if (!(w & 0x0000FF8000000000ULL)) {
-            *dest++ = (w >> 32) & 0xFFFF;
-            src++;
-          }
-        }
-      }
-      break;
-    }
-
-    if (eight_chars.halves[1] & 0xFF80FF80FF80FF80ULL) {
-      break;
-    }
-
-    const __m128i eight_ascii_chars = _mm_packus_epi16(eight_chars.whole, eight_chars.whole);
-    _mm_storel_epi64((__m128i *)dest, eight_ascii_chars);
-
-    dest += 8;
-    src += 8;
-  }
-#endif
-
-#if defined(__i386__)
-  while (srcend - src >= 2) {
-    uint32_t w = *((uint32_t *) src);
-
-    if (w & 0xFF80FF80)
-      break;
-    *dest++ = w & 0xFFFF;
-    *dest++ = w >> 16;
-    src += 2;
-  }
-#endif
-
-  while (src < srcend) {
-    uint16_t w = *src++;
-
-    if (w <= 0x7F) {
-      *dest++ = w;
-      /* An ASCII byte is likely to begin a run of ASCII bytes.
-	 Falling back into the fast path really helps performance. */
-      goto ascii;
-    }
-    else if (w <= 0x7FF) {
-      *dest++ = (w >> 6) | 0xC0;
-      *dest++ = (w & 0x3f) | 0x80;
-    }
-    else if (w < 0xD800 || w > 0xDBFF) {
-      *dest++ = (w >> 12) | 0xE0;
-      *dest++ = ((w >> 6) & 0x3F) | 0x80;
-      *dest++ = (w & 0x3F) | 0x80;
-    } else {
-      uint32_t c = ((((uint32_t) w) - 0xD800) << 10) +
-	(((uint32_t) *src++) - 0xDC00) + 0x10000;
-      *dest++ = (c >> 18) | 0xF0;
-      *dest++ = ((c >> 12) & 0x3F) | 0x80;
-      *dest++ = ((c >> 6) & 0x3F) | 0x80;
-      *dest++ = (c & 0x3F) | 0x80;
-    }
-  }
-
-  *destp = dest;
-}
diff --git a/cbits/utils.c b/cbits/utils.c
new file mode 100644
index 00000000..2baa78c7
--- /dev/null
+++ b/cbits/utils.c
@@ -0,0 +1,11 @@
+/*
+ * Copyright (c) 2021 Andrew Lelechenko <andrew.lelechenko@gmail.com>
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+int _hs_text_memcmp(const void *arr1, size_t off1, const void *arr2, size_t off2, size_t len)
+{
+  return memcmp(arr1 + off1, arr2 + off2, len);
+}
diff --git a/src/Data/Text.hs b/src/Data/Text.hs
index 9bac1b9c..b035ed71 100644
--- a/src/Data/Text.hs
+++ b/src/Data/Text.hs
@@ -9,6 +9,7 @@
 -- Copyright   : (c) 2009, 2010, 2011, 2012 Bryan O'Sullivan,
 --               (c) 2009 Duncan Coutts,
 --               (c) 2008, 2009 Tom Harper
+--               (c) 2021 Andrew Lelechenko
 --
 -- License     : BSD-style
 -- Maintainer  : bos@serpentine.com
@@ -227,10 +228,8 @@ import Data.Text.Internal.Private (span_)
 import Data.Text.Internal (Text(..), empty, firstf, mul, safe, text)
 import Data.Text.Show (singleton, unpack, unpackCString#)
 import qualified Prelude as P
-import Data.Text.Unsafe (Iter(..), iter, iter_, lengthWord16, reverseIter,
+import Data.Text.Unsafe (Iter(..), iter, iter_, lengthWord8, reverseIter,
                          reverseIter_, unsafeHead, unsafeTail)
-import Data.Text.Internal.Unsafe.Char (unsafeChr)
-import qualified Data.Text.Internal.Encoding.Utf16 as U16
 import Data.Text.Internal.Search (indices)
 #if defined(__HADDOCK__)
 import Data.ByteString (ByteString)
@@ -291,7 +290,8 @@ import Text.Printf (PrintfArg, formatArg, formatString)
 -- points
 -- (<http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf#page=13 §3.4, definition D10 >)
 -- as 'Char' values, including code points from this invalid range.
--- This means that there are some 'Char' values that are not valid
+-- This means that there are some 'Char' values
+-- (corresponding to 'Data.Char.Surrogate' category) that are not valid
 -- Unicode scalar values, and the functions in this module must handle
 -- those cases.
 --
@@ -300,12 +300,7 @@ import Text.Printf (PrintfArg, formatArg, formatString)
 -- that are not valid Unicode scalar values with the replacement
 -- character \"&#xfffd;\" (U+FFFD).  Functions that perform this
 -- inspection and replacement are documented with the phrase
--- \"Performs replacement on invalid scalar values\".
---
--- (One reason for this policy of replacement is that internally, a
--- 'Text' value is represented as packed UTF-16 data. Values in the
--- range U+D800 through U+DFFF are used by UTF-16 to denote surrogate
--- code points, and so cannot be represented. The functions replace
+-- \"Performs replacement on invalid scalar values\". The functions replace
 -- invalid scalar values, instead of dropping them, as a security
 -- measure. For details, see
 -- <http://unicode.org/reports/tr36/#Deletion_of_Noncharacters Unicode Technical Report 36, §3.5 >.)
@@ -487,12 +482,9 @@ second f (a, b) = (a, f b)
 -- | /O(1)/ Returns the last character of a 'Text', which must be
 -- non-empty.
 last :: Text -> Char
-last (Text arr off len)
-    | len <= 0                 = emptyError "last"
-    | n < 0xDC00 || n > 0xDFFF = unsafeChr n
-    | otherwise                = U16.chr2 n0 n
-    where n  = A.unsafeIndex arr (off+len-1)
-          n0 = A.unsafeIndex arr (off+len-2)
+last t@(Text _ _ len)
+    | len <= 0  = emptyError "last"
+    | otherwise = let Iter c _ = reverseIter t (len - 1) in c
 {-# INLINE [1] last #-}
 
 -- | /O(1)/ Returns all characters after the head of a 'Text', which
@@ -507,11 +499,9 @@ tail t@(Text arr off len)
 -- | /O(1)/ Returns all but the last character of a 'Text', which must
 -- be non-empty.
 init :: Text -> Text
-init (Text arr off len) | len <= 0                   = emptyError "init"
-                        | n >= 0xDC00 && n <= 0xDFFF = text arr off (len-2)
-                        | otherwise                  = text arr off (len-1)
-    where
-      n = A.unsafeIndex arr (off+len-1)
+init t@(Text arr off len)
+    | len <= 0  = emptyError "init"
+    | otherwise = text arr off (len + reverseIter_ t (len - 1))
 {-# INLINE [1] init #-}
 
 -- | /O(1)/ Returns all but the last character and the last character of a
@@ -519,12 +509,11 @@ init (Text arr off len) | len <= 0                   = emptyError "init"
 --
 -- @since 1.2.3.0
 unsnoc :: Text -> Maybe (Text, Char)
-unsnoc (Text arr off len)
-    | len <= 0                 = Nothing
-    | n < 0xDC00 || n > 0xDFFF = Just (text arr off (len-1), unsafeChr n)
-    | otherwise                = Just (text arr off (len-2), U16.chr2 n0 n)
-    where n  = A.unsafeIndex arr (off+len-1)
-          n0 = A.unsafeIndex arr (off+len-2)
+unsnoc t@(Text arr off len)
+    | len <= 0  = Nothing
+    | otherwise = Just (text arr off (len + d), c)
+        where
+            Iter c d = reverseIter t (len - 1)
 {-# INLINE [1] unsnoc #-}
 
 -- | /O(1)/ Tests whether a 'Text' is empty or not.
@@ -911,7 +900,7 @@ concat ts = case ts' of
               _ -> Text (A.run go) 0 len
   where
     ts' = L.filter (not . null) ts
-    len = sumP "concat" $ L.map lengthWord16 ts'
+    len = sumP "concat" $ L.map lengthWord8 ts'
     go :: ST s (A.MArray s)
     go = do
       arr <- A.new len
@@ -1263,7 +1252,7 @@ groupBy p = loop
         where Iter c d = iter t 0
               n     = d + findAIndexOrEnd (not . p c) (Text arr (off+d) (len-d))
 
--- | Returns the /array/ index (in units of 'Word16') at which a
+-- | Returns the /array/ index (in units of 'Word8') at which a
 -- character may be found.  This is /not/ the same as the logical
 -- index returned by e.g. 'findIndex'.
 findAIndexOrEnd :: (Char -> Bool) -> Text -> Int
@@ -1569,9 +1558,10 @@ words t@(Text arr off len) = loop 0 0
         | n >= len = if start == n
                      then []
                      else [Text arr (start+off) (n-start)]
+        -- Spaces in UTF-8 can take from 1 byte for 0x09 and up to 3 bytes for 0x3000.
         | isSpace c =
             if start == n
-            then loop (start+1) (start+1)
+            then loop (n+d) (n+d)
             else Text arr (start+off) (n-start) : loop (n+d) (n+d)
         | otherwise = loop start (n+d)
         where Iter c d = iter t n
diff --git a/src/Data/Text/Array.hs b/src/Data/Text/Array.hs
index c406c983..6566f40a 100644
--- a/src/Data/Text/Array.hs
+++ b/src/Data/Text/Array.hs
@@ -44,14 +44,14 @@ module Data.Text.Array
 import Control.Exception (assert)
 import GHC.Stack (HasCallStack)
 #endif
-import Data.Bits ((.&.), xor, shiftL, shiftR)
+import Data.Bits ((.&.), xor, shiftR)
 #if !MIN_VERSION_base(4,11,0)
 import Data.Text.Internal.Unsafe (inlinePerformIO)
-#endif
 import Foreign.C.Types (CInt(..))
+#endif
 import GHC.Exts hiding (toList)
 import GHC.ST (ST(..), runST)
-import GHC.Word (Word16(..))
+import GHC.Word (Word8(..))
 import Prelude hiding (length, read)
 
 -- | Immutable array type.
@@ -88,7 +88,7 @@ unsafeFreeze MArray{..} = ST $ \s1# ->
 -- | Indicate how many bytes would be used for an array of the given
 -- size.
 bytesInArray :: Int -> Int
-bytesInArray n = n `shiftL` 1
+bytesInArray n = n
 {-# INLINE bytesInArray #-}
 
 -- | Unchecked read of an immutable array.  May return garbage or
@@ -97,15 +97,15 @@ unsafeIndex ::
 #if defined(ASSERTS)
   HasCallStack =>
 #endif
-  Array -> Int -> Word16
+  Array -> Int -> Word8
 unsafeIndex a@Array{..} i@(I# i#) =
 #if defined(ASSERTS)
-  let word16len = I# (sizeofByteArray# aBA) `quot` 2 in
-  if i < 0 || i >= word16len
-  then error ("Data.Text.Array.unsafeIndex: bounds error, offset " ++ show i ++ ", length " ++ show word16len)
+  let word8len = I# (sizeofByteArray# aBA) in
+  if i < 0 || i >= word8len
+  then error ("Data.Text.Array.unsafeIndex: bounds error, offset " ++ show i ++ ", length " ++ show word8len)
   else
 #endif
-  case indexWord16Array# aBA i# of r# -> (W16# r#)
+  case indexWord8Array# aBA i# of r# -> (W8# r#)
 {-# INLINE unsafeIndex #-}
 
 #if defined(ASSERTS)
@@ -130,17 +130,17 @@ unsafeWrite ::
 #if defined(ASSERTS)
   HasCallStack =>
 #endif
-  MArray s -> Int -> Word16 -> ST s ()
-unsafeWrite ma@MArray{..} i@(I# i#) (W16# e#) =
+  MArray s -> Int -> Word8 -> ST s ()
+unsafeWrite ma@MArray{..} i@(I# i#) (W8# e#) =
 #if defined(ASSERTS)
-  checkBoundsM ma (i * 2) 2 >>
+  checkBoundsM ma i 1 >>
 #endif
-  (ST $ \s1# -> case writeWord16Array# maBA i# e# s1# of
+  (ST $ \s1# -> case writeWord8Array# maBA i# e# s1# of
     s2# -> (# s2#, () #))
 {-# INLINE unsafeWrite #-}
 
 -- | Convert an immutable array to a list.
-toList :: Array -> Int -> Int -> [Word16]
+toList :: Array -> Int -> Int -> [Word8]
 toList ary off len = loop 0
     where loop i | i < len   = unsafeIndex ary (off+i) : loop (i+1)
                  | otherwise = []
@@ -176,10 +176,10 @@ copyM dst@(MArray dst#) dstOff@(I# dstOff#) src@(MArray src#) srcOff@(I# srcOff#
 #if defined(ASSERTS)
     srcLen <- getSizeofMArray src
     dstLen <- getSizeofMArray dst
-    assert (srcOff + count <= srcLen `quot` 2) .
-      assert (dstOff + count <= dstLen `quot` 2) .
+    assert (srcOff + count <= srcLen) .
+      assert (dstOff + count <= dstLen) .
 #endif
-      ST $ \s1# -> case copyMutableByteArray# src# (2# *# srcOff#) dst# (2# *# dstOff#) (2# *# count#) s1# of
+      ST $ \s1# -> case copyMutableByteArray# src# srcOff# dst# dstOff# count# s1# of
         s2# -> (# s2#, () #)
 {-# INLINE copyM #-}
 
@@ -194,7 +194,7 @@ copyI :: MArray s               -- ^ Destination
 copyI (MArray dst#) dstOff@(I# dstOff#) (Array src#) (I# srcOff#) top@(I# top#)
     | dstOff >= top = return ()
     | otherwise = ST $ \s1# ->
-      case copyByteArray# src# (2# *# srcOff#) dst# (2# *# dstOff#) (2# *# (top# -# dstOff#)) s1# of
+      case copyByteArray# src# srcOff# dst# dstOff# (top# -# dstOff#) s1# of
         s2# -> (# s2#, () #)
 {-# INLINE copyI #-}
 
@@ -209,7 +209,7 @@ equal :: Array                  -- ^ First
 equal (Array src1#) (I# off1#) (Array src2#) (I# off2#) (I# count#) = i == 0
   where
 #if MIN_VERSION_base(4,11,0)
-    i = I# (compareByteArrays# src1# (2# *# off1#) src2# (2# *# off2#) (2# *# count#))
+    i = I# (compareByteArrays# src1# off1# src2# off2# count#)
 #else
     i = fromIntegral (inlinePerformIO (memcmp src1# off1# src2# off2# count#))
 
diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index d44e73ff..186e12b4 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -7,6 +7,7 @@
 -- Copyright   : (c) 2009, 2010, 2011 Bryan O'Sullivan,
 --               (c) 2009 Duncan Coutts,
 --               (c) 2008, 2009 Tom Harper
+--               (c) 2021 Andrew Lelechenko
 --
 -- License     : BSD-style
 -- Maintainer  : bos@serpentine.com
@@ -63,30 +64,29 @@ import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
 
 import Control.Exception (evaluate, try, throwIO, ErrorCall(ErrorCall))
 import Control.Monad.ST (runST)
-import Data.Bits ((.&.), shiftR)
 import Data.ByteString as B
-import qualified Data.ByteString.Internal as B
+import qualified Data.ByteString.Short.Internal as SBS
 import Data.Foldable (traverse_)
 import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode)
 import Data.Text.Internal (Text(..), safe, text)
 import Data.Text.Internal.Private (runText)
 import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
-import Data.Text.Internal.Unsafe.Char (ord, unsafeWrite)
+import Data.Text.Internal.Unsafe.Char (unsafeWrite)
 import Data.Text.Show ()
 import Data.Text.Unsafe (unsafeDupablePerformIO)
-import Data.Word (Word8, Word16, Word32)
-import Foreign.C.Types (CSize(CSize))
+import Data.Word (Word8, Word32)
+import Foreign.C.Types (CSize)
 import Foreign.Marshal.Utils (with)
 import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
 import Foreign.Storable (Storable, peek, poke)
-import GHC.Base (ByteArray#, MutableByteArray#)
+import GHC.Base (MutableByteArray#)
 import qualified Data.ByteString.Builder as B
 import qualified Data.ByteString.Builder.Internal as B hiding (empty, append)
 import qualified Data.ByteString.Builder.Prim as BP
 import qualified Data.ByteString.Builder.Prim.Internal as BP
+import Data.Text.Internal.Encoding.Utf8 (utf8LengthByLeader)
 import qualified Data.Text.Array as A
 import qualified Data.Text.Internal.Encoding.Fusion as E
-import qualified Data.Text.Internal.Encoding.Utf16 as U16
 import qualified Data.Text.Internal.Fusion as F
 import Data.Text.Internal.ByteStringCompat
 #if defined(ASSERTS)
@@ -124,12 +124,12 @@ decodeLatin1 ::
 #endif
   ByteString -> Text
 decodeLatin1 bs = withBS bs aux where
-  aux fp len = text a 0 len
+  aux fp len = text a 0 actualLen
    where
-    a = A.run (A.new len >>= unsafeIOToST . go)
-    go dest = unsafeWithForeignPtr fp $ \ptr -> do
-      c_decode_latin1 (A.maBA dest) ptr (ptr `plusPtr` len)
-      return dest
+    (a, actualLen) = A.run2 (A.new (2 * len) >>= unsafeIOToST . go)
+    go dest = unsafeWithForeignPtr fp $ \src -> do
+      destLen <- c_decode_latin1 (A.maBA dest) src (src `plusPtr` len)
+      return (dest, destLen)
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text.
 --
@@ -161,6 +161,8 @@ decodeUtf8With onErr bs = withBS bs aux
                       case onErr desc (Just x) of
                         Nothing -> loop $ curPtr' `plusPtr` 1
                         Just c
+                          -- TODO This is problematic, because even BMP replacement characters
+                          -- can take longer than one UTF8 code unit (which is byte).
                           | c > '\xFFFF' -> throwUnsupportedReplChar
                           | otherwise -> do
                               destOff <- peek destOffPtr
@@ -170,43 +172,14 @@ decodeUtf8With onErr bs = withBS bs aux
                               poke destOffPtr (destOff + intToCSize w)
                               loop $ curPtr' `plusPtr` 1
             loop ptr
-    (unsafeIOToST . go) =<< A.new len
+    -- TODO (len * 2 + 100) assumes that invalid input is asymptotically rare.
+    -- This is incorrect in general, but for now we just want to pass tests.
+    (unsafeIOToST . go) =<< A.new (len * 2 + 100)
    where
     desc = "Data.Text.Internal.Encoding.decodeUtf8: Invalid UTF-8 stream"
 
     throwUnsupportedReplChar = throwIO $
       ErrorCall "decodeUtf8With: non-BMP replacement characters not supported"
-  -- TODO: The code currently assumes that the transcoded UTF-16
-  -- stream is at most twice as long (in bytes) as the input UTF-8
-  -- stream. To justify this assumption one has to assume that the
-  -- error handler replacement character also satisfies this
-  -- invariant, by emitting at most one UTF16 code unit.
-  --
-  -- One easy way to support the full range of code-points for
-  -- replacement characters in the error handler is to simply change
-  -- the (over-)allocation to `A.new (2*len)` and then shrink back the
-  -- `ByteArray#` to the real size (recent GHCs have a cheap
-  -- `ByteArray#` resize-primop for that which allow the GC to reclaim
-  -- the overallocation). However, this would require 4 times as much
-  -- (temporary) storage as the original UTF-8 required.
-  --
-  -- Another strategy would be to optimistically assume that
-  -- replacement characters are within the BMP, and if the case of a
-  -- non-BMP replacement occurs reallocate the target buffer (or throw
-  -- an exception, and fallback to a pessimistic codepath, like e.g.
-  -- `decodeUtf8With onErr bs = F.unstream (E.streamUtf8 onErr bs)`)
-  --
-  -- Alternatively, `OnDecodeError` could become a datastructure which
-  -- statically encodes the replacement-character range,
-  -- e.g. something isomorphic to
-  --
-  --   Either (... -> Maybe Word16) (... -> Maybe Char)
-  --
-  -- And allow to statically switch between the BMP/non-BMP
-  -- replacement-character codepaths. There's multiple ways to address
-  -- this with different tradeoffs; but ideally we should optimise for
-  -- the optimistic/error-free case.
-{- INLINE[0] decodeUtf8With #-}
 
 -- $stream
 --
@@ -304,14 +277,15 @@ streamDecodeUtf8With ::
 streamDecodeUtf8With onErr = decodeChunk B.empty 0 0
  where
   -- We create a slightly larger than necessary buffer to accommodate a
-  -- potential surrogate pair started in the last buffer (@undecoded0@), or
+  -- potential code point started in the last buffer (@undecoded0@), or
   -- replacement characters for each byte in @undecoded0@ if the
   -- sequence turns out to be invalid. There can be up to three bytes there,
-  -- hence we allocate @len+3@ 16-bit words.
+  -- hence we allocate @len+3@ bytes.
   decodeChunk :: ByteString -> CodePoint -> DecoderState -> ByteString
               -> Decoding
   decodeChunk undecoded0 codepoint0 state0 bs = withBS bs aux where
-    aux fp len = runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+3)
+    -- TODO Replace (+100) with something sensible.
+    aux fp len = runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+100)
        where
         decodeChunkToBuffer :: A.MArray s -> IO Decoding
         decodeChunkToBuffer dest = unsafeWithForeignPtr fp $ \ptr ->
@@ -342,7 +316,7 @@ streamDecodeUtf8With onErr = decodeChunk B.empty 0 0
                         -- the previous chunk, we invalidate the bytes from
                         -- @undecoded0@ and retry decoding the current chunk from
                         -- the initial state.
-                        traverse_ skipByte (B.unpack undecoded0 )
+                        traverse_ skipByte (B.unpack undecoded0)
                         loop lastPtr
                       else do
                         peek lastPtr >>= skipByte
@@ -436,50 +410,33 @@ encodeUtf8BuilderEscaped be =
             goPartial !iendTmp = go i0 op0
               where
                 go !i !op
-                  | i < iendTmp = case A.unsafeIndex arr i of
-                      w | w <= 0x7F -> do
-                            BP.runB be (word16ToWord8 w) op >>= go (i + 1)
-                        | w <= 0x7FF -> do
-                            poke8 @Word16 0 $ (w `shiftR` 6) + 0xC0
-                            poke8 @Word16 1 $ (w .&. 0x3f) + 0x80
-                            go (i + 1) (op `plusPtr` 2)
-                        | 0xD800 <= w && w <= 0xDBFF -> do
-                            let c = ord $ U16.chr2 w (A.unsafeIndex arr (i+1))
-                            poke8 @Int 0 $ (c `shiftR` 18) + 0xF0
-                            poke8 @Int 1 $ ((c `shiftR` 12) .&. 0x3F) + 0x80
-                            poke8 @Int 2 $ ((c `shiftR` 6) .&. 0x3F) + 0x80
-                            poke8 @Int 3 $ (c .&. 0x3F) + 0x80
-                            go (i + 2) (op `plusPtr` 4)
-                        | otherwise -> do
-                            poke8 @Word16 0 $ (w `shiftR` 12) + 0xE0
-                            poke8 @Word16 1 $ ((w `shiftR` 6) .&. 0x3F) + 0x80
-                            poke8 @Word16 2 $ (w .&. 0x3F) + 0x80
-                            go (i + 1) (op `plusPtr` 3)
-                  | otherwise =
-                      outerLoop i (B.BufferRange op ope)
+                  | i < iendTmp = case utf8LengthByLeader w of
+                    1 -> do
+                      BP.runB be w op >>= go (i + 1)
+                    2 -> do
+                      poke (op `plusPtr` 0) w
+                      poke (op `plusPtr` 1) (A.unsafeIndex arr (i+1))
+                      go (i + 2) (op `plusPtr` 2)
+                    3 -> do
+                      poke (op `plusPtr` 0) w
+                      poke (op `plusPtr` 1) (A.unsafeIndex arr (i+1))
+                      poke (op `plusPtr` 2) (A.unsafeIndex arr (i+2))
+                      go (i + 3) (op `plusPtr` 3)
+                    _ -> do
+                      poke (op `plusPtr` 0) w
+                      poke (op `plusPtr` 1) (A.unsafeIndex arr (i+1))
+                      poke (op `plusPtr` 2) (A.unsafeIndex arr (i+2))
+                      poke (op `plusPtr` 3) (A.unsafeIndex arr (i+3))
+                      go (i + 4) (op `plusPtr` 4)
+                  | otherwise = outerLoop i (B.BufferRange op ope)
                   where
-                    -- Take care, a is either Word16 or Int above
-                    poke8 :: Integral a => Int -> a -> IO ()
-                    poke8 j v = poke (op `plusPtr` j) (fromIntegral v :: Word8)
+                    w = A.unsafeIndex arr i
 
 -- | Encode text using UTF-8 encoding.
 encodeUtf8 :: Text -> ByteString
-encodeUtf8 (Text arr off len)
+encodeUtf8 (Text (A.Array arr) off len)
   | len == 0  = B.empty
-  | otherwise = unsafeDupablePerformIO $ do
-  fp <- B.mallocByteString (len*3) -- see https://github.com/haskell/text/issues/194 for why len*3 is enough
-  unsafeWithForeignPtr fp $ \ptr ->
-    with ptr $ \destPtr -> do
-      c_encode_utf8 destPtr (A.aBA arr) (intToCSize off) (intToCSize len)
-      newDest <- peek destPtr
-      let utf8len = newDest `minusPtr` ptr
-      if utf8len >= len `shiftR` 1
-        then return (mkBS fp utf8len)
-        else do
-          fp' <- B.mallocByteString utf8len
-          unsafeWithForeignPtr fp' $ \ptr' -> do
-            B.memcpy ptr' ptr utf8len
-            return (mkBS fp' utf8len)
+  | otherwise = B.take len $ B.drop off $ SBS.fromShort $ SBS.SBS arr
 
 -- | Decode text from little endian UTF-16 encoding.
 decodeUtf16LEWith :: OnDecodeError -> ByteString -> Text
@@ -563,9 +520,6 @@ cSizeToInt = fromIntegral
 intToCSize :: Int -> CSize
 intToCSize = fromIntegral
 
-word16ToWord8 :: Word16 -> Word8
-word16ToWord8 = fromIntegral
-
 foreign import ccall unsafe "_hs_text_decode_utf8" c_decode_utf8
     :: MutableByteArray# s -> Ptr CSize
     -> Ptr Word8 -> Ptr Word8 -> IO (Ptr Word8)
@@ -576,7 +530,4 @@ foreign import ccall unsafe "_hs_text_decode_utf8_state" c_decode_utf8_with_stat
     -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8)
 
 foreign import ccall unsafe "_hs_text_decode_latin1" c_decode_latin1
-    :: MutableByteArray# s -> Ptr Word8 -> Ptr Word8 -> IO ()
-
-foreign import ccall unsafe "_hs_text_encode_utf8" c_encode_utf8
-    :: Ptr (Ptr Word8) -> ByteArray# -> CSize -> CSize -> IO ()
+    :: MutableByteArray# s -> Ptr Word8 -> Ptr Word8 -> IO Int
diff --git a/src/Data/Text/Foreign.hs b/src/Data/Text/Foreign.hs
index 65805b2f..87742011 100644
--- a/src/Data/Text/Foreign.hs
+++ b/src/Data/Text/Foreign.hs
@@ -14,7 +14,7 @@ module Data.Text.Foreign
     (
     -- * Interoperability with native code
     -- $interop
-      I16
+      I8
     -- * Safe conversion functions
     , fromPtr
     , useAsPtr
@@ -23,12 +23,12 @@ module Data.Text.Foreign
     , peekCStringLen
     , withCStringLen
     -- * Unsafe conversion code
-    , lengthWord16
+    , lengthWord8
     , unsafeCopyToPtr
     -- * Low-level manipulation
     -- $lowlevel
-    , dropWord16
-    , takeWord16
+    , dropWord8
+    , takeWord8
     ) where
 
 #if defined(ASSERTS)
@@ -39,8 +39,8 @@ import Data.ByteString.Unsafe (unsafePackCStringLen, unsafeUseAsCStringLen)
 import Data.Text.Encoding (decodeUtf8, encodeUtf8)
 import Data.Text.Internal (Text(..), empty)
 import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
-import Data.Text.Unsafe (lengthWord16)
-import Data.Word (Word16)
+import Data.Text.Unsafe (lengthWord8)
+import Data.Word (Word8)
 import Foreign.C.String (CStringLen)
 import Foreign.ForeignPtr (ForeignPtr, mallocForeignPtrArray)
 import Foreign.Marshal.Alloc (allocaBytes)
@@ -54,24 +54,22 @@ import qualified Data.Text.Array as A
 -- to have a fixed address in the Haskell heap. All communication with
 -- native code must thus occur by copying data back and forth.
 --
--- The 'Text' type's internal representation is UTF-16, using the
--- platform's native endianness.  This makes copied data suitable for
--- use with native libraries that use a similar representation, such
--- as ICU.  To interoperate with native libraries that use different
--- internal representations, such as UTF-8 or UTF-32, consider using
+-- The 'Text' type's internal representation is UTF-8.
+-- To interoperate with native libraries that use different
+-- internal representations, such as UTF-16 or UTF-32, consider using
 -- the functions in the 'Data.Text.Encoding' module.
 
--- | A type representing a number of UTF-16 code units.
-newtype I16 = I16 Int
+-- | A type representing a number of UTF-8 code units.
+newtype I8 = I8 Int
     deriving (Bounded, Enum, Eq, Integral, Num, Ord, Read, Real, Show)
 
--- | /O(n)/ Create a new 'Text' from a 'Ptr' 'Word16' by copying the
+-- | /O(n)/ Create a new 'Text' from a 'Ptr' 'Word8' by copying the
 -- contents of the array.
-fromPtr :: Ptr Word16           -- ^ source array
-        -> I16                  -- ^ length of source array (in 'Word16' units)
+fromPtr :: Ptr Word8           -- ^ source array
+        -> I8                  -- ^ length of source array (in 'Word8' units)
         -> IO Text
-fromPtr _   (I16 0)   = return empty
-fromPtr ptr (I16 len) =
+fromPtr _   (I8 0)   = return empty
+fromPtr ptr (I8 len) =
 #if defined(ASSERTS)
     assert (len > 0) $
 #endif
@@ -83,72 +81,77 @@ fromPtr ptr (I16 len) =
         loop !p !i | i == len = return marr
                    | otherwise = do
           A.unsafeWrite marr i =<< unsafeIOToST (peek p)
-          loop (p `plusPtr` 2) (i + 1)
+          loop (p `plusPtr` 1) (i + 1)
 
 -- $lowlevel
 --
--- Foreign functions that use UTF-16 internally may return indices in
--- units of 'Word16' instead of characters.  These functions may
+-- Foreign functions that use UTF-8 internally may return indices in
+-- units of 'Word8' instead of characters.  These functions may
 -- safely be used with such indices, as they will adjust offsets if
 -- necessary to preserve the validity of a Unicode string.
 
--- | /O(1)/ Return the prefix of the 'Text' of @n@ 'Word16' units in
+-- | /O(1)/ Return the prefix of the 'Text' of @n@ 'Word8' units in
 -- length.
 --
--- If @n@ would cause the 'Text' to end inside a surrogate pair, the
--- end of the prefix will be advanced by one additional 'Word16' unit
+-- If @n@ would cause the 'Text' to end inside a code point, the
+-- end of the prefix will be advanced by several additional 'Word8' units
 -- to maintain its validity.
-takeWord16 :: I16 -> Text -> Text
-takeWord16 (I16 n) t@(Text arr off len)
-    | n <= 0               = empty
-    | n >= len || m >= len = t
-    | otherwise            = Text arr off m
-  where
-    m | w < 0xD800 || w > 0xDBFF = n
-      | otherwise                = n+1
-    w = A.unsafeIndex arr (off+n-1)
+takeWord8 :: I8 -> Text -> Text
+takeWord8 = (fst .) . splitAtWord8
 
--- | /O(1)/ Return the suffix of the 'Text', with @n@ 'Word16' units
+-- | /O(1)/ Return the suffix of the 'Text', with @n@ 'Word8' units
 -- dropped from its beginning.
 --
--- If @n@ would cause the 'Text' to begin inside a surrogate pair, the
--- beginning of the suffix will be advanced by one additional 'Word16'
+-- If @n@ would cause the 'Text' to begin inside a code point, the
+-- beginning of the suffix will be advanced by several additional 'Word8'
 -- unit to maintain its validity.
-dropWord16 :: I16 -> Text -> Text
-dropWord16 (I16 n) t@(Text arr off len)
-    | n <= 0               = t
-    | n >= len || m >= len = empty
-    | otherwise            = Text arr (off+m) (len-m)
+dropWord8 :: I8 -> Text -> Text
+dropWord8 = (snd .) . splitAtWord8
+
+splitAtWord8 :: I8 -> Text -> (Text, Text)
+splitAtWord8 (I8 n) t@(Text arr off len)
+    | n <= 0               = (empty, t)
+    | n >= len || m >= len = (t, empty)
+    | otherwise            = (Text arr off m, Text arr (off+m) (len-m))
   where
-    m | w < 0xD800 || w > 0xDBFF = n
-      | otherwise                = n+1
-    w = A.unsafeIndex arr (off+n-1)
+    m | w0 <  0x80 = n   -- last char is ASCII
+      | w0 >= 0xF0 = n+3 -- last char starts 4-byte sequence
+      | w0 >= 0xE0 = n+2 -- last char starts 3-byte sequence
+      | w0 >= 0xC0 = n+1 -- last char starts 2-byte sequence
+      | w1 >= 0xF0 = n+2 -- pre-last char starts 4-byte sequence
+      | w1 >= 0xE0 = n+1 -- pre-last char starts 3-byte sequence
+      | w1 >= 0xC0 = n   -- pre-last char starts 2-byte sequence
+      | w2 >= 0xF0 = n+1 -- pre-pre-last char starts 4-byte sequence
+      | otherwise  = n   -- pre-pre-last char starts 3-byte sequence
+    w0 = A.unsafeIndex arr (off+n-1)
+    w1 = A.unsafeIndex arr (off+n-2)
+    w2 = A.unsafeIndex arr (off+n-3)
 
 -- | /O(n)/ Copy a 'Text' to an array.  The array is assumed to be big
 -- enough to hold the contents of the entire 'Text'.
-unsafeCopyToPtr :: Text -> Ptr Word16 -> IO ()
+unsafeCopyToPtr :: Text -> Ptr Word8 -> IO ()
 unsafeCopyToPtr (Text arr off len) ptr = loop ptr off
   where
     end = off + len
     loop !p !i | i == end  = return ()
                | otherwise = do
       poke p (A.unsafeIndex arr i)
-      loop (p `plusPtr` 2) (i + 1)
+      loop (p `plusPtr` 1) (i + 1)
 
 -- | /O(n)/ Perform an action on a temporary, mutable copy of a
 -- 'Text'.  The copy is freed as soon as the action returns.
-useAsPtr :: Text -> (Ptr Word16 -> I16 -> IO a) -> IO a
+useAsPtr :: Text -> (Ptr Word8 -> I8 -> IO a) -> IO a
 useAsPtr t@(Text _arr _off len) action =
-    allocaBytes (len * 2) $ \buf -> do
+    allocaBytes len $ \buf -> do
       unsafeCopyToPtr t buf
-      action (castPtr buf) (I16 len)
+      action (castPtr buf) (I8 len)
 
 -- | /O(n)/ Make a mutable copy of a 'Text'.
-asForeignPtr :: Text -> IO (ForeignPtr Word16, I16)
+asForeignPtr :: Text -> IO (ForeignPtr Word8, I8)
 asForeignPtr t@(Text _arr _off len) = do
   fp <- mallocForeignPtrArray len
   unsafeWithForeignPtr fp $ unsafeCopyToPtr t
-  return (fp, I16 len)
+  return (fp, I8 len)
 
 -- | /O(n)/ Decode a C string with explicit length, which is assumed
 -- to have been encoded as UTF-8. If decoding fails, a
diff --git a/src/Data/Text/Internal.hs b/src/Data/Text/Internal.hs
index 36fa36e0..b5a1d443 100644
--- a/src/Data/Text/Internal.hs
+++ b/src/Data/Text/Internal.hs
@@ -55,9 +55,9 @@ import qualified Data.Text.Array as A
 
 -- | A space efficient, packed, unboxed Unicode text type.
 data Text = Text
-    {-# UNPACK #-} !A.Array          -- payload (Word16 elements)
-    {-# UNPACK #-} !Int              -- offset (units of Word16, not Char)
-    {-# UNPACK #-} !Int              -- length (units of Word16, not Char)
+    {-# UNPACK #-} !A.Array -- bytearray encoded as UTF-8
+    {-# UNPACK #-} !Int     -- offset in bytes (not in Char!), pointing to a start of UTF-8 sequence
+    {-# UNPACK #-} !Int     -- length in bytes (not in Char!), pointing to an end of UTF-8 sequence
     deriving (Typeable)
 
 -- | Smart constructor.
@@ -65,13 +65,16 @@ text_ ::
 #if defined(ASSERTS)
   HasCallStack =>
 #endif
-  A.Array -> Int -> Int -> Text
+     A.Array -- ^ bytearray encoded as UTF-8
+  -> Int     -- ^ offset in bytes (not in Char!), pointing to a start of UTF-8 sequence
+  -> Int     -- ^ length in bytes (not in Char!), pointing to an end of UTF-8 sequence
+  -> Text
 text_ arr off len =
 #if defined(ASSERTS)
   let c    = A.unsafeIndex arr off
   in assert (len >= 0) .
      assert (off >= 0) .
-     assert (len == 0 || c < 0xDC00 || c > 0xDFFF) $
+     assert (len == 0 || c < 0x80 || c >= 0xC0) $
 #endif
      Text arr off len
 {-# INLINE text_ #-}
@@ -92,7 +95,10 @@ text ::
 #if defined(ASSERTS)
   HasCallStack =>
 #endif
-  A.Array -> Int -> Int -> Text
+     A.Array -- ^ bytearray encoded as UTF-8
+  -> Int     -- ^ offset in bytes (not in Char!), pointing to a start of UTF-8 sequence
+  -> Int     -- ^ length in bytes (not in Char!), pointing to an end of UTF-8 sequence
+  -> Text
 text arr off len | len == 0  = empty
                  | otherwise = text_ arr off len
 {-# INLINE text #-}
@@ -109,7 +115,7 @@ showText (Text arr off len) =
 
 -- | Map a 'Char' to a 'Text'-safe value.
 --
--- UTF-16 surrogate code points are not included in the set of Unicode
+-- Unicode 'Data.Char.Surrogate' code points are not included in the set of Unicode
 -- scalar values, but are unfortunately admitted as valid 'Char'
 -- values by Haskell.  They cannot be represented in a 'Text'.  This
 -- function remaps those code points to the Unicode replacement
@@ -191,19 +197,17 @@ int64ToInt32 = fromIntegral
 
 -- $internals
 --
--- Internally, the 'Text' type is represented as an array of 'Word16'
--- UTF-16 code units. The offset and length fields in the constructor
+-- Internally, the 'Text' type is represented as an array of 'Word8'
+-- UTF-8 code units. The offset and length fields in the constructor
 -- are in these units, /not/ units of 'Char'.
 --
 -- Invariants that all functions must maintain:
 --
--- * Since the 'Text' type uses UTF-16 internally, it cannot represent
+-- * Since the 'Text' type uses UTF-8 internally, it cannot represent
 --   characters in the reserved surrogate code point range U+D800 to
 --   U+DFFF. To maintain this invariant, the 'safe' function maps
 --   'Char' values in this range to the replacement character (U+FFFD,
 --   \'&#xfffd;\').
 --
--- * A leading (or \"high\") surrogate code unit (0xD800–0xDBFF) must
---   always be followed by a trailing (or \"low\") surrogate code unit
---   (0xDC00-0xDFFF). A trailing surrogate code unit must always be
---   preceded by a leading surrogate code unit.
+-- * Offset and length must point to a valid UTF-8 sequence of bytes.
+--   Violation of this may cause memory access violation and divergence.
diff --git a/src/Data/Text/Internal/Builder.hs b/src/Data/Text/Internal/Builder.hs
index 3da66c42..9d181276 100644
--- a/src/Data/Text/Internal/Builder.hs
+++ b/src/Data/Text/Internal/Builder.hs
@@ -140,7 +140,7 @@ singleton ::
   HasCallStack =>
 #endif
   Char -> Builder
-singleton c = writeAtMost 2 $ \ marr o -> unsafeWrite marr o (safe c)
+singleton c = writeAtMost 4 $ \ marr o -> unsafeWrite marr o (safe c)
 {-# INLINE singleton #-}
 
 ------------------------------------------------------------------------
@@ -185,7 +185,7 @@ fromString :: String -> Builder
 fromString str = Builder $ \k (Buffer p0 o0 u0 l0) ->
     let loop !marr !o !u !l [] = k (Buffer marr o u l)
         loop marr o u l s@(c:cs)
-            | l <= 1 = do
+            | l <= 3 = do
                 arr <- A.unsafeFreeze marr
                 let !t = Text arr o u
                 marr' <- A.new chunkSize
diff --git a/src/Data/Text/Internal/Encoding/Fusion.hs b/src/Data/Text/Internal/Encoding/Fusion.hs
index b42315e0..aa8f0d02 100644
--- a/src/Data/Text/Internal/Encoding/Fusion.hs
+++ b/src/Data/Text/Internal/Encoding/Fusion.hs
@@ -43,7 +43,7 @@ import Data.Text.Internal.Fusion (Step(..), Stream(..))
 import Data.Text.Internal.Fusion.Size
 import Data.Text.Encoding.Error
 import Data.Text.Internal.Encoding.Fusion.Common
-import Data.Text.Internal.Unsafe.Char (unsafeChr, unsafeChr8, unsafeChr32)
+import Data.Text.Internal.Unsafe.Char (unsafeChr8, unsafeChr16, unsafeChr32)
 import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
 import Data.Word (Word8, Word16, Word32)
 import Foreign.ForeignPtr (ForeignPtr)
@@ -99,7 +99,7 @@ streamUtf16LE onErr bs = Stream next 0 (maxSize (l `shiftR` 1))
       {-# INLINE next #-}
       next i
           | i >= l                         = Done
-          | i+1 < l && U16.validate1 x1    = Yield (unsafeChr x1) (i+2)
+          | i+1 < l && U16.validate1 x1    = Yield (unsafeChr16 x1) (i+2)
           | i+3 < l && U16.validate2 x1 x2 = Yield (U16.chr2 x1 x2) (i+4)
           | otherwise = decodeError "streamUtf16LE" "UTF-16LE" onErr Nothing (i+1)
           where
@@ -117,7 +117,7 @@ streamUtf16BE onErr bs = Stream next 0 (maxSize (l `shiftR` 1))
       {-# INLINE next #-}
       next i
           | i >= l                         = Done
-          | i+1 < l && U16.validate1 x1    = Yield (unsafeChr x1) (i+2)
+          | i+1 < l && U16.validate1 x1    = Yield (unsafeChr16 x1) (i+2)
           | i+3 < l && U16.validate2 x1 x2 = Yield (U16.chr2 x1 x2) (i+4)
           | otherwise = decodeError "streamUtf16BE" "UTF-16BE" onErr Nothing (i+1)
           where
diff --git a/src/Data/Text/Internal/Encoding/Utf8.hs b/src/Data/Text/Internal/Encoding/Utf8.hs
index f0a04fa7..fa69b7d9 100644
--- a/src/Data/Text/Internal/Encoding/Utf8.hs
+++ b/src/Data/Text/Internal/Encoding/Utf8.hs
@@ -5,6 +5,7 @@
 -- Copyright   : (c) 2008, 2009 Tom Harper,
 --               (c) 2009, 2010 Bryan O'Sullivan,
 --               (c) 2009 Duncan Coutts
+--               (c) 2021 Andrew Lelechenko
 --
 -- License     : BSD-style
 -- Maintainer  : bos@serpentine.com
@@ -17,9 +18,10 @@
 --
 -- Basic UTF-8 validation and character manipulation.
 module Data.Text.Internal.Encoding.Utf8
-    (
+    ( utf8Length
+    , utf8LengthByLeader
     -- Decomposition
-      ord2
+    , ord2
     , ord3
     , ord4
     -- Construction
@@ -34,7 +36,7 @@ module Data.Text.Internal.Encoding.Utf8
     ) where
 
 import Data.Bits ((.&.), shiftR)
-import Data.Text.Internal.Unsafe.Char (ord)
+import Data.Char (ord)
 import GHC.Exts
 import GHC.Word (Word8(..))
 
@@ -52,6 +54,21 @@ between :: Word8                -- ^ byte to check
 between x y z = x >= y && x <= z
 {-# INLINE between #-}
 
+-- TODO make branchless by looking into Word64 by clz (ord c)
+utf8Length :: Char -> Int
+utf8Length c
+  | ord c < 0x80    = 1
+  | ord c < 0x800   = 2
+  | ord c < 0x10000 = 3
+  | otherwise       = 4
+
+utf8LengthByLeader :: Word8 -> Int
+utf8LengthByLeader w
+  | w < 0x80  = 1
+  | w < 0xE0  = 2
+  | w < 0xF0  = 3
+  | otherwise = 4
+
 ord2 :: Char -> (Word8,Word8)
 ord2 c =
     -- ord2 is used only in test suite to construct a deliberately invalid ByteString,
diff --git a/src/Data/Text/Internal/Fusion.hs b/src/Data/Text/Internal/Fusion.hs
index a96d2a17..01b781a1 100644
--- a/src/Data/Text/Internal/Fusion.hs
+++ b/src/Data/Text/Internal/Fusion.hs
@@ -50,19 +50,18 @@ module Data.Text.Internal.Fusion
     ) where
 
 import Prelude (Bool(..), Char, Maybe(..), Monad(..), Int,
-                Num(..), Ord(..), ($), (&&),
-                fromIntegral, otherwise)
-import Data.Bits ((.&.), shiftL, shiftR)
+                Num(..), Ord(..), ($),
+                otherwise)
+import Data.Bits (shiftL, shiftR)
 import Data.Text.Internal (Text(..))
 import Data.Text.Internal.Private (runText)
-import Data.Text.Internal.Unsafe.Char (ord, unsafeChr, unsafeWrite)
+import Data.Text.Internal.Unsafe.Char (unsafeChr8, unsafeWrite)
 import qualified Data.Text.Array as A
 import qualified Data.Text.Internal.Fusion.Common as S
 import Data.Text.Internal.Fusion.Types
 import Data.Text.Internal.Fusion.Size
 import qualified Data.Text.Internal as I
-import qualified Data.Text.Internal.Encoding.Utf16 as U16
-import Data.Word (Word16)
+import qualified Data.Text.Internal.Encoding.Utf8 as U8
 
 #if defined(ASSERTS)
 import GHC.Stack (HasCallStack)
@@ -82,16 +81,24 @@ stream ::
   HasCallStack =>
 #endif
   Text -> Stream Char
-stream (Text arr off len) = Stream next off (betweenSize (len `shiftR` 1) len)
+stream (Text arr off len) = Stream next off (betweenSize (len `shiftR` 2) len)
     where
       !end = off+len
       next !i
-          | i >= end                   = Done
-          | n >= 0xD800 && n <= 0xDBFF = Yield (U16.chr2 n n2) (i + 2)
-          | otherwise                  = Yield (unsafeChr n) (i + 1)
+          | i >= end  = Done
+          | otherwise = Yield chr (i + l)
           where
-            n  = A.unsafeIndex arr i
-            n2 = A.unsafeIndex arr (i + 1)
+            n0 = A.unsafeIndex arr i
+            n1 = A.unsafeIndex arr (i + 1)
+            n2 = A.unsafeIndex arr (i + 2)
+            n3 = A.unsafeIndex arr (i + 3)
+
+            l  = U8.utf8LengthByLeader n0
+            chr = case l of
+              1 -> unsafeChr8 n0
+              2 -> U8.chr2 n0 n1
+              3 -> U8.chr3 n0 n1 n2
+              _ -> U8.chr4 n0 n1 n2 n3
 {-# INLINE [0] stream #-}
 
 -- | /O(n)/ Converts 'Text' into a 'Stream' 'Char', but iterates
@@ -101,16 +108,20 @@ stream (Text arr off len) = Stream next off (betweenSize (len `shiftR` 1) len)
 --
 -- @'unstream' . 'reverseStream' = 'Data.Text.reverse' @
 reverseStream :: Text -> Stream Char
-reverseStream (Text arr off len) = Stream next (off+len-1) (betweenSize (len `shiftR` 1) len)
+reverseStream (Text arr off len) = Stream next (off+len-1) (betweenSize (len `shiftR` 2) len)
     where
       {-# INLINE next #-}
       next !i
-          | i < off                    = Done
-          | n >= 0xDC00 && n <= 0xDFFF = Yield (U16.chr2 n2 n) (i - 2)
-          | otherwise                  = Yield (unsafeChr n) (i - 1)
+          | i < off    = Done
+          | n0 <  0x80 = Yield (unsafeChr8 n0)       (i - 1)
+          | n1 >= 0xC0 = Yield (U8.chr2 n1 n0)       (i - 2)
+          | n2 >= 0xC0 = Yield (U8.chr3 n2 n1 n0)    (i - 3)
+          | otherwise  = Yield (U8.chr4 n3 n2 n1 n0) (i - 4)
           where
-            n  = A.unsafeIndex arr i
-            n2 = A.unsafeIndex arr (i - 1)
+            n0 = A.unsafeIndex arr i
+            n1 = A.unsafeIndex arr (i - 1)
+            n2 = A.unsafeIndex arr (i - 2)
+            n3 = A.unsafeIndex arr (i - 3)
 {-# INLINE [0] reverseStream #-}
 
 -- | /O(n)/ Convert 'Stream' 'Char' into a 'Text'.
@@ -123,10 +134,10 @@ reverseStream (Text arr off len) = Stream next (off+len-1) (betweenSize (len `sh
 unstream :: Stream Char -> Text
 unstream (Stream next0 s0 len) = runText $ \done -> do
   -- Before encoding each char we perform a buffer realloc check assuming
-  -- worst case encoding size of two 16-bit units for the char. Just add an
+  -- worst case encoding size of four 8-bit units for the char. Just add an
   -- extra space to the buffer so that we do not end up reallocating even when
   -- all the chars are encoded as single unit.
-  let mlen = upperBound 4 len + 1
+  let mlen = upperBound 4 len + 3
   arr0 <- A.new mlen
   let outer !arr !maxi = encode
        where
@@ -137,7 +148,7 @@ unstream (Stream next0 s0 len) = runText $ \done -> do
                 Skip si'    -> encode si' di
                 Yield c si'
                     -- simply check for the worst case
-                    | maxi < di + 1 -> realloc si di
+                    | maxi < di + 3 -> realloc si di
                     | otherwise -> do
                             n <- unsafeWrite arr di c
                             encode si' (di + n)
@@ -192,22 +203,12 @@ reverse (Stream next s len0)
                        let newLen = len `shiftL` 1
                        marr' <- A.new newLen
                        A.copyM marr' (newLen-len) marr 0 len
-                       write s1 (len+i) newLen marr'
-                     | otherwise -> write s1 i len marr
-            where n = ord x
-                  least | n < 0x10000 = 0
-                        | otherwise   = 1
-                  m = n - 0x10000
-                  lo = intToWord16 $ (m `shiftR` 10) + 0xD800
-                  hi = intToWord16 $ (m .&. 0x3FF) + 0xDC00
-                  write t j l mar
-                      | n < 0x10000 = do
-                          A.unsafeWrite mar j (intToWord16 n)
-                          loop t (j-1) l mar
-                      | otherwise = do
-                          A.unsafeWrite mar (j-1) lo
-                          A.unsafeWrite mar j hi
-                          loop t (j-2) l mar
+                       _ <- unsafeWrite marr' (len + i - least) x
+                       loop s1 (len + i - least - 1) newLen marr'
+                     | otherwise -> do
+                       _ <- unsafeWrite marr (i - least) x
+                       loop s1 (i - least - 1) len marr
+            where least = U8.utf8Length x - 1
 {-# INLINE [0] reverse #-}
 
 -- | /O(n)/ Perform the equivalent of 'scanr' over a list, only with
@@ -304,9 +305,5 @@ mapAccumL f z0 (Stream next0 s0 len) = (nz, I.text na 0 nl)
                 | otherwise -> do d <- unsafeWrite arr i c
                                   loop z' s' (i+d)
                 where (z',c) = f z x
-                      j | ord c < 0x10000 = i
-                        | otherwise       = i + 1
+                      j = i + U8.utf8Length c - 1
 {-# INLINE [0] mapAccumL #-}
-
-intToWord16 :: Int -> Word16
-intToWord16 = fromIntegral
diff --git a/src/Data/Text/Internal/Fusion/Common.hs b/src/Data/Text/Internal/Fusion/Common.hs
index d9f83498..dc19cf1b 100644
--- a/src/Data/Text/Internal/Fusion/Common.hs
+++ b/src/Data/Text/Internal/Fusion/Common.hs
@@ -124,15 +124,17 @@ import Prelude (Bool(..), Char, Eq(..), Int, Integral, Maybe(..),
                 (&&), fromIntegral, otherwise)
 import qualified Data.List as L
 import qualified Prelude as P
-import Data.Bits (shiftL)
 import Data.Char (isLetter, isSpace)
 import Data.Int (Int64)
+import Data.Text.Internal.Encoding.Utf8 (chr2, chr3, chr4, utf8LengthByLeader)
 import Data.Text.Internal.Fusion.Types
 import Data.Text.Internal.Fusion.CaseMapping (foldMapping, lowerMapping, titleMapping,
                                      upperMapping)
 import Data.Text.Internal.Fusion.Size
-import GHC.Prim (Addr#, chr#, indexCharOffAddr#, ord#)
-import GHC.Types (Char(..), Int(..))
+import GHC.Prim (Addr#, indexWord8OffAddr#)
+import GHC.Types (Int(..))
+import Data.Text.Internal.Unsafe.Char (unsafeChr8)
+import GHC.Word
 
 -- | /O(1)/ Convert a character into a 'Stream'
 --
@@ -185,23 +187,16 @@ streamCString# addr = Stream step 0 unknownSize
   where
     step !i
         | b == 0    = Done
-        | b <= 0x7f = Yield (C# b#) (i+1)
-        | b <= 0xdf = let !c = chr $ ((b-0xc0) `shiftL` 6) + next 1
-                      in Yield c (i+2)
-        | b <= 0xef = let !c = chr $ ((b-0xe0) `shiftL` 12) +
-                                      (next 1  `shiftL` 6) +
-                                       next 2
-                      in Yield c (i+3)
-        | otherwise = let !c = chr $ ((b-0xf0) `shiftL` 18) +
-                                      (next 1  `shiftL` 12) +
-                                      (next 2  `shiftL` 6) +
-                                       next 3
-                      in Yield c (i+4)
-      where b      = I# (ord# b#)
-            next n = I# (ord# (at# (i+n))) - 0x80
-            !b#    = at# i
-    at# (I# i#) = indexCharOffAddr# addr i#
-    chr (I# i#) = C# (chr# i#)
+        | otherwise = Yield chr (i + l)
+      where b = at# i
+            l = utf8LengthByLeader b
+            next n = at# (i+n)
+            chr = case l of
+              1 -> unsafeChr8 b
+              2 -> chr2 b (next 1)
+              3 -> chr3 b (next 1) (next 2)
+              _ -> chr4 b (next 1) (next 2) (next 3)
+    at# (I# i#) = W8# (indexWord8OffAddr# addr i#)
 {-# INLINE [0] streamCString# #-}
 
 -- ----------------------------------------------------------------------------
diff --git a/src/Data/Text/Internal/Fusion/Size.hs b/src/Data/Text/Internal/Fusion/Size.hs
index 50118c97..d6420555 100644
--- a/src/Data/Text/Internal/Fusion/Size.hs
+++ b/src/Data/Text/Internal/Fusion/Size.hs
@@ -37,13 +37,13 @@ module Data.Text.Internal.Fusion.Size
     , isEmpty
     ) where
 
-import Data.Char (ord)
+import Data.Text.Internal.Encoding.Utf8 (utf8Length)
 import Data.Text.Internal (mul)
 #if defined(ASSERTS)
 import Control.Exception (assert)
 #endif
 
--- | A size in UTF-16 code units.
+-- | A size in UTF-8 code units (which is bytes).
 data Size = Between {-# UNPACK #-} !Int {-# UNPACK #-} !Int -- ^ Lower and upper bounds on size.
           | Unknown                                         -- ^ Unknown size.
             deriving (Eq, Show)
@@ -55,9 +55,7 @@ exactly _ = Nothing
 
 -- | The 'Size' of the given code point.
 charSize :: Char -> Size
-charSize c
-  | ord c < 0x10000 = exactSize 1
-  | otherwise       = exactSize 2
+charSize = exactSize . utf8Length
 
 -- | The 'Size' of @n@ code points.
 codePointsSize :: Int -> Size
@@ -65,7 +63,7 @@ codePointsSize n =
 #if defined(ASSERTS)
     assert (n >= 0)
 #endif
-    Between n (2*n)
+    Between n (4*n)
 {-# INLINE codePointsSize #-}
 
 exactSize :: Int -> Size
@@ -160,7 +158,7 @@ upperBound _ (Between _ n) = n
 upperBound k _             = k
 {-# INLINE upperBound #-}
 
--- | Compute the maximum size from a size hint, if possible.
+-- | Compute the minimum size from a size hint, if possible.
 lowerBound :: Int -> Size -> Int
 lowerBound _ (Between n _) = n
 lowerBound k _             = k
diff --git a/src/Data/Text/Internal/Fusion/Types.hs b/src/Data/Text/Internal/Fusion/Types.hs
index df05773d..b97784f8 100644
--- a/src/Data/Text/Internal/Fusion/Types.hs
+++ b/src/Data/Text/Internal/Fusion/Types.hs
@@ -75,7 +75,7 @@ instance (Ord a) => Ord (Stream a) where
 -- unstreaming functions must be able to cope with the hint being too
 -- small or too large.
 --
--- The size hint tries to track the UTF-16 code units in a stream,
+-- The size hint tries to track the UTF-8 code units in a stream,
 -- but often counts the number of code points instead.  It can easily
 -- undercount if, for instance, a transformed stream contains astral
 -- plane code points (those above 0x10000).
diff --git a/src/Data/Text/Internal/Lazy/Encoding/Fusion.hs b/src/Data/Text/Internal/Lazy/Encoding/Fusion.hs
index dba6db34..69149779 100644
--- a/src/Data/Text/Internal/Lazy/Encoding/Fusion.hs
+++ b/src/Data/Text/Internal/Lazy/Encoding/Fusion.hs
@@ -41,7 +41,7 @@ import Data.Text.Internal.Encoding.Fusion.Common
 import Data.Text.Encoding.Error
 import Data.Text.Internal.Fusion (Step(..), Stream(..))
 import Data.Text.Internal.Fusion.Size
-import Data.Text.Internal.Unsafe.Char (unsafeChr, unsafeChr8, unsafeChr32)
+import Data.Text.Internal.Unsafe.Char (unsafeChr8, unsafeChr16, unsafeChr32)
 import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
 import Data.Word (Word8, Word16, Word32)
 import qualified Data.Text.Internal.Encoding.Utf8 as U8
@@ -112,7 +112,7 @@ streamUtf16LE onErr bs0 = Stream next (T bs0 S0 0) unknownSize
   where
     next (T bs@(Chunk ps _) S0 i)
       | i + 1 < len && U16.validate1 x1 =
-          Yield (unsafeChr x1)         (T bs S0 (i+2))
+          Yield (unsafeChr16 x1)         (T bs S0 (i+2))
       | i + 3 < len && U16.validate2 x1 x2 =
           Yield (U16.chr2 x1 x2)       (T bs S0 (i+4))
       where len = B.length ps
@@ -123,7 +123,7 @@ streamUtf16LE onErr bs0 = Stream next (T bs0 S0 0) unknownSize
     next st@(T bs s i) =
       case s of
         S2 w1 w2       | U16.validate1 (c w1 w2)           ->
-          Yield (unsafeChr (c w1 w2))   es
+          Yield (unsafeChr16 (c w1 w2))   es
         S4 w1 w2 w3 w4 | U16.validate2 (c w1 w2) (c w3 w4) ->
           Yield (U16.chr2 (c w1 w2) (c w3 w4)) es
         _ -> consume st
@@ -152,7 +152,7 @@ streamUtf16BE onErr bs0 = Stream next (T bs0 S0 0) unknownSize
   where
     next (T bs@(Chunk ps _) S0 i)
       | i + 1 < len && U16.validate1 x1 =
-          Yield (unsafeChr x1)         (T bs S0 (i+2))
+          Yield (unsafeChr16 x1)         (T bs S0 (i+2))
       | i + 3 < len && U16.validate2 x1 x2 =
           Yield (U16.chr2 x1 x2)       (T bs S0 (i+4))
       where len = B.length ps
@@ -163,7 +163,7 @@ streamUtf16BE onErr bs0 = Stream next (T bs0 S0 0) unknownSize
     next st@(T bs s i) =
       case s of
         S2 w1 w2       | U16.validate1 (c w1 w2)           ->
-          Yield (unsafeChr (c w1 w2))   es
+          Yield (unsafeChr16 (c w1 w2))   es
         S4 w1 w2 w3 w4 | U16.validate2 (c w1 w2) (c w3 w4) ->
           Yield (U16.chr2 (c w1 w2) (c w3 w4)) es
         _ -> consume st
diff --git a/src/Data/Text/Internal/Lazy/Fusion.hs b/src/Data/Text/Internal/Lazy/Fusion.hs
index c9b71c86..867d0ac4 100644
--- a/src/Data/Text/Internal/Lazy/Fusion.hs
+++ b/src/Data/Text/Internal/Lazy/Fusion.hs
@@ -79,8 +79,8 @@ unstreamChunks !chunkSize (Stream next s0 len0)
                     where unknownLength = 4
       where
         inner marr !len s !i
-            | i + 1 >= chunkSize = finish marr i s
-            | i + 1 >= len       = {-# SCC "unstreamChunks/resize" #-} do
+            | i + 3 >= chunkSize = finish marr i s
+            | i + 3 >= len       = {-# SCC "unstreamChunks/resize" #-} do
                 let newLen = min (len `shiftL` 1) chunkSize
                 marr' <- A.new newLen
                 A.copyM marr' 0 marr 0 len
diff --git a/src/Data/Text/Internal/Lazy/Search.hs b/src/Data/Text/Internal/Lazy/Search.hs
index 72930482..78450cbf 100644
--- a/src/Data/Text/Internal/Lazy/Search.hs
+++ b/src/Data/Text/Internal/Lazy/Search.hs
@@ -25,7 +25,7 @@ module Data.Text.Internal.Lazy.Search
 import Data.Bits (unsafeShiftL)
 import qualified Data.Text.Array as A
 import Data.Int (Int64)
-import Data.Word (Word16, Word64)
+import Data.Word (Word8, Word64)
 import qualified Data.Text.Internal as T
 import Data.Text.Internal.Fusion.Types (PairS(..))
 import Data.Text.Internal.Lazy (Text(..), foldlChunks)
@@ -75,8 +75,8 @@ indices needle@(Chunk n ns) _haystack@(Chunk k ks)
         where fin _ (T.Text farr foff flen) = A.unsafeIndex farr (foff+flen-1)
     (mask :: Word64) :*: skip = buildTable n ns 0 0 0 (nlen-2)
 
-    swizzle :: Word16 -> Word64
-    swizzle w = 1 `unsafeShiftL` (word16ToInt w .&. 0x3f)
+    swizzle :: Word8 -> Word64
+    swizzle w = 1 `unsafeShiftL` (word8ToInt w .&. 0x3f)
 
     buildTable (T.Text xarr xoff xlen) xs = go
       where
@@ -105,7 +105,7 @@ indices _ _ = []
 -- | Fast index into a partly unpacked 'Text'.  We take into account
 -- the possibility that the caller might try to access one element
 -- past the end.
-index :: T.Text -> Text -> Int64 -> Word16
+index :: T.Text -> Text -> Int64 -> Word8
 index (T.Text arr off len) xs !i
     | j < len   = A.unsafeIndex arr (off+j)
     | otherwise = case xs of
@@ -117,8 +117,8 @@ index (T.Text arr off len) xs !i
                     Chunk c cs -> index c cs (i-intToInt64 len)
     where j = int64ToInt i
 
--- | A variant of 'indices' that scans linearly for a single 'Word16'.
-indicesOne :: Word16 -> Int64 -> T.Text -> Text -> [Int64]
+-- | A variant of 'indices' that scans linearly for a single 'Word8'.
+indicesOne :: Word8 -> Int64 -> T.Text -> Text -> [Int64]
 indicesOne c = chunk
   where
     chunk :: Int64 -> T.Text -> Text -> [Int64]
@@ -131,7 +131,7 @@ indicesOne c = chunk
              | otherwise = go (h+1)
              where on = A.unsafeIndex oarr (ooff+h)
 
--- | The number of 'Word16' values in a 'Text'.
+-- | The number of 'Word8' values in a 'Text'.
 wordLength :: Text -> Int64
 wordLength = foldlChunks sumLength 0
   where
@@ -147,5 +147,5 @@ intToInt64 = fromIntegral
 int64ToInt :: Int64 -> Int
 int64ToInt = fromIntegral
 
-word16ToInt :: Word16 -> Int
-word16ToInt = fromIntegral
+word8ToInt :: Word8 -> Int
+word8ToInt = fromIntegral
diff --git a/src/Data/Text/Internal/Search.hs b/src/Data/Text/Internal/Search.hs
index 0aaab2ab..5688917f 100644
--- a/src/Data/Text/Internal/Search.hs
+++ b/src/Data/Text/Internal/Search.hs
@@ -32,7 +32,7 @@ module Data.Text.Internal.Search
     ) where
 
 import qualified Data.Text.Array as A
-import Data.Word (Word64, Word16)
+import Data.Word (Word64, Word8)
 import Data.Text.Internal (Text(..))
 import Data.Bits ((.|.), (.&.), unsafeShiftL)
 
@@ -67,8 +67,8 @@ indices _needle@(Text narr noff nlen) _haystack@(Text harr hoff hlen)
               skp' | c == z    = nlen - i - 2
                    | otherwise = skp
 
-    swizzle :: Word16 -> Word64
-    swizzle k = 1 `unsafeShiftL` (word16ToInt k .&. 0x3f)
+    swizzle :: Word8 -> Word64
+    swizzle k = 1 `unsafeShiftL` (word8ToInt k .&. 0x3f)
 
     scan !i
         | i > ldiff                  = []
@@ -90,5 +90,5 @@ indices _needle@(Text narr noff nlen) _haystack@(Text harr hoff hlen)
                       | otherwise     = loop (i+1)
 {-# INLINE indices #-}
 
-word16ToInt :: Word16 -> Int
-word16ToInt = fromIntegral
+word8ToInt :: Word8 -> Int
+word8ToInt = fromIntegral
diff --git a/src/Data/Text/Internal/Unsafe/Char.hs b/src/Data/Text/Internal/Unsafe/Char.hs
index 7982e276..3f3372c8 100644
--- a/src/Data/Text/Internal/Unsafe/Char.hs
+++ b/src/Data/Text/Internal/Unsafe/Char.hs
@@ -19,14 +19,14 @@
 module Data.Text.Internal.Unsafe.Char
     (
       ord
-    , unsafeChr
+    , unsafeChr16
     , unsafeChr8
     , unsafeChr32
     , unsafeWrite
     ) where
 
 import Control.Monad.ST (ST)
-import Data.Bits ((.&.), shiftR)
+import Data.Text.Internal.Encoding.Utf8
 import GHC.Exts (Char(..), Int(..), chr#, ord#, word2Int#)
 import GHC.Word (Word8(..), Word16(..), Word32(..))
 import qualified Data.Text.Array as A
@@ -39,9 +39,9 @@ ord :: Char -> Int
 ord (C# c#) = I# (ord# c#)
 {-# INLINE ord #-}
 
-unsafeChr :: Word16 -> Char
-unsafeChr (W16# w#) = C# (chr# (word2Int# (word16ToWord# w#)))
-{-# INLINE unsafeChr #-}
+unsafeChr16 :: Word16 -> Char
+unsafeChr16 (W16# w#) = C# (chr# (word2Int# (word16ToWord# w#)))
+{-# INLINE unsafeChr16 #-}
 
 unsafeChr8 :: Word8 -> Char
 unsafeChr8 (W8# w#) = C# (chr# (word2Int# (word8ToWord# w#)))
@@ -52,25 +52,36 @@ unsafeChr32 (W32# w#) = C# (chr# (word2Int# (word32ToWord# w#)))
 {-# INLINE unsafeChr32 #-}
 
 -- | Write a character into the array at the given offset.  Returns
--- the number of 'Word16's written.
+-- the number of 'Word8's written.
 unsafeWrite ::
 #if defined(ASSERTS)
     HasCallStack =>
 #endif
     A.MArray s -> Int -> Char -> ST s Int
-unsafeWrite marr i c
-    | n < 0x10000 = do
-        A.unsafeWrite marr i (intToWord16 n)
+unsafeWrite marr i c = case utf8Length c of
+    1 -> do
+        let n0 = intToWord8 (ord c)
+        A.unsafeWrite marr i n0
         return 1
-    | otherwise = do
-        A.unsafeWrite marr i lo
-        A.unsafeWrite marr (i+1) hi
+    2 -> do
+        let (n0, n1) = ord2 c
+        A.unsafeWrite marr i     n0
+        A.unsafeWrite marr (i+1) n1
         return 2
-    where n = ord c
-          m = n - 0x10000
-          lo = intToWord16 $ (m `shiftR` 10) + 0xD800
-          hi = intToWord16 $ (m .&. 0x3FF) + 0xDC00
+    3 -> do
+        let (n0, n1, n2) = ord3 c
+        A.unsafeWrite marr i     n0
+        A.unsafeWrite marr (i+1) n1
+        A.unsafeWrite marr (i+2) n2
+        return 3
+    _ -> do
+        let (n0, n1, n2, n3) = ord4 c
+        A.unsafeWrite marr i     n0
+        A.unsafeWrite marr (i+1) n1
+        A.unsafeWrite marr (i+2) n2
+        A.unsafeWrite marr (i+3) n3
+        return 4
 {-# INLINE unsafeWrite #-}
 
-intToWord16 :: Int -> Word16
-intToWord16 = fromIntegral
+intToWord8 :: Int -> Word8
+intToWord8 = fromIntegral
diff --git a/src/Data/Text/Lazy.hs b/src/Data/Text/Lazy.hs
index 8f67b653..bb716286 100644
--- a/src/Data/Text/Lazy.hs
+++ b/src/Data/Text/Lazy.hs
@@ -253,7 +253,8 @@ import GHC.Stack (HasCallStack)
 -- points
 -- (<http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf#page=13 §3.4, definition D10 >)
 -- as 'Char' values, including code points from this invalid range.
--- This means that there are some 'Char' values that are not valid
+-- This means that there are some 'Char' values
+-- (corresponding to 'Data.Char.Surrogate' category) that are not valid
 -- Unicode scalar values, and the functions in this module must handle
 -- those cases.
 --
@@ -262,12 +263,7 @@ import GHC.Stack (HasCallStack)
 -- that are not valid Unicode scalar values with the replacement
 -- character \"&#xfffd;\" (U+FFFD).  Functions that perform this
 -- inspection and replacement are documented with the phrase
--- \"Performs replacement on invalid scalar values\".
---
--- (One reason for this policy of replacement is that internally, a
--- 'Text' value is represented as packed UTF-16 data. Values in the
--- range U+D800 through U+DFFF are used by UTF-16 to denote surrogate
--- code points, and so cannot be represented. The functions replace
+-- \"Performs replacement on invalid scalar values\". The functions replace
 -- invalid scalar values, instead of dropping them, as a security
 -- measure. For details, see
 -- <http://unicode.org/reports/tr36/#Deletion_of_Noncharacters Unicode Technical Report 36, §3.5 >.)
@@ -283,13 +279,13 @@ equal Empty _     = False
 equal _ Empty     = False
 equal (Chunk a as) (Chunk b bs) =
     case compare lenA lenB of
-      LT -> a == (T.takeWord16 lenA b) &&
-            as `equal` Chunk (T.dropWord16 lenA b) bs
+      LT -> a == (T.takeWord8 lenA b) &&
+            as `equal` Chunk (T.dropWord8 lenA b) bs
       EQ -> a == b && as `equal` bs
-      GT -> T.takeWord16 lenB a == b &&
-            Chunk (T.dropWord16 lenB a) as `equal` bs
-  where lenA = T.lengthWord16 a
-        lenB = T.lengthWord16 b
+      GT -> T.takeWord8 lenB a == b &&
+            Chunk (T.dropWord8 lenB a) as `equal` bs
+  where lenA = T.lengthWord8 a
+        lenB = T.lengthWord8 b
 
 instance Eq Text where
     (==) = equal
@@ -1040,9 +1036,9 @@ dropEnd n t0
                     T.dropEnd (int64ToInt m) t : ts
       where l = intToInt64 (T.length t)
 
--- | /O(n)/ 'dropWords' @n@ returns the suffix with @n@ 'Word16'
+-- | /O(n)/ 'dropWords' @n@ returns the suffix with @n@ 'Word8'
 -- values dropped, or the empty 'Text' if @n@ is greater than the
--- number of 'Word16' values present.
+-- number of 'Word8' values present.
 dropWords :: Int64 -> Text -> Text
 dropWords i t0
     | i <= 0    = t0
@@ -1082,7 +1078,7 @@ takeWhileEnd :: (Char -> Bool) -> Text -> Text
 takeWhileEnd p = takeChunk empty . L.reverse . toChunks
   where takeChunk acc []     = acc
         takeChunk acc (t:ts)
-          | T.lengthWord16 t' < T.lengthWord16 t
+          | T.lengthWord8 t' < T.lengthWord8 t
                              = chunk t' acc
           | otherwise        = takeChunk (Chunk t' acc) ts
           where t' = T.takeWhileEnd p t
@@ -1164,7 +1160,7 @@ splitAt = loop
          where len = intToInt64 (T.length t)
 
 -- | /O(n)/ 'splitAtWord' @n t@ returns a strict pair whose first
--- element is a prefix of @t@ whose chunks contain @n@ 'Word16'
+-- element is a prefix of @t@ whose chunks contain @n@ 'Word8'
 -- values, and whose second is the remainder of the string.
 splitAtWord :: Int64 -> Text -> PairS Text Text
 splitAtWord _ Empty = empty :*: empty
diff --git a/src/Data/Text/Lazy/Builder/Int.hs b/src/Data/Text/Lazy/Builder/Int.hs
index a6a79d64..f861d1e5 100644
--- a/src/Data/Text/Lazy/Builder/Int.hs
+++ b/src/Data/Text/Lazy/Builder/Int.hs
@@ -102,15 +102,15 @@ posDecimal marr off0 ds v0 = go (off0 + ds - 1) v0
           let i = fromIntegral i0; j = i + i
           unsafeWrite marr off $ get (j + 1)
           unsafeWrite marr (off - 1) $ get j
-        get = word8ToWord16 . B.unsafeIndex digits
+        get = B.unsafeIndex digits
 
-minus, zero :: Word16
+minus, zero :: Word8
 {-# INLINE minus #-}
 {-# INLINE zero #-}
 minus = 45
 zero = 48
 
-i2w :: (Integral a) => a -> Word16
+i2w :: (Integral a) => a -> Word8
 {-# INLINE i2w #-}
 i2w v = zero + fromIntegral v
 
@@ -242,6 +242,3 @@ integer base i
             | otherwise = loop (d-1) q <> hexDigit r
             where q = n `quotInt` base
                   r = n `remInt` base
-
-word8ToWord16 :: Word8 -> Word16
-word8ToWord16 = fromIntegral
diff --git a/src/Data/Text/Show.hs b/src/Data/Text/Show.hs
index 8d7ceb0e..9baeaeee 100644
--- a/src/Data/Text/Show.hs
+++ b/src/Data/Text/Show.hs
@@ -21,6 +21,7 @@ module Data.Text.Show
 
 import Control.Monad.ST (ST)
 import Data.Text.Internal (Text(..), empty_, safe)
+import Data.Text.Internal.Encoding.Utf8 (utf8Length)
 import Data.Text.Internal.Fusion (stream, unstream)
 import Data.Text.Internal.Unsafe.Char (unsafeWrite)
 import GHC.Prim (Addr#)
@@ -95,7 +96,6 @@ singleton_ c = Text (A.run x) 0 len
         x = do arr <- A.new len
                _ <- unsafeWrite arr 0 d
                return arr
-        len | d < '\x10000' = 1
-            | otherwise     = 2
+        len = utf8Length d
         d = safe c
 {-# NOINLINE singleton_ #-}
diff --git a/src/Data/Text/Unsafe.hs b/src/Data/Text/Unsafe.hs
index 88727074..64cc83c4 100644
--- a/src/Data/Text/Unsafe.hs
+++ b/src/Data/Text/Unsafe.hs
@@ -20,19 +20,19 @@ module Data.Text.Unsafe
     , reverseIter_
     , unsafeHead
     , unsafeTail
-    , lengthWord16
-    , takeWord16
-    , dropWord16
+    , lengthWord8
+    , takeWord8
+    , dropWord8
     ) where
 
 #if defined(ASSERTS)
 import Control.Exception (assert)
 import GHC.Stack (HasCallStack)
 #endif
-import Data.Text.Internal.Encoding.Utf16 (chr2)
+import Data.Text.Internal.Encoding.Utf8 (chr2, chr3, chr4, utf8LengthByLeader)
 import Data.Text.Internal (Text(..))
 import Data.Text.Internal.Unsafe (inlineInterleaveST, inlinePerformIO)
-import Data.Text.Internal.Unsafe.Char (unsafeChr)
+import Data.Text.Internal.Unsafe.Char (unsafeChr8)
 import qualified Data.Text.Array as A
 import GHC.IO (unsafeDupablePerformIO)
 
@@ -40,11 +40,15 @@ import GHC.IO (unsafeDupablePerformIO)
 -- omits the check for the empty case, so there is an obligation on
 -- the programmer to provide a proof that the 'Text' is non-empty.
 unsafeHead :: Text -> Char
-unsafeHead (Text arr off _len)
-    | m < 0xD800 || m > 0xDBFF = unsafeChr m
-    | otherwise                = chr2 m n
-    where m = A.unsafeIndex arr off
-          n = A.unsafeIndex arr (off+1)
+unsafeHead (Text arr off _len) = case utf8LengthByLeader m0 of
+    1 -> unsafeChr8 m0
+    2 -> chr2 m0 m1
+    3 -> chr3 m0 m1 m2
+    _ -> chr4 m0 m1 m2 m3
+    where m0 = A.unsafeIndex arr off
+          m1 = A.unsafeIndex arr (off+1)
+          m2 = A.unsafeIndex arr (off+2)
+          m3 = A.unsafeIndex arr (off+3)
 {-# INLINE unsafeHead #-}
 
 -- | /O(1)/ A variant of 'tail' for non-empty 'Text'. 'unsafeTail'
@@ -60,8 +64,9 @@ unsafeTail t@(Text arr off len) =
 {-# INLINE unsafeTail #-}
 
 data Iter = Iter {-# UNPACK #-} !Char {-# UNPACK #-} !Int
+  deriving (Show)
 
--- | /O(1)/ Iterate (unsafely) one step forwards through a UTF-16
+-- | /O(1)/ Iterate (unsafely) one step forwards through a UTF-8
 -- array, returning the current character and the delta to add to give
 -- the next offset to iterate at.
 iter ::
@@ -69,61 +74,73 @@ iter ::
   HasCallStack =>
 #endif
   Text -> Int -> Iter
-iter (Text arr off _len) i
-    | m < 0xD800 || m > 0xDBFF = Iter (unsafeChr m) 1
-    | otherwise                = Iter (chr2 m n) 2
-  where m = A.unsafeIndex arr j
-        n = A.unsafeIndex arr k
+iter (Text arr off _len) i = Iter chr l
+  where m0 = A.unsafeIndex arr j
+        m1 = A.unsafeIndex arr (j+1)
+        m2 = A.unsafeIndex arr (j+2)
+        m3 = A.unsafeIndex arr (j+3)
         j = off + i
-        k = j + 1
+        l = utf8LengthByLeader m0
+        chr = case l of
+            1 -> unsafeChr8 m0
+            2 -> chr2 m0 m1
+            3 -> chr3 m0 m1 m2
+            _ -> chr4 m0 m1 m2 m3
 {-# INLINE iter #-}
 
--- | /O(1)/ Iterate one step through a UTF-16 array, returning the
+-- | /O(1)/ Iterate one step through a UTF-8 array, returning the
 -- delta to add to give the next offset to iterate at.
 iter_ :: Text -> Int -> Int
-iter_ (Text arr off _len) i | m < 0xD800 || m > 0xDBFF = 1
-                            | otherwise                = 2
+iter_ (Text arr off _len) i = utf8LengthByLeader m
   where m = A.unsafeIndex arr (off+i)
 {-# INLINE iter_ #-}
 
--- | /O(1)/ Iterate one step backwards through a UTF-16 array,
+-- | /O(1)/ Iterate one step backwards through a UTF-8 array,
 -- returning the current character and the delta to add (i.e. a
 -- negative number) to give the next offset to iterate at.
 reverseIter :: Text -> Int ->  Iter
 reverseIter (Text arr off _len) i
-    | m < 0xDC00 || m > 0xDFFF = Iter (unsafeChr m) (-1)
-    | otherwise                = Iter (chr2 n m) (-2)
-  where m = A.unsafeIndex arr j
-        n = A.unsafeIndex arr k
+    | m0 <  0x80 = Iter (unsafeChr8 m0) (-1)
+    | m1 >= 0xC0 = Iter (chr2 m1 m0) (-2)
+    | m2 >= 0xC0 = Iter (chr3 m2 m1 m0) (-3)
+    | otherwise  = Iter (chr4 m3 m2 m1 m0) (-4)
+  where m0 = A.unsafeIndex arr j
+        m1 = A.unsafeIndex arr (j-1)
+        m2 = A.unsafeIndex arr (j-2)
+        m3 = A.unsafeIndex arr (j-3)
         j = off + i
-        k = j - 1
 {-# INLINE reverseIter #-}
 
--- | /O(1)/ Iterate one step backwards through a UTF-16 array,
+-- | /O(1)/ Iterate one step backwards through a UTF-8 array,
 -- returning the delta to add (i.e. a negative number) to give the
 -- next offset to iterate at.
 --
 -- @since 1.1.1.0
 reverseIter_ :: Text -> Int -> Int
 reverseIter_ (Text arr off _len) i
-    | m < 0xDC00 || m > 0xDFFF = -1
-    | otherwise                = -2
-  where m = A.unsafeIndex arr (off+i)
+    | m0 <  0x80 = -1
+    | m1 >= 0xC0 = -2
+    | m2 >= 0xC0 = -3
+    | otherwise  = -4
+  where m0 = A.unsafeIndex arr j
+        m1 = A.unsafeIndex arr (j-1)
+        m2 = A.unsafeIndex arr (j-2)
+        j = off + i
 {-# INLINE reverseIter_ #-}
 
--- | /O(1)/ Return the length of a 'Text' in units of 'Word16'.  This
+-- | /O(1)/ Return the length of a 'Text' in units of 'Word8'.  This
 -- is useful for sizing a target array appropriately before using
 -- 'unsafeCopyToPtr'.
-lengthWord16 :: Text -> Int
-lengthWord16 (Text _arr _off len) = len
-{-# INLINE lengthWord16 #-}
+lengthWord8 :: Text -> Int
+lengthWord8 (Text _arr _off len) = len
+{-# INLINE lengthWord8 #-}
 
--- | /O(1)/ Unchecked take of 'k' 'Word16's from the front of a 'Text'.
-takeWord16 :: Int -> Text -> Text
-takeWord16 k (Text arr off _len) = Text arr off k
-{-# INLINE takeWord16 #-}
+-- | /O(1)/ Unchecked take of 'k' 'Word8's from the front of a 'Text'.
+takeWord8 :: Int -> Text -> Text
+takeWord8 k (Text arr off _len) = Text arr off k
+{-# INLINE takeWord8 #-}
 
--- | /O(1)/ Unchecked drop of 'k' 'Word16's from the front of a 'Text'.
-dropWord16 :: Int -> Text -> Text
-dropWord16 k (Text arr off len) = Text arr (off+k) (len-k)
-{-# INLINE dropWord16 #-}
+-- | /O(1)/ Unchecked drop of 'k' 'Word8's from the front of a 'Text'.
+dropWord8 :: Int -> Text -> Text
+dropWord8 k (Text arr off len) = Text arr (off+k) (len-k)
+{-# INLINE dropWord8 #-}
diff --git a/tests/Tests/Properties/LowLevel.hs b/tests/Tests/Properties/LowLevel.hs
index 1972fd36..79aecd05 100644
--- a/tests/Tests/Properties/LowLevel.hs
+++ b/tests/Tests/Properties/LowLevel.hs
@@ -10,7 +10,7 @@ import Control.Exception as E (SomeException, catch, evaluate)
 import Data.Int (Int32, Int64)
 import Data.Text.Foreign
 import Data.Text.Internal (mul, mul32, mul64)
-import Data.Word (Word16, Word32)
+import Data.Word (Word8, Word16, Word32)
 import System.IO.Unsafe (unsafePerformIO)
 import Test.Tasty (TestTree, testGroup)
 import Test.Tasty.QuickCheck (testProperty)
@@ -46,9 +46,9 @@ t_mul a b = mulRef a b === eval mul a b
 
 -- Misc.
 
-t_dropWord16 m t = dropWord16 m t `T.isSuffixOf` t
-t_takeWord16 m t = takeWord16 m t `T.isPrefixOf` t
-t_take_drop_16 (Small n) t = T.append (takeWord16 n t) (dropWord16 n t) === t
+t_dropWord8 m t = dropWord8 m t `T.isSuffixOf` t
+t_takeWord8 m t = takeWord8 m t `T.isPrefixOf` t
+t_take_drop_8 (Small n) t = T.append (takeWord8 n t) (dropWord8 n t) === t
 t_use_from t = ioProperty $ (==t) <$> useAsPtr t fromPtr
 
 t_copy t = T.copy t === t
@@ -80,9 +80,9 @@ testLowLevel =
     ],
 
     testGroup "misc" [
-      testProperty "t_dropWord16" t_dropWord16,
-      testProperty "t_takeWord16" t_takeWord16,
-      testProperty "t_take_drop_16" t_take_drop_16,
+      testProperty "t_dropWord8" t_dropWord8,
+      testProperty "t_takeWord8" t_takeWord8,
+      testProperty "t_take_drop_8" t_take_drop_8,
       testProperty "t_use_from" t_use_from,
       testProperty "t_copy" t_copy
     ],
diff --git a/tests/Tests/QuickCheckUtils.hs b/tests/Tests/QuickCheckUtils.hs
index e6e1a60d..94790fa0 100644
--- a/tests/Tests/QuickCheckUtils.hs
+++ b/tests/Tests/QuickCheckUtils.hs
@@ -32,7 +32,7 @@ import Control.Arrow ((***))
 import Control.DeepSeq (NFData (..), deepseq)
 import Control.Exception (bracket)
 import Data.Char (isSpace)
-import Data.Text.Foreign (I16)
+import Data.Text.Foreign (I8)
 import Data.Text.Lazy.Builder.RealFloat (FPFormat(..))
 import Data.Word (Word8, Word16)
 import Test.QuickCheck hiding (Fixed(..), Small (..), (.&.))
@@ -51,7 +51,7 @@ import qualified System.IO as IO
 genWord8 :: Gen Word8
 genWord8 = chooseAny
 
-instance Arbitrary I16 where
+instance Arbitrary I8 where
     arbitrary     = arbitrarySizedIntegral
     shrink        = shrinkIntegral
 
diff --git a/tests/Tests/Regressions.hs b/tests/Tests/Regressions.hs
index 157d0e89..90de08bf 100644
--- a/tests/Tests/Regressions.hs
+++ b/tests/Tests/Regressions.hs
@@ -80,10 +80,10 @@ mapAccumL_resize = do
   let f a _ = (a, '\65536')
       count = 5
       val   = T.mapAccumL f (0::Int) (T.replicate count "a")
-  assertEqual "mapAccumL should correctly fill buffers for two-word results"
+  assertEqual "mapAccumL should correctly fill buffers for four-byte results"
              (0, T.replicate count "\65536") val
-  assertEqual "mapAccumL should correctly size buffers for two-word results"
-             (count * 2) (T.lengthWord16 (snd val))
+  assertEqual "mapAccumL should correctly size buffers for four-byte results"
+             (count * 4) (T.lengthWord8 (snd val))
 
 -- See GitHub #197
 t197 :: IO ()
diff --git a/text.cabal b/text.cabal
index 2535129c..ccf86a6d 100644
--- a/text.cabal
+++ b/text.cabal
@@ -8,7 +8,7 @@ synopsis:       An efficient packed Unicode text type.
 description:
     .
     An efficient packed, immutable Unicode text type (both strict and
-    lazy), with a powerful loop fusion optimization framework.
+    lazy).
     .
     The 'Text' type represents Unicode character strings, in a time and
     space-efficient manner. This package provides text processing
@@ -37,23 +37,12 @@ description:
     the [text-icu package](https://hackage.haskell.org/package/text-icu)
     based on the well-respected and liberally
     licensed [ICU library](http://site.icu-project.org/).
-    .
-    == Internal Representation: UTF-16 vs. UTF-8
-    .
-    Currently the @text@ library uses UTF-16 as its internal representation
-    which is [neither a fixed-width nor always the most dense representation](http://utf8everywhere.org/)
-    for Unicode text. We're currently investigating the feasibility
-    of [changing Text's internal representation to UTF-8](https://github.com/text-utf8)
-    and if you need such a 'Text' type right now you might be interested in using the spin-off
-    packages <https://hackage.haskell.org/package/text-utf8 text-utf8> and
-    <https://hackage.haskell.org/package/text-short text-short>.
-
 
 license:        BSD2
 license-file:   LICENSE
 author:         Bryan O'Sullivan <bos@serpentine.com>
 maintainer:     Haskell Text Team <andrew.lelechenko@gmail.com>, Core Libraries Committee
-copyright:      2009-2011 Bryan O'Sullivan, 2008-2009 Tom Harper
+copyright:      2009-2011 Bryan O'Sullivan, 2008-2009 Tom Harper, 2021 Andrew Lelechenko
 category:       Data, Text
 build-type:     Simple
 tested-with:    GHC==9.0.1,
@@ -76,6 +65,7 @@ flag developer
 
 library
   c-sources:    cbits/cbits.c
+                cbits/utils.c
   include-dirs: include
   hs-source-dirs: src
 
-- 
GitLab