From a31c0f7df8fea235633a2eb82fb8d89b85a2fd47 Mon Sep 17 00:00:00 2001 From: Bryan O'Sullivan <bos@serpentine.com> Date: Fri, 23 Dec 2011 12:24:17 -0800 Subject: [PATCH] Improve ASCII encoding performance in a safer way. --- Data/Text/Encoding.hs | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/Data/Text/Encoding.hs b/Data/Text/Encoding.hs index e6a99e08..b0fd388e 100644 --- a/Data/Text/Encoding.hs +++ b/Data/Text/Encoding.hs @@ -153,8 +153,9 @@ encodeUtf8 (Text arr off len) = unsafePerformIO $ do where loop n1 m1 ptr = go n1 m1 where + offLen = off + len go !n !m - | n == off+len = return (PS fp 0 m) + | n == offLen = return (PS fp 0 m) | otherwise = do let poke8 k v = poke (ptr `plusPtr` k) (fromIntegral v :: Word8) ensure k act @@ -168,8 +169,17 @@ encodeUtf8 (Text arr off len) = unsafePerformIO $ do {-# INLINE ensure #-} case A.unsafeIndex arr n of w| w <= 0x7F -> ensure 1 $ do - poke8 m w - go (n+1) (m+1) + poke (ptr `plusPtr` m) (fromIntegral w :: Word8) + -- A single ASCII octet is likely to start a run of + -- them. We see better performance when we + -- special-case this assumption. + let ascii !t !u + | t == offLen || u == size || v >= 0x80 = go t u + | otherwise = do + poke (ptr `plusPtr` u) (fromIntegral v :: Word8) + ascii (t+1) (u+1) + where v = A.unsafeIndex arr t + ascii (n+1) (m+1) | w <= 0x7FF -> ensure 2 $ do poke8 m $ (w `shiftR` 6) + 0xC0 poke8 (m+1) $ (w .&. 0x3f) + 0x80 -- GitLab