Commit 4aedd00c authored by Herbert Valerio Riedel's avatar Herbert Valerio Riedel 🕺
Browse files

Modify replacement properties of `encodeStringUtf8`/`decodeStringUtf8`

This changes `decodeStringUtf8` to not replace U+FFFE and U+FFFF into
U+FFFD, while `encodeStringUtf8` now replaces surrogate pairs
(i.e. code-points U+D800 through U+DFFF which are invalid in UTF-8)
with U+FFFD.

Consequently, `decodeStringUtf8 . encodeStringUtf8` can now properly
round-trip all scalar code-points
(i.e. [U+0000..U+D7FF] ∪ [U+E000..U+10FFFF]).

This should finally address #4644
parent b67871c7
......@@ -163,15 +163,29 @@ writeFileAtomic targetPath content = do
-- * Unicode stuff
-- ------------------------------------------------------------
-- | Decode 'String' from UTF8-encoded 'BS.ByteString'
--
-- Invalid data in the UTF8 stream (this includes code-points @U+D800@
-- through @U+DFFF@) will be decoded as the replacement character (@U+FFFD@).
--
fromUTF8BS :: SBS.ByteString -> String
fromUTF8BS = decodeStringUtf8 . SBS.unpack
-- | Variant of 'fromUTF8BS' for lazy 'BS.ByteString's
--
fromUTF8LBS :: BS.ByteString -> String
fromUTF8LBS = decodeStringUtf8 . BS.unpack
-- | Encode 'String' to to UTF8-encoded 'SBS.ByteString'
--
-- Code-points in the @U+D800@-@U+DFFF@ range will be encoded
-- as the replacement character (i.e. @U+FFFD@).
--
toUTF8BS :: String -> SBS.ByteString
toUTF8BS = SBS.pack . encodeStringUtf8
-- | Variant of 'toUTF8BS' for lazy 'BS.ByteString's
--
toUTF8LBS :: String -> BS.ByteString
toUTF8LBS = BS.pack . encodeStringUtf8
......
......@@ -10,7 +10,8 @@ import Data.Char (chr,ord)
-- | Decode 'String' from UTF8-encoded octets.
--
-- Invalid data will be decoded as the replacement character (@U+FFFD@)
-- Invalid data in the UTF8 stream (this includes code-points @U+D800@
-- through @U+DFFF@) will be decoded as the replacement character (@U+FFFD@).
--
-- See also 'encodeStringUtf8'
decodeStringUtf8 :: [Word8] -> String
......@@ -40,9 +41,7 @@ decodeStringUtf8 = go
moreBytes :: Int -> Int -> [Word8] -> Int -> [Char]
moreBytes 1 overlong cs' acc
| overlong <= acc && acc <= 0x10FFFF
&& (acc < 0xD800 || 0xDFFF < acc)
&& (acc < 0xFFFE || 0xFFFF < acc)
| overlong <= acc, acc <= 0x10FFFF, (acc < 0xD800 || 0xDFFF < acc)
= chr acc : go cs'
| otherwise
......@@ -61,6 +60,9 @@ decodeStringUtf8 = go
-- | Encode 'String' to a list of UTF8-encoded octets
--
-- Code-points in the @U+D800@-@U+DFFF@ range will be encoded
-- as the replacement character (i.e. @U+FFFD@).
--
-- See also 'decodeUtf8'
encodeStringUtf8 :: String -> [Word8]
encodeStringUtf8 [] = []
......@@ -70,6 +72,12 @@ encodeStringUtf8 (c:cs)
| c <= '\x7FF' = (0xC0 .|. w8ShiftR 6 )
: (0x80 .|. (w8 .&. 0x3F))
: encodeStringUtf8 cs
| c <= '\xD7FF'= (0xE0 .|. w8ShiftR 12 )
: (0x80 .|. (w8ShiftR 6 .&. 0x3F))
: (0x80 .|. (w8 .&. 0x3F))
: encodeStringUtf8 cs
| c <= '\xDFFF'= 0xEF : 0xBF : 0xBD -- U+FFFD
: encodeStringUtf8 cs
| c <= '\xFFFF'= (0xE0 .|. w8ShiftR 12 )
: (0x80 .|. (w8ShiftR 6 .&. 0x3F))
: (0x80 .|. (w8 .&. 0x3F))
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment