Unverified Commit 6e1871a5 authored by Mikhail Glushenkov's avatar Mikhail Glushenkov Committed by GitHub
Browse files

Merge pull request #4928 from hvr/pr/issue-4644

Modify replacement properties of `encodeStringUtf8`/`decodeStringUtf8`
parents 5711a844 4aedd00c
......@@ -389,6 +389,7 @@ test-suite unit-tests
UnitTests.Distribution.Simple.Program.Internal
UnitTests.Distribution.Simple.Utils
UnitTests.Distribution.System
UnitTests.Distribution.Utils.Generic
UnitTests.Distribution.Utils.NubList
UnitTests.Distribution.Utils.ShortText
UnitTests.Distribution.Version
......@@ -396,6 +397,7 @@ test-suite unit-tests
build-depends:
array,
base,
bytestring,
containers,
directory,
filepath,
......@@ -404,6 +406,7 @@ test-suite unit-tests
tasty-hunit,
tasty-quickcheck,
tagged,
text,
pretty,
QuickCheck >= 2.7 && < 2.11,
Cabal
......
......@@ -163,15 +163,29 @@ writeFileAtomic targetPath content = do
-- * Unicode stuff
-- ------------------------------------------------------------
-- | Decode 'String' from UTF8-encoded 'BS.ByteString'
--
-- Invalid data in the UTF8 stream (this includes code-points @U+D800@
-- through @U+DFFF@) will be decoded as the replacement character (@U+FFFD@).
--
fromUTF8BS :: SBS.ByteString -> String
fromUTF8BS = decodeStringUtf8 . SBS.unpack
-- | Variant of 'fromUTF8BS' for lazy 'BS.ByteString's
--
fromUTF8LBS :: BS.ByteString -> String
fromUTF8LBS = decodeStringUtf8 . BS.unpack
-- | Encode 'String' to to UTF8-encoded 'SBS.ByteString'
--
-- Code-points in the @U+D800@-@U+DFFF@ range will be encoded
-- as the replacement character (i.e. @U+FFFD@).
--
toUTF8BS :: String -> SBS.ByteString
toUTF8BS = SBS.pack . encodeStringUtf8
-- | Variant of 'toUTF8BS' for lazy 'BS.ByteString's
--
toUTF8LBS :: String -> BS.ByteString
toUTF8LBS = BS.pack . encodeStringUtf8
......
......@@ -10,7 +10,8 @@ import Data.Char (chr,ord)
-- | Decode 'String' from UTF8-encoded octets.
--
-- Invalid data will be decoded as the replacement character (@U+FFFD@)
-- Invalid data in the UTF8 stream (this includes code-points @U+D800@
-- through @U+DFFF@) will be decoded as the replacement character (@U+FFFD@).
--
-- See also 'encodeStringUtf8'
decodeStringUtf8 :: [Word8] -> String
......@@ -40,9 +41,7 @@ decodeStringUtf8 = go
moreBytes :: Int -> Int -> [Word8] -> Int -> [Char]
moreBytes 1 overlong cs' acc
| overlong <= acc && acc <= 0x10FFFF
&& (acc < 0xD800 || 0xDFFF < acc)
&& (acc < 0xFFFE || 0xFFFF < acc)
| overlong <= acc, acc <= 0x10FFFF, (acc < 0xD800 || 0xDFFF < acc)
= chr acc : go cs'
| otherwise
......@@ -61,6 +60,9 @@ decodeStringUtf8 = go
-- | Encode 'String' to a list of UTF8-encoded octets
--
-- Code-points in the @U+D800@-@U+DFFF@ range will be encoded
-- as the replacement character (i.e. @U+FFFD@).
--
-- See also 'decodeUtf8'
encodeStringUtf8 :: String -> [Word8]
encodeStringUtf8 [] = []
......@@ -70,6 +72,12 @@ encodeStringUtf8 (c:cs)
| c <= '\x7FF' = (0xC0 .|. w8ShiftR 6 )
: (0x80 .|. (w8 .&. 0x3F))
: encodeStringUtf8 cs
| c <= '\xD7FF'= (0xE0 .|. w8ShiftR 12 )
: (0x80 .|. (w8ShiftR 6 .&. 0x3F))
: (0x80 .|. (w8 .&. 0x3F))
: encodeStringUtf8 cs
| c <= '\xDFFF'= 0xEF : 0xBF : 0xBD -- U+FFFD
: encodeStringUtf8 cs
| c <= '\xFFFF'= (0xE0 .|. w8ShiftR 12 )
: (0x80 .|. (w8ShiftR 6 .&. 0x3F))
: (0x80 .|. (w8 .&. 0x3F))
......
......@@ -20,6 +20,7 @@ import qualified UnitTests.Distribution.Compat.Graph
import qualified UnitTests.Distribution.Simple.Program.Internal
import qualified UnitTests.Distribution.Simple.Utils
import qualified UnitTests.Distribution.System
import qualified UnitTests.Distribution.Utils.Generic
import qualified UnitTests.Distribution.Utils.NubList
import qualified UnitTests.Distribution.Utils.ShortText
import qualified UnitTests.Distribution.Version (versionTests)
......@@ -44,6 +45,8 @@ tests mtimeChangeCalibrated =
UnitTests.Distribution.Simple.Program.Internal.tests
, testGroup "Distribution.Simple.Utils"
UnitTests.Distribution.Simple.Utils.tests
, testGroup "Distribution.Utils.Generic"
UnitTests.Distribution.Utils.Generic.tests
, testGroup "Distribution.Utils.NubList"
UnitTests.Distribution.Utils.NubList.tests
, testGroup "Distribution.Utils.ShortText"
......
{-# LANGUAGE OverloadedStrings #-}
-- to suppress WARNING in "Distribution.Compat.Prelude.Internal"
{-# OPTIONS_GHC -fno-warn-deprecations #-}
module UnitTests.Distribution.Utils.Generic ( tests ) where
import Prelude ()
import Distribution.Compat.Prelude.Internal
import Distribution.Utils.Generic
import qualified Data.ByteString.Char8 as BS
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import Test.Tasty
import Test.Tasty.HUnit
import Test.Tasty.QuickCheck
tests :: [TestTree]
tests =
[ -- fromUTF8BS / toUTF8BS
testCase "fromUTF8BS mempty" testFromUTF8BSEmpty
, testCase "toUTF8BS mempty" testToUTF8BSEmpty
, testCase "toUTF8BS [U+D800..U+DFFF]" testToUTF8BSSurr
, testCase "toUTF8BS [U+0000..U+7F]" testToUTF8BSAscii
, testCase "toUTF8BS [U+0000..U+10FFFF]" testToUTF8BSText
, testCase "fromUTF8BS.toUTF8BS [U+0000..U+10FFFF]" testToFromUTF8BS
, testProperty "fromUTF8BS.toUTF8BS == id" prop_toFromUTF8BS
, testProperty "toUTF8BS == encodeUtf8" prop_toUTF8BS
]
testFromUTF8BSEmpty :: Assertion
testFromUTF8BSEmpty = mempty @=? fromUTF8BS mempty
testToUTF8BSEmpty :: Assertion
testToUTF8BSEmpty = mempty @=? toUTF8BS mempty
testToUTF8BSSurr :: Assertion
testToUTF8BSSurr = BS.concat (replicate 2048 u_fffd) @=? toUTF8BS surrogates
where
surrogates = ['\xD800'..'\xDFFF']
u_fffd = "\xEF\xBF\xBD"
testToUTF8BSText :: Assertion
testToUTF8BSText = T.encodeUtf8 (T.pack txt) @=? toUTF8BS txt
where
txt = ['\x00'..'\x10FFFF']
testToUTF8BSAscii :: Assertion
testToUTF8BSAscii = BS.pack txt @=? toUTF8BS txt
where
txt = ['\x00'..'\x7F']
testToFromUTF8BS :: Assertion
testToFromUTF8BS = txt @=? (fromUTF8BS . toUTF8BS) txt
where
txt = ['\x0000'..'\xD7FF'] ++ ['\xE000'..'\x10FFFF']
prop_toFromUTF8BS :: [Char] -> Property
prop_toFromUTF8BS txt = txt === (fromUTF8BS . toUTF8BS) txt
prop_toUTF8BS :: [Char] -> Property
prop_toUTF8BS txt = T.encodeUtf8 (T.pack txt) === toUTF8BS txt
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment