Skip to content
Snippets Groups Projects
Commit d444bbbd authored by Sigbjorn Finne's avatar Sigbjorn Finne
Browse files

added isUTF8Encoded predicate + utf8Encode to avoid repeated encodings

parent 6a8467a8
No related branches found
No related tags found
No related merge requests found
......@@ -24,13 +24,9 @@ import Data.Char (chr,ord)
default(Int)
-- | Encode a string using 'encode' and store the result in a 'String'.
encodeString :: String -> String
encodeString xs = map (toEnum . fromEnum) (encode xs)
-- | Decode a string using 'decode' using a 'String' as input.
-- | This is not safe but it is necessary if UTF-8 encoded text
-- | has been loaded into a 'String' prior to being decoded.
decodeString :: String -> String
decodeString xs = decode (map (toEnum . fromEnum) xs)
......@@ -66,20 +62,13 @@ decode [ ] = ""
decode (c:cs)
| c < 0x80 = chr (fromEnum c) : decode cs
| c < 0xc0 = replacement_character : decode cs
| c < 0xe0 = multi1
| c < 0xe0 = multi_byte 1 0x1f 0x80
| c < 0xf0 = multi_byte 2 0xf 0x800
| c < 0xf8 = multi_byte 3 0x7 0x10000
| c < 0xfc = multi_byte 4 0x3 0x200000
| c < 0xfe = multi_byte 5 0x1 0x4000000
| otherwise = replacement_character : decode cs
where
multi1 = case cs of
c1 : ds | c1 .&. 0xc0 == 0x80 ->
let d = ((fromEnum c .&. 0x1f) `shiftL` 6) .|. fromEnum (c1 .&. 0x3f)
in if d >= 0x000080 then toEnum d : decode ds
else replacement_character : decode ds
_ -> replacement_character : decode cs
multi_byte :: Int -> Word8 -> Int -> [Char]
multi_byte i mask overlong = aux i cs (fromEnum (c .&. mask))
where
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment