diff --git a/libraries/base/GHC/IO/Encoding.hs b/libraries/base/GHC/IO/Encoding.hs index 108b0fcf117d5aa280fbf13ee20650bd87676b28..a690717ebdbe71b8d79178bfc44bce72b2cbc6fd 100644 --- a/libraries/base/GHC/IO/Encoding.hs +++ b/libraries/base/GHC/IO/Encoding.hs @@ -30,7 +30,6 @@ module GHC.IO.Encoding ( ) where import GHC.Base -import GHC.Foreign (charIsRepresentable) import GHC.IO.Exception import GHC.IO.Buffer import GHC.IO.Encoding.Failure @@ -255,18 +254,17 @@ mkTextEncoding' cfm enc = -- Unfortunately there is no good way to determine whether iconv is actually -- functional without telling it to do something. _ -> do res <- Iconv.mkIconvEncoding cfm enc - good <- charIsRepresentable res 'a' let isAscii = any (== enc) ansiEncNames - case good of - True -> return res + case res of + Just e -> return e -- At this point we know that we can't count on iconv to work -- (see, for instance, Trac #10298). However, we still want to do - -- what can to work with what we have. For instance, ASCII is + -- what we can to work with what we have. For instance, ASCII is -- easy. We match on ASCII encodings directly using several -- possible aliases (specified by RFC 1345 & Co) and for this use - -- the 'char8' encodeing - False - | isAscii -> return char8 + -- the 'ascii' encoding + Nothing + | isAscii -> return (Latin1.mkAscii cfm) | otherwise -> unknownEncodingErr (enc ++ codingFailureModeSuffix cfm) where diff --git a/libraries/base/GHC/IO/Encoding/Iconv.hs b/libraries/base/GHC/IO/Encoding/Iconv.hs index f64d2451cf3f443c5e3340140be9bc10d16613c2..061bd60c209a9868a846f0497160c243e7ee12dc 100644 --- a/libraries/base/GHC/IO/Encoding/Iconv.hs +++ b/libraries/base/GHC/IO/Encoding/Iconv.hs @@ -34,9 +34,10 @@ import GHC.Base () -- For build ordering #else import Foreign -import Foreign.C +import Foreign.C hiding (charIsRepresentable) import Data.Maybe import GHC.Base +import GHC.Foreign (charIsRepresentable) import GHC.IO.Buffer import GHC.IO.Encoding.Failure import GHC.IO.Encoding.Types @@ -96,17 +97,27 @@ char_shift :: Int char_shift | charSize == 2 = 1 | otherwise = 2 -iconvEncoding :: String -> IO TextEncoding +iconvEncoding :: String -> IO (Maybe TextEncoding) iconvEncoding = mkIconvEncoding ErrorOnCodingFailure -- | Construct an iconv-based 'TextEncoding' for the given character set and -- 'CodingFailureMode'. -mkIconvEncoding :: CodingFailureMode -> String -> IO TextEncoding +-- +-- As iconv is missing in some minimal environments (e.g. #10298), this +-- checks to ensure that iconv is working properly before returning the +-- encoding, returning 'Nothing' if not. +mkIconvEncoding :: CodingFailureMode -> String -> IO (Maybe TextEncoding) mkIconvEncoding cfm charset = do - return (TextEncoding { - textEncodingName = charset, - mkTextDecoder = newIConv raw_charset (haskellChar ++ suffix) (recoverDecode cfm) iconvDecode, - mkTextEncoder = newIConv haskellChar charset (recoverEncode cfm) iconvEncode}) + let enc = TextEncoding { + textEncodingName = charset, + mkTextDecoder = newIConv raw_charset (haskellChar ++ suffix) + (recoverDecode cfm) iconvDecode, + mkTextEncoder = newIConv haskellChar charset + (recoverEncode cfm) iconvEncode} + good <- charIsRepresentable enc 'a' + return $ if good + then Just enc + else Nothing where -- An annoying feature of GNU iconv is that the //PREFIXES only take -- effect when they appear on the tocode parameter to iconv_open: diff --git a/libraries/base/GHC/IO/Encoding/Latin1.hs b/libraries/base/GHC/IO/Encoding/Latin1.hs index 34a4fca193ba61999a5ccf53de5e0e018f8181d7..d24fcdfc10723ba056f6f25af4e406fd6e31fee4 100644 --- a/libraries/base/GHC/IO/Encoding/Latin1.hs +++ b/libraries/base/GHC/IO/Encoding/Latin1.hs @@ -15,7 +15,7 @@ -- Stability : internal -- Portability : non-portable -- --- UTF-32 Codecs for the IO library +-- Single-byte encodings that map directly to Unicode code points. -- -- Portions Copyright : (c) Tom Harper 2008-2009, -- (c) Bryan O'Sullivan 2009, @@ -26,9 +26,12 @@ module GHC.IO.Encoding.Latin1 ( latin1, mkLatin1, latin1_checked, mkLatin1_checked, + ascii, mkAscii, latin1_decode, + ascii_decode, latin1_encode, latin1_checked_encode, + ascii_encode, ) where import GHC.Base @@ -90,6 +93,46 @@ latin1_checked_EF cfm = setState = const $ return () }) +-- ----------------------------------------------------------------------------- +-- ASCII + +-- | @since 4.8.2.0 +ascii :: TextEncoding +ascii = mkAscii ErrorOnCodingFailure + +-- | @since 4.8.2.0 +mkAscii :: CodingFailureMode -> TextEncoding +mkAscii cfm = TextEncoding { textEncodingName = "ASCII", + mkTextDecoder = ascii_DF cfm, + mkTextEncoder = ascii_EF cfm } + +ascii_DF :: CodingFailureMode -> IO (TextDecoder ()) +ascii_DF cfm = + return (BufferCodec { + encode = ascii_decode, + recover = recoverDecode cfm, + close = return (), + getState = return (), + setState = const $ return () + }) + +ascii_EF :: CodingFailureMode -> IO (TextEncoder ()) +ascii_EF cfm = + return (BufferCodec { + encode = ascii_encode, + recover = recoverEncode cfm, + close = return (), + getState = return (), + setState = const $ return () + }) + + + +-- ----------------------------------------------------------------------------- +-- The actual decoders and encoders + +-- TODO: Eliminate code duplication between the checked and unchecked +-- versions of the decoder or encoder (but don't change the Core!) latin1_decode :: DecodeBuffer latin1_decode @@ -112,6 +155,30 @@ latin1_decode in loop ir0 ow0 +ascii_decode :: DecodeBuffer +ascii_decode + input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ } + output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os } + = let + loop !ir !ow + | ow >= os = done OutputUnderflow ir ow + | ir >= iw = done InputUnderflow ir ow + | otherwise = do + c0 <- readWord8Buf iraw ir + if c0 > 0x7f then invalid else do + ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral c0)) + loop (ir+1) ow' + where + invalid = done InvalidSequence ir ow + + -- lambda-lifted, to avoid thunks being built in the inner-loop: + done why !ir !ow = return (why, + if ir == iw then input{ bufL=0, bufR=0 } + else input{ bufL=ir }, + output{ bufR=ow }) + in + loop ir0 ow0 + latin1_encode :: EncodeBuffer latin1_encode input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ } @@ -132,7 +199,15 @@ latin1_encode loop ir0 ow0 latin1_checked_encode :: EncodeBuffer -latin1_checked_encode +latin1_checked_encode input output + = single_byte_checked_encode 0xff input output + +ascii_encode :: EncodeBuffer +ascii_encode input output + = single_byte_checked_encode 0x7f input output + +single_byte_checked_encode :: Int -> EncodeBuffer +single_byte_checked_encode max_legal_char input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ } output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os } = let @@ -145,11 +220,11 @@ latin1_checked_encode | ir >= iw = done InputUnderflow ir ow | otherwise = do (c,ir') <- readCharBuf iraw ir - if ord c > 0xff then invalid else do + if ord c > max_legal_char then invalid else do writeWord8Buf oraw ow (fromIntegral (ord c)) loop ir' (ow+1) where invalid = done InvalidSequence ir ow in loop ir0 ow0 - +{-# INLINE single_byte_checked_encode #-} diff --git a/libraries/base/GHC/TopHandler.hs b/libraries/base/GHC/TopHandler.hs index d9010697b387b9278dbd5e572dcd8a38ffc4620c..05c905f7c10bb7ec9001c90c1b17b9cd5a9ec52c 100644 --- a/libraries/base/GHC/TopHandler.hs +++ b/libraries/base/GHC/TopHandler.hs @@ -180,7 +180,8 @@ disasterHandler exit _ = msgStr = "encountered an exception while trying to report an exception." ++ "One possible reason for this is that we failed while trying to " ++ - "encode an error message. Check that your locale configured properly." + "encode an error message. Check that your locale is configured " ++ + "properly." {- Note [Disaster with iconv] ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/libraries/base/tests/.gitignore b/libraries/base/tests/.gitignore index af90b5e47c8585d6ac22361105d1a9a29e742644..a430bd700aba897a48d7b498e51ea887a5ad6c20 100644 --- a/libraries/base/tests/.gitignore +++ b/libraries/base/tests/.gitignore @@ -105,6 +105,7 @@ /IO/encoding002 /IO/encoding003 /IO/encoding004 +/IO/encoding005 /IO/encodingerror001 /IO/environment001 /IO/finalization001 diff --git a/libraries/base/tests/IO/all.T b/libraries/base/tests/IO/all.T index e15c84d9f83598c53cc1446c975d7f6f511fe608..fe8faa0ce058d9b663f848e1f4eba6a45b91429c 100644 --- a/libraries/base/tests/IO/all.T +++ b/libraries/base/tests/IO/all.T @@ -145,6 +145,7 @@ test('encoding001', test('encoding002', normal, compile_and_run, ['']) test('encoding003', normal, compile_and_run, ['']) test('encoding004', normal, compile_and_run, ['']) +test('encoding005', normal, compile_and_run, ['']) test('environment001', [extra_clean(['environment001'])], diff --git a/libraries/base/tests/IO/encoding005.hs b/libraries/base/tests/IO/encoding005.hs new file mode 100644 index 0000000000000000000000000000000000000000..99db84af596e001d3254ec080d11fcdc712e01da --- /dev/null +++ b/libraries/base/tests/IO/encoding005.hs @@ -0,0 +1,115 @@ +import Control.Monad +import Data.Word (Word8) +import Foreign.Ptr +import Foreign.Marshal.Array +import GHC.Foreign (peekCStringLen, withCStringLen) +import GHC.IO.Encoding.Failure (CodingFailureMode(..)) +import qualified GHC.IO.Encoding.Latin1 as Latin1 +import System.IO +import System.IO.Error + +-- Tests for single-byte encodings that map directly to Unicode +-- (module GHC.IO.Encoding.Latin1) + +eitherToMaybe :: Either a b -> Maybe b +eitherToMaybe (Left _) = Nothing +eitherToMaybe (Right b) = Just b + +decode :: TextEncoding -> [Word8] -> IO (Maybe String) +decode enc xs = fmap eitherToMaybe . tryIOError $ withArrayLen xs (\sz p -> peekCStringLen enc (castPtr p, sz)) + +encode :: TextEncoding -> String -> IO (Maybe [Word8]) +encode enc cs = fmap eitherToMaybe . tryIOError $ withCStringLen enc cs (\(p, sz) -> peekArray sz (castPtr p)) + +testIO :: (Eq a, Show a) => IO a -> a -> IO () +testIO action expected = do + result <- action + when (result /= expected) $ + putStrLn $ "Test failed: expected " ++ show expected ++ ", but got " ++ show result + +-- Test char8-like encodings +test_char8 :: TextEncoding -> IO () +test_char8 enc = do + testIO (decode enc [0..0xff]) $ Just ['\0'..'\xff'] + + testIO (encode enc ['\0'..'\x200']) $ Just ([0..0xff] ++ [0..0xff] ++ [0]) + +-- Test latin1-like encodings +test_latin1 :: CodingFailureMode -> TextEncoding -> IO () +test_latin1 cfm enc = do + testIO (decode enc [0..0xff]) $ Just ['\0'..'\xff'] + + testIO (encode enc ['\0'..'\xff']) $ Just [0..0xff] + testIO (encode enc "\xfe\xff\x100\x101\x100\xff\xfe") $ case cfm of + ErrorOnCodingFailure -> Nothing + IgnoreCodingFailure -> Just [0xfe,0xff,0xff,0xfe] + TransliterateCodingFailure -> Just [0xfe,0xff,0x3f,0x3f,0x3f,0xff,0xfe] + -- N.B. The argument "LATIN1//TRANSLIT" to mkTextEncoding does not + -- correspond to "LATIN1//TRANSLIT" in iconv! Instead GHC asks iconv + -- to encode to "LATIN1" and uses its own "evil hack" to insert '?' + -- (ASCII 0x3f) in place of failures. See GHC.IO.Encoding.recoverEncode. + -- + -- U+0100 is LATIN CAPITAL LETTER A WITH MACRON, which iconv would + -- transliterate to 'A' (ASCII 0x41). Similarly iconv would + -- transliterate U+0101 LATIN SMALL LETTER A WITH MACRON to 'a' + -- (ASCII 0x61). + RoundtripFailure -> Nothing + +test_ascii :: CodingFailureMode -> TextEncoding -> IO () +test_ascii cfm enc = do + testIO (decode enc [0..0x7f]) $ Just ['\0'..'\x7f'] + testIO (decode enc [0x7e,0x7f,0x80,0x81,0x80,0x7f,0x7e]) $ case cfm of + ErrorOnCodingFailure -> Nothing + IgnoreCodingFailure -> Just "\x7e\x7f\x7f\x7e" + TransliterateCodingFailure -> Just "\x7e\x7f\xfffd\xfffd\xfffd\x7f\x7e" + -- Another GHC special: decode invalid input to the Char U+FFFD + -- REPLACEMENT CHARACTER. + RoundtripFailure -> Just "\x7e\x7f\xdc80\xdc81\xdc80\x7f\x7e" + -- GHC's PEP383-style String-encoding of invalid input, + -- see Note [Roundtripping] + + testIO (encode enc ['\0'..'\x7f']) $ Just [0..0x7f] + testIO (encode enc "\x7e\x7f\x80\x81\x80\x7f\xe9") $ case cfm of + ErrorOnCodingFailure -> Nothing + IgnoreCodingFailure -> Just [0x7e,0x7f,0x7f] + TransliterateCodingFailure -> Just [0x7e,0x7f,0x3f,0x3f,0x3f,0x7f,0x3f] + -- See comment in test_latin1. iconv -t ASCII//TRANSLIT would encode + -- U+00E9 LATIN SMALL LETTER E WITH ACUTE as 'e' (ASCII 0x65). + RoundtripFailure -> Nothing + + -- Test roundtripping for good measure + case cfm of + RoundtripFailure -> do + Just s <- decode enc [0..0xff] + testIO (encode enc s) $ Just [0..0xff] + _ -> return () + +main = do + putStrLn "char8 tests" + test_char8 char8 -- char8 never fails in either direction + + -- These use GHC's own implementation + putStrLn "Latin1.ascii tests" + test_ascii ErrorOnCodingFailure (Latin1.ascii) + test_ascii IgnoreCodingFailure (Latin1.mkAscii IgnoreCodingFailure) + test_ascii TransliterateCodingFailure (Latin1.mkAscii TransliterateCodingFailure) + test_ascii RoundtripFailure (Latin1.mkAscii RoundtripFailure) + + putStrLn "Latin1.latin1_checked tests" + test_latin1 ErrorOnCodingFailure (Latin1.latin1_checked) + test_latin1 IgnoreCodingFailure (Latin1.mkLatin1_checked IgnoreCodingFailure) + test_latin1 TransliterateCodingFailure (Latin1.mkLatin1_checked TransliterateCodingFailure) + test_latin1 RoundtripFailure (Latin1.mkLatin1_checked RoundtripFailure) + + -- These use iconv (normally, unless it is broken) + putStrLn "mkTextEncoding ASCII tests" + test_ascii ErrorOnCodingFailure =<< mkTextEncoding "ASCII" + test_ascii IgnoreCodingFailure =<< mkTextEncoding "ASCII//IGNORE" + test_ascii TransliterateCodingFailure =<< mkTextEncoding "ASCII//TRANSLIT" + test_ascii RoundtripFailure =<< mkTextEncoding "ASCII//ROUNDTRIP" + + putStrLn "mkTextEncoding LATIN1 tests" + test_latin1 ErrorOnCodingFailure =<< mkTextEncoding "LATIN1" + test_latin1 IgnoreCodingFailure =<< mkTextEncoding "LATIN1//IGNORE" + test_latin1 TransliterateCodingFailure =<< mkTextEncoding "LATIN1//TRANSLIT" + test_latin1 RoundtripFailure =<< mkTextEncoding "LATIN1//ROUNDTRIP" diff --git a/libraries/base/tests/IO/encoding005.stdout b/libraries/base/tests/IO/encoding005.stdout new file mode 100644 index 0000000000000000000000000000000000000000..664a1935924138df90af79bba2129572f31bea06 --- /dev/null +++ b/libraries/base/tests/IO/encoding005.stdout @@ -0,0 +1,5 @@ +char8 tests +Latin1.ascii tests +Latin1.latin1_checked tests +mkTextEncoding ASCII tests +mkTextEncoding LATIN1 tests