Commit dbe6dac9 authored by rwbarton's avatar rwbarton Committed by Ben Gamari

When iconv is unavailable, use an ASCII encoding to encode ASCII

D898 and D1059 implemented a fallback behavior to handle the case
that the end user's iconv installation is broken (typically due to
running inside a chroot in which the necessary locale files and/or
gconv modules have not been installed). In this case, if the
program requests an ASCII locale, GHC's char8 encoding is used
rather than the program failing.

However, silently mangling data like char8 does when the programmer
did not ask for it is poor behavior, for reasons described in D1059.

This commit implements an ASCII encoding and uses it in the fallback
case when iconv is unavailable and the user has requested ASCII.

Test Plan:
Added tests for the encodings defined in Latin1.
Also, manually ran a statically-linked executable of that test
in a chroot and the tests passed (up to the ones that call
mkTextEncoding "LATIN1", since there is no fallback from iconv
for that case yet).

Reviewers: austin, hvr, hsyl20, bgamari

Reviewed By: hsyl20, bgamari

Subscribers: thomie

Differential Revision: https://phabricator.haskell.org/D1085

GHC Trac Issues: #7695, #10623
parent 97a50d50
......@@ -262,9 +262,9 @@ mkTextEncoding' cfm enc =
-- what we can to work with what we have. For instance, ASCII is
-- easy. We match on ASCII encodings directly using several
-- possible aliases (specified by RFC 1345 & Co) and for this use
-- the 'char8' encoding
-- the 'ascii' encoding
Nothing
| isAscii -> return char8
| isAscii -> return (Latin1.mkAscii cfm)
| otherwise ->
unknownEncodingErr (enc ++ codingFailureModeSuffix cfm)
where
......
......@@ -15,7 +15,7 @@
-- Stability : internal
-- Portability : non-portable
--
-- UTF-32 Codecs for the IO library
-- Single-byte encodings that map directly to Unicode code points.
--
-- Portions Copyright : (c) Tom Harper 2008-2009,
-- (c) Bryan O'Sullivan 2009,
......@@ -26,9 +26,12 @@
module GHC.IO.Encoding.Latin1 (
latin1, mkLatin1,
latin1_checked, mkLatin1_checked,
ascii, mkAscii,
latin1_decode,
ascii_decode,
latin1_encode,
latin1_checked_encode,
ascii_encode,
) where
import GHC.Base
......@@ -90,6 +93,46 @@ latin1_checked_EF cfm =
setState = const $ return ()
})
-- -----------------------------------------------------------------------------
-- ASCII
-- | @since 4.8.2.0
ascii :: TextEncoding
ascii = mkAscii ErrorOnCodingFailure
-- | @since 4.8.2.0
mkAscii :: CodingFailureMode -> TextEncoding
mkAscii cfm = TextEncoding { textEncodingName = "ASCII",
mkTextDecoder = ascii_DF cfm,
mkTextEncoder = ascii_EF cfm }
ascii_DF :: CodingFailureMode -> IO (TextDecoder ())
ascii_DF cfm =
return (BufferCodec {
encode = ascii_decode,
recover = recoverDecode cfm,
close = return (),
getState = return (),
setState = const $ return ()
})
ascii_EF :: CodingFailureMode -> IO (TextEncoder ())
ascii_EF cfm =
return (BufferCodec {
encode = ascii_encode,
recover = recoverEncode cfm,
close = return (),
getState = return (),
setState = const $ return ()
})
-- -----------------------------------------------------------------------------
-- The actual decoders and encoders
-- TODO: Eliminate code duplication between the checked and unchecked
-- versions of the decoder or encoder (but don't change the Core!)
latin1_decode :: DecodeBuffer
latin1_decode
......@@ -112,6 +155,30 @@ latin1_decode
in
loop ir0 ow0
ascii_decode :: DecodeBuffer
ascii_decode
input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
= let
loop !ir !ow
| ow >= os = done OutputUnderflow ir ow
| ir >= iw = done InputUnderflow ir ow
| otherwise = do
c0 <- readWord8Buf iraw ir
if c0 > 0x7f then invalid else do
ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral c0))
loop (ir+1) ow'
where
invalid = done InvalidSequence ir ow
-- lambda-lifted, to avoid thunks being built in the inner-loop:
done why !ir !ow = return (why,
if ir == iw then input{ bufL=0, bufR=0 }
else input{ bufL=ir },
output{ bufR=ow })
in
loop ir0 ow0
latin1_encode :: EncodeBuffer
latin1_encode
input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
......@@ -132,7 +199,15 @@ latin1_encode
loop ir0 ow0
latin1_checked_encode :: EncodeBuffer
latin1_checked_encode
latin1_checked_encode input output
= single_byte_checked_encode 0xff input output
ascii_encode :: EncodeBuffer
ascii_encode input output
= single_byte_checked_encode 0x7f input output
single_byte_checked_encode :: Int -> EncodeBuffer
single_byte_checked_encode max_legal_char
input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
= let
......@@ -145,11 +220,11 @@ latin1_checked_encode
| ir >= iw = done InputUnderflow ir ow
| otherwise = do
(c,ir') <- readCharBuf iraw ir
if ord c > 0xff then invalid else do
if ord c > max_legal_char then invalid else do
writeWord8Buf oraw ow (fromIntegral (ord c))
loop ir' (ow+1)
where
invalid = done InvalidSequence ir ow
in
loop ir0 ow0
{-# INLINE single_byte_checked_encode #-}
......@@ -105,6 +105,7 @@
/IO/encoding002
/IO/encoding003
/IO/encoding004
/IO/encoding005
/IO/encodingerror001
/IO/environment001
/IO/finalization001
......
......@@ -138,6 +138,7 @@ test('encoding001',
test('encoding002', normal, compile_and_run, [''])
test('encoding003', normal, compile_and_run, [''])
test('encoding004', normal, compile_and_run, [''])
test('encoding005', normal, compile_and_run, [''])
test('environment001',
[extra_clean(['environment001'])],
......
import Control.Monad
import Data.Word (Word8)
import Foreign.Ptr
import Foreign.Marshal.Array
import GHC.Foreign (peekCStringLen, withCStringLen)
import GHC.IO.Encoding.Failure (CodingFailureMode(..))
import qualified GHC.IO.Encoding.Latin1 as Latin1
import System.IO
import System.IO.Error
-- Tests for single-byte encodings that map directly to Unicode
-- (module GHC.IO.Encoding.Latin1)
eitherToMaybe :: Either a b -> Maybe b
eitherToMaybe (Left _) = Nothing
eitherToMaybe (Right b) = Just b
decode :: TextEncoding -> [Word8] -> IO (Maybe String)
decode enc xs = fmap eitherToMaybe . tryIOError $ withArrayLen xs (\sz p -> peekCStringLen enc (castPtr p, sz))
encode :: TextEncoding -> String -> IO (Maybe [Word8])
encode enc cs = fmap eitherToMaybe . tryIOError $ withCStringLen enc cs (\(p, sz) -> peekArray sz (castPtr p))
testIO :: (Eq a, Show a) => IO a -> a -> IO ()
testIO action expected = do
result <- action
when (result /= expected) $
putStrLn $ "Test failed: expected " ++ show expected ++ ", but got " ++ show result
-- Test char8-like encodings
test_char8 :: TextEncoding -> IO ()
test_char8 enc = do
testIO (decode enc [0..0xff]) $ Just ['\0'..'\xff']
testIO (encode enc ['\0'..'\x200']) $ Just ([0..0xff] ++ [0..0xff] ++ [0])
-- Test latin1-like encodings
test_latin1 :: CodingFailureMode -> TextEncoding -> IO ()
test_latin1 cfm enc = do
testIO (decode enc [0..0xff]) $ Just ['\0'..'\xff']
testIO (encode enc ['\0'..'\xff']) $ Just [0..0xff]
testIO (encode enc "\xfe\xff\x100\x101\x100\xff\xfe") $ case cfm of
ErrorOnCodingFailure -> Nothing
IgnoreCodingFailure -> Just [0xfe,0xff,0xff,0xfe]
TransliterateCodingFailure -> Just [0xfe,0xff,0x3f,0x3f,0x3f,0xff,0xfe]
-- N.B. The argument "LATIN1//TRANSLIT" to mkTextEncoding does not
-- correspond to "LATIN1//TRANSLIT" in iconv! Instead GHC asks iconv
-- to encode to "LATIN1" and uses its own "evil hack" to insert '?'
-- (ASCII 0x3f) in place of failures. See GHC.IO.Encoding.recoverEncode.
--
-- U+0100 is LATIN CAPITAL LETTER A WITH MACRON, which iconv would
-- transliterate to 'A' (ASCII 0x41). Similarly iconv would
-- transliterate U+0101 LATIN SMALL LETTER A WITH MACRON to 'a'
-- (ASCII 0x61).
RoundtripFailure -> Nothing
test_ascii :: CodingFailureMode -> TextEncoding -> IO ()
test_ascii cfm enc = do
testIO (decode enc [0..0x7f]) $ Just ['\0'..'\x7f']
testIO (decode enc [0x7e,0x7f,0x80,0x81,0x80,0x7f,0x7e]) $ case cfm of
ErrorOnCodingFailure -> Nothing
IgnoreCodingFailure -> Just "\x7e\x7f\x7f\x7e"
TransliterateCodingFailure -> Just "\x7e\x7f\xfffd\xfffd\xfffd\x7f\x7e"
-- Another GHC special: decode invalid input to the Char U+FFFD
-- REPLACEMENT CHARACTER.
RoundtripFailure -> Just "\x7e\x7f\xdc80\xdc81\xdc80\x7f\x7e"
-- GHC's PEP383-style String-encoding of invalid input,
-- see Note [Roundtripping]
testIO (encode enc ['\0'..'\x7f']) $ Just [0..0x7f]
testIO (encode enc "\x7e\x7f\x80\x81\x80\x7f\xe9") $ case cfm of
ErrorOnCodingFailure -> Nothing
IgnoreCodingFailure -> Just [0x7e,0x7f,0x7f]
TransliterateCodingFailure -> Just [0x7e,0x7f,0x3f,0x3f,0x3f,0x7f,0x3f]
-- See comment in test_latin1. iconv -t ASCII//TRANSLIT would encode
-- U+00E9 LATIN SMALL LETTER E WITH ACUTE as 'e' (ASCII 0x65).
RoundtripFailure -> Nothing
-- Test roundtripping for good measure
case cfm of
RoundtripFailure -> do
Just s <- decode enc [0..0xff]
testIO (encode enc s) $ Just [0..0xff]
_ -> return ()
main = do
putStrLn "char8 tests"
test_char8 char8 -- char8 never fails in either direction
-- These use GHC's own implementation
putStrLn "Latin1.ascii tests"
test_ascii ErrorOnCodingFailure (Latin1.ascii)
test_ascii IgnoreCodingFailure (Latin1.mkAscii IgnoreCodingFailure)
test_ascii TransliterateCodingFailure (Latin1.mkAscii TransliterateCodingFailure)
test_ascii RoundtripFailure (Latin1.mkAscii RoundtripFailure)
putStrLn "Latin1.latin1_checked tests"
test_latin1 ErrorOnCodingFailure (Latin1.latin1_checked)
test_latin1 IgnoreCodingFailure (Latin1.mkLatin1_checked IgnoreCodingFailure)
test_latin1 TransliterateCodingFailure (Latin1.mkLatin1_checked TransliterateCodingFailure)
test_latin1 RoundtripFailure (Latin1.mkLatin1_checked RoundtripFailure)
-- These use iconv (normally, unless it is broken)
putStrLn "mkTextEncoding ASCII tests"
test_ascii ErrorOnCodingFailure =<< mkTextEncoding "ASCII"
test_ascii IgnoreCodingFailure =<< mkTextEncoding "ASCII//IGNORE"
test_ascii TransliterateCodingFailure =<< mkTextEncoding "ASCII//TRANSLIT"
test_ascii RoundtripFailure =<< mkTextEncoding "ASCII//ROUNDTRIP"
putStrLn "mkTextEncoding LATIN1 tests"
test_latin1 ErrorOnCodingFailure =<< mkTextEncoding "LATIN1"
test_latin1 IgnoreCodingFailure =<< mkTextEncoding "LATIN1//IGNORE"
test_latin1 TransliterateCodingFailure =<< mkTextEncoding "LATIN1//TRANSLIT"
test_latin1 RoundtripFailure =<< mkTextEncoding "LATIN1//ROUNDTRIP"
char8 tests
Latin1.ascii tests
Latin1.latin1_checked tests
mkTextEncoding ASCII tests
mkTextEncoding LATIN1 tests
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment