From 2c67c68eb93e7ca44a9eb66e287cfdec4d1bd24c Mon Sep 17 00:00:00 2001
From: Ben Gamari <ben@smart-cactus.org>
Date: Tue, 21 Jul 2015 21:46:38 +0200
Subject: [PATCH] When iconv is unavailable, use an ASCII encoding to encode
 ASCII

D898 and D1059 implemented a fallback behavior to handle the case
that the end user's iconv installation is broken (typically due to
running inside a chroot in which the necessary locale files and/or
gconv modules have not been installed). In this case, if the
program requests an ASCII locale, GHC's char8 encoding is used
rather than the program failing.

However, silently mangling data like char8 does when the programmer
did not ask for it is poor behavior, for reasons described in D1059.

This commit implements an ASCII encoding and uses it in the fallback
case when iconv is unavailable and the user has requested ASCII.

Test Plan:
Added tests for the encodings defined in Latin1.
Also, manually ran a statically-linked executable of that test
in a chroot and the tests passed (up to the ones that call
mkTextEncoding "LATIN1", since there is no fallback from iconv
for that case yet).

Reviewers: austin, hvr, hsyl20, bgamari

Reviewed By: hsyl20, bgamari

Subscribers: thomie

Differential Revision: https://phabricator.haskell.org/D1085

GHC Trac Issues: #7695, #10623
---
 libraries/base/GHC/IO/Encoding.hs          |  14 ++-
 libraries/base/GHC/IO/Encoding/Iconv.hs    |  25 +++--
 libraries/base/GHC/IO/Encoding/Latin1.hs   |  83 ++++++++++++++-
 libraries/base/GHC/TopHandler.hs           |   3 +-
 libraries/base/tests/.gitignore            |   1 +
 libraries/base/tests/IO/all.T              |   1 +
 libraries/base/tests/IO/encoding005.hs     | 115 +++++++++++++++++++++
 libraries/base/tests/IO/encoding005.stdout |   5 +
 8 files changed, 227 insertions(+), 20 deletions(-)
 create mode 100644 libraries/base/tests/IO/encoding005.hs
 create mode 100644 libraries/base/tests/IO/encoding005.stdout

diff --git a/libraries/base/GHC/IO/Encoding.hs b/libraries/base/GHC/IO/Encoding.hs
index 108b0fcf117..a690717ebdb 100644
--- a/libraries/base/GHC/IO/Encoding.hs
+++ b/libraries/base/GHC/IO/Encoding.hs
@@ -30,7 +30,6 @@ module GHC.IO.Encoding (
     ) where
 
 import GHC.Base
-import GHC.Foreign (charIsRepresentable)
 import GHC.IO.Exception
 import GHC.IO.Buffer
 import GHC.IO.Encoding.Failure
@@ -255,18 +254,17 @@ mkTextEncoding' cfm enc =
     -- Unfortunately there is no good way to determine whether iconv is actually
     -- functional without telling it to do something.
     _ -> do res <- Iconv.mkIconvEncoding cfm enc
-            good <- charIsRepresentable res 'a'
             let isAscii = any (== enc) ansiEncNames
-            case good of
-              True -> return res
+            case res of
+              Just e -> return e
               -- At this point we know that we can't count on iconv to work
               -- (see, for instance, Trac #10298). However, we still want to do
-              --  what can to work with what we have. For instance, ASCII is
+              --  what we can to work with what we have. For instance, ASCII is
               -- easy. We match on ASCII encodings directly using several
               -- possible aliases (specified by RFC 1345 & Co) and for this use
-              -- the 'char8' encodeing
-              False
-                | isAscii   -> return char8
+              -- the 'ascii' encoding
+              Nothing
+                | isAscii   -> return (Latin1.mkAscii cfm)
                 | otherwise ->
                     unknownEncodingErr (enc ++ codingFailureModeSuffix cfm)
   where
diff --git a/libraries/base/GHC/IO/Encoding/Iconv.hs b/libraries/base/GHC/IO/Encoding/Iconv.hs
index f64d2451cf3..061bd60c209 100644
--- a/libraries/base/GHC/IO/Encoding/Iconv.hs
+++ b/libraries/base/GHC/IO/Encoding/Iconv.hs
@@ -34,9 +34,10 @@ import GHC.Base () -- For build ordering
 #else
 
 import Foreign
-import Foreign.C
+import Foreign.C hiding (charIsRepresentable)
 import Data.Maybe
 import GHC.Base
+import GHC.Foreign (charIsRepresentable)
 import GHC.IO.Buffer
 import GHC.IO.Encoding.Failure
 import GHC.IO.Encoding.Types
@@ -96,17 +97,27 @@ char_shift :: Int
 char_shift | charSize == 2 = 1
            | otherwise     = 2
 
-iconvEncoding :: String -> IO TextEncoding
+iconvEncoding :: String -> IO (Maybe TextEncoding)
 iconvEncoding = mkIconvEncoding ErrorOnCodingFailure
 
 -- | Construct an iconv-based 'TextEncoding' for the given character set and
 -- 'CodingFailureMode'.
-mkIconvEncoding :: CodingFailureMode -> String -> IO TextEncoding
+--
+-- As iconv is missing in some minimal environments (e.g. #10298), this
+-- checks to ensure that iconv is working properly before returning the
+-- encoding, returning 'Nothing' if not.
+mkIconvEncoding :: CodingFailureMode -> String -> IO (Maybe TextEncoding)
 mkIconvEncoding cfm charset = do
-  return (TextEncoding {
-                textEncodingName = charset,
-                mkTextDecoder = newIConv raw_charset (haskellChar ++ suffix) (recoverDecode cfm) iconvDecode,
-                mkTextEncoder = newIConv haskellChar charset                 (recoverEncode cfm) iconvEncode})
+    let enc = TextEncoding {
+                  textEncodingName = charset,
+                  mkTextDecoder = newIConv raw_charset (haskellChar ++ suffix)
+                                           (recoverDecode cfm) iconvDecode,
+                  mkTextEncoder = newIConv haskellChar charset
+                                           (recoverEncode cfm) iconvEncode}
+    good <- charIsRepresentable enc 'a'
+    return $ if good
+               then Just enc
+               else Nothing
   where
     -- An annoying feature of GNU iconv is that the //PREFIXES only take
     -- effect when they appear on the tocode parameter to iconv_open:
diff --git a/libraries/base/GHC/IO/Encoding/Latin1.hs b/libraries/base/GHC/IO/Encoding/Latin1.hs
index 34a4fca193b..d24fcdfc107 100644
--- a/libraries/base/GHC/IO/Encoding/Latin1.hs
+++ b/libraries/base/GHC/IO/Encoding/Latin1.hs
@@ -15,7 +15,7 @@
 -- Stability   :  internal
 -- Portability :  non-portable
 --
--- UTF-32 Codecs for the IO library
+-- Single-byte encodings that map directly to Unicode code points.
 --
 -- Portions Copyright   : (c) Tom Harper 2008-2009,
 --                        (c) Bryan O'Sullivan 2009,
@@ -26,9 +26,12 @@
 module GHC.IO.Encoding.Latin1 (
   latin1, mkLatin1,
   latin1_checked, mkLatin1_checked,
+  ascii, mkAscii,
   latin1_decode,
+  ascii_decode,
   latin1_encode,
   latin1_checked_encode,
+  ascii_encode,
   ) where
 
 import GHC.Base
@@ -90,6 +93,46 @@ latin1_checked_EF cfm =
              setState = const $ return ()
           })
 
+-- -----------------------------------------------------------------------------
+-- ASCII
+
+-- | @since 4.8.2.0
+ascii :: TextEncoding
+ascii = mkAscii ErrorOnCodingFailure
+
+-- | @since 4.8.2.0
+mkAscii :: CodingFailureMode -> TextEncoding
+mkAscii cfm = TextEncoding { textEncodingName = "ASCII",
+                             mkTextDecoder = ascii_DF cfm,
+                             mkTextEncoder = ascii_EF cfm }
+
+ascii_DF :: CodingFailureMode -> IO (TextDecoder ())
+ascii_DF cfm =
+  return (BufferCodec {
+             encode   = ascii_decode,
+             recover  = recoverDecode cfm,
+             close    = return (),
+             getState = return (),
+             setState = const $ return ()
+          })
+
+ascii_EF :: CodingFailureMode -> IO (TextEncoder ())
+ascii_EF cfm =
+  return (BufferCodec {
+             encode   = ascii_encode,
+             recover  = recoverEncode cfm,
+             close    = return (),
+             getState = return (),
+             setState = const $ return ()
+          })
+
+
+
+-- -----------------------------------------------------------------------------
+-- The actual decoders and encoders
+
+-- TODO: Eliminate code duplication between the checked and unchecked
+-- versions of the decoder or encoder (but don't change the Core!)
 
 latin1_decode :: DecodeBuffer
 latin1_decode 
@@ -112,6 +155,30 @@ latin1_decode
     in
     loop ir0 ow0
 
+ascii_decode :: DecodeBuffer
+ascii_decode
+  input@Buffer{  bufRaw=iraw, bufL=ir0, bufR=iw,  bufSize=_  }
+  output@Buffer{ bufRaw=oraw, bufL=_,   bufR=ow0, bufSize=os }
+ = let
+       loop !ir !ow
+         | ow >= os = done OutputUnderflow ir ow
+         | ir >= iw = done InputUnderflow ir ow
+         | otherwise = do
+              c0 <- readWord8Buf iraw ir
+              if c0 > 0x7f then invalid else do
+              ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral c0))
+              loop (ir+1) ow'
+         where
+           invalid = done InvalidSequence ir ow
+
+       -- lambda-lifted, to avoid thunks being built in the inner-loop:
+       done why !ir !ow = return (why,
+                                  if ir == iw then input{ bufL=0, bufR=0 }
+                                              else input{ bufL=ir },
+                                  output{ bufR=ow })
+    in
+    loop ir0 ow0
+
 latin1_encode :: EncodeBuffer
 latin1_encode
   input@Buffer{  bufRaw=iraw, bufL=ir0, bufR=iw,  bufSize=_  }
@@ -132,7 +199,15 @@ latin1_encode
     loop ir0 ow0
 
 latin1_checked_encode :: EncodeBuffer
-latin1_checked_encode
+latin1_checked_encode input output
+ = single_byte_checked_encode 0xff input output
+
+ascii_encode :: EncodeBuffer
+ascii_encode input output
+ = single_byte_checked_encode 0x7f input output
+
+single_byte_checked_encode :: Int -> EncodeBuffer
+single_byte_checked_encode max_legal_char
   input@Buffer{  bufRaw=iraw, bufL=ir0, bufR=iw,  bufSize=_  }
   output@Buffer{ bufRaw=oraw, bufL=_,   bufR=ow0, bufSize=os }
  = let
@@ -145,11 +220,11 @@ latin1_checked_encode
         | ir >= iw = done InputUnderflow ir ow
         | otherwise = do
            (c,ir') <- readCharBuf iraw ir
-           if ord c > 0xff then invalid else do
+           if ord c > max_legal_char then invalid else do
            writeWord8Buf oraw ow (fromIntegral (ord c))
            loop ir' (ow+1)
         where
            invalid = done InvalidSequence ir ow
     in
     loop ir0 ow0
-
+{-# INLINE single_byte_checked_encode #-}
diff --git a/libraries/base/GHC/TopHandler.hs b/libraries/base/GHC/TopHandler.hs
index d9010697b38..05c905f7c10 100644
--- a/libraries/base/GHC/TopHandler.hs
+++ b/libraries/base/GHC/TopHandler.hs
@@ -180,7 +180,8 @@ disasterHandler exit _ =
     msgStr =
         "encountered an exception while trying to report an exception." ++
         "One possible reason for this is that we failed while trying to " ++
-        "encode an error message. Check that your locale configured properly."
+        "encode an error message. Check that your locale is configured " ++
+        "properly."
 
 {- Note [Disaster with iconv]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/libraries/base/tests/.gitignore b/libraries/base/tests/.gitignore
index af90b5e47c8..a430bd700ab 100644
--- a/libraries/base/tests/.gitignore
+++ b/libraries/base/tests/.gitignore
@@ -105,6 +105,7 @@
 /IO/encoding002
 /IO/encoding003
 /IO/encoding004
+/IO/encoding005
 /IO/encodingerror001
 /IO/environment001
 /IO/finalization001
diff --git a/libraries/base/tests/IO/all.T b/libraries/base/tests/IO/all.T
index e15c84d9f83..fe8faa0ce05 100644
--- a/libraries/base/tests/IO/all.T
+++ b/libraries/base/tests/IO/all.T
@@ -145,6 +145,7 @@ test('encoding001',
 test('encoding002', normal, compile_and_run, [''])
 test('encoding003', normal, compile_and_run, [''])
 test('encoding004', normal, compile_and_run, [''])
+test('encoding005', normal, compile_and_run, [''])
 
 test('environment001',
      [extra_clean(['environment001'])],
diff --git a/libraries/base/tests/IO/encoding005.hs b/libraries/base/tests/IO/encoding005.hs
new file mode 100644
index 00000000000..99db84af596
--- /dev/null
+++ b/libraries/base/tests/IO/encoding005.hs
@@ -0,0 +1,115 @@
+import Control.Monad
+import Data.Word (Word8)
+import Foreign.Ptr
+import Foreign.Marshal.Array
+import GHC.Foreign (peekCStringLen, withCStringLen)
+import GHC.IO.Encoding.Failure (CodingFailureMode(..))
+import qualified GHC.IO.Encoding.Latin1 as Latin1
+import System.IO
+import System.IO.Error
+
+-- Tests for single-byte encodings that map directly to Unicode
+-- (module GHC.IO.Encoding.Latin1)
+
+eitherToMaybe :: Either a b -> Maybe b
+eitherToMaybe (Left _) = Nothing
+eitherToMaybe (Right b) = Just b
+
+decode :: TextEncoding -> [Word8] -> IO (Maybe String)
+decode enc xs = fmap eitherToMaybe . tryIOError $ withArrayLen xs (\sz p -> peekCStringLen enc (castPtr p, sz))
+
+encode :: TextEncoding -> String -> IO (Maybe [Word8])
+encode enc cs = fmap eitherToMaybe . tryIOError $ withCStringLen enc cs (\(p, sz) -> peekArray sz (castPtr p))
+
+testIO :: (Eq a, Show a) => IO a -> a -> IO ()
+testIO action expected = do
+  result <- action
+  when (result /= expected) $
+    putStrLn $ "Test failed: expected " ++ show expected ++ ", but got " ++ show result
+
+-- Test char8-like encodings
+test_char8 :: TextEncoding -> IO ()
+test_char8 enc = do
+  testIO (decode enc [0..0xff]) $ Just ['\0'..'\xff']
+
+  testIO (encode enc ['\0'..'\x200']) $ Just ([0..0xff] ++ [0..0xff] ++ [0])
+
+-- Test latin1-like encodings
+test_latin1 :: CodingFailureMode -> TextEncoding -> IO ()
+test_latin1 cfm enc = do
+  testIO (decode enc [0..0xff]) $ Just ['\0'..'\xff']
+
+  testIO (encode enc ['\0'..'\xff']) $ Just [0..0xff]
+  testIO (encode enc "\xfe\xff\x100\x101\x100\xff\xfe") $ case cfm of
+    ErrorOnCodingFailure -> Nothing
+    IgnoreCodingFailure -> Just [0xfe,0xff,0xff,0xfe]
+    TransliterateCodingFailure -> Just [0xfe,0xff,0x3f,0x3f,0x3f,0xff,0xfe]
+    -- N.B. The argument "LATIN1//TRANSLIT" to mkTextEncoding does not
+    -- correspond to "LATIN1//TRANSLIT" in iconv! Instead GHC asks iconv
+    -- to encode to "LATIN1" and uses its own "evil hack" to insert '?'
+    -- (ASCII 0x3f) in place of failures. See GHC.IO.Encoding.recoverEncode.
+    --
+    -- U+0100 is LATIN CAPITAL LETTER A WITH MACRON, which iconv would
+    -- transliterate to 'A' (ASCII 0x41). Similarly iconv would
+    -- transliterate U+0101 LATIN SMALL LETTER A WITH MACRON to 'a'
+    -- (ASCII 0x61).
+    RoundtripFailure -> Nothing
+
+test_ascii :: CodingFailureMode -> TextEncoding -> IO ()
+test_ascii cfm enc = do
+  testIO (decode enc [0..0x7f]) $ Just ['\0'..'\x7f']
+  testIO (decode enc [0x7e,0x7f,0x80,0x81,0x80,0x7f,0x7e]) $ case cfm of
+    ErrorOnCodingFailure -> Nothing
+    IgnoreCodingFailure -> Just "\x7e\x7f\x7f\x7e"
+    TransliterateCodingFailure -> Just "\x7e\x7f\xfffd\xfffd\xfffd\x7f\x7e"
+    -- Another GHC special: decode invalid input to the Char U+FFFD
+    -- REPLACEMENT CHARACTER.
+    RoundtripFailure -> Just "\x7e\x7f\xdc80\xdc81\xdc80\x7f\x7e"
+    -- GHC's PEP383-style String-encoding of invalid input,
+    -- see Note [Roundtripping]
+
+  testIO (encode enc ['\0'..'\x7f']) $ Just [0..0x7f]
+  testIO (encode enc "\x7e\x7f\x80\x81\x80\x7f\xe9") $ case cfm of
+    ErrorOnCodingFailure -> Nothing
+    IgnoreCodingFailure -> Just [0x7e,0x7f,0x7f]
+    TransliterateCodingFailure -> Just [0x7e,0x7f,0x3f,0x3f,0x3f,0x7f,0x3f]
+    -- See comment in test_latin1. iconv -t ASCII//TRANSLIT would encode
+    -- U+00E9 LATIN SMALL LETTER E WITH ACUTE as 'e' (ASCII 0x65).
+    RoundtripFailure -> Nothing
+
+  -- Test roundtripping for good measure
+  case cfm of
+    RoundtripFailure -> do
+      Just s <- decode enc [0..0xff]
+      testIO (encode enc s) $ Just [0..0xff]
+    _ -> return ()
+
+main = do
+  putStrLn "char8 tests"
+  test_char8 char8              -- char8 never fails in either direction
+
+  -- These use GHC's own implementation
+  putStrLn "Latin1.ascii tests"
+  test_ascii ErrorOnCodingFailure (Latin1.ascii)
+  test_ascii IgnoreCodingFailure (Latin1.mkAscii IgnoreCodingFailure)
+  test_ascii TransliterateCodingFailure (Latin1.mkAscii TransliterateCodingFailure)
+  test_ascii RoundtripFailure (Latin1.mkAscii RoundtripFailure)
+
+  putStrLn "Latin1.latin1_checked tests"
+  test_latin1 ErrorOnCodingFailure (Latin1.latin1_checked)
+  test_latin1 IgnoreCodingFailure (Latin1.mkLatin1_checked IgnoreCodingFailure)
+  test_latin1 TransliterateCodingFailure (Latin1.mkLatin1_checked TransliterateCodingFailure)
+  test_latin1 RoundtripFailure (Latin1.mkLatin1_checked RoundtripFailure)
+
+  -- These use iconv (normally, unless it is broken)
+  putStrLn "mkTextEncoding ASCII tests"
+  test_ascii ErrorOnCodingFailure =<< mkTextEncoding "ASCII"
+  test_ascii IgnoreCodingFailure =<< mkTextEncoding "ASCII//IGNORE"
+  test_ascii TransliterateCodingFailure =<< mkTextEncoding "ASCII//TRANSLIT"
+  test_ascii RoundtripFailure =<< mkTextEncoding "ASCII//ROUNDTRIP"
+
+  putStrLn "mkTextEncoding LATIN1 tests"
+  test_latin1 ErrorOnCodingFailure =<< mkTextEncoding "LATIN1"
+  test_latin1 IgnoreCodingFailure =<< mkTextEncoding "LATIN1//IGNORE"
+  test_latin1 TransliterateCodingFailure =<< mkTextEncoding "LATIN1//TRANSLIT"
+  test_latin1 RoundtripFailure =<< mkTextEncoding "LATIN1//ROUNDTRIP"
diff --git a/libraries/base/tests/IO/encoding005.stdout b/libraries/base/tests/IO/encoding005.stdout
new file mode 100644
index 00000000000..664a1935924
--- /dev/null
+++ b/libraries/base/tests/IO/encoding005.stdout
@@ -0,0 +1,5 @@
+char8 tests
+Latin1.ascii tests
+Latin1.latin1_checked tests
+mkTextEncoding ASCII tests
+mkTextEncoding LATIN1 tests
-- 
GitLab