From 677552f21690761b89255d05e42976679be4d374 Mon Sep 17 00:00:00 2001 From: Ben Gamari <bgamari.foss@gmail.com> Date: Thu, 9 Jul 2015 08:07:54 -0400 Subject: [PATCH] Fix self-contained handling of ASCII encoding D898 was primarily intended to fix hangs in the event that iconv was unavailable (namely #10298 and #7695). In addition to this fix, it also introduced self-contained handling of ANSI terminals to allow compiled executables to run in minimal environments lacking iconv. However, the behavior that the patch introduced is highly suspicious. Specifically, it gives the user a UTF-8 encoding even if they requested ASCII. This has the potential to break quite a lot of code. At very least it breaks GHC's Unicode terminal detection logic, which attempts to catch an invalid character when encoding a pair of smart-quotes. Of course, this exception will never be thrown if a UTF-8 encoder is used. Here we use the `char8` encoding to handle requests for ASCII encodings. Fixes #10623. --- libraries/base/GHC/IO/Encoding.hs | 35 ++++++++++++++++++------- libraries/base/GHC/IO/Encoding/Iconv.hs | 2 ++ libraries/base/GHC/TopHandler.hs | 6 ++++- testsuite/tests/driver/Makefile | 2 +- 4 files changed, 33 insertions(+), 12 deletions(-) diff --git a/libraries/base/GHC/IO/Encoding.hs b/libraries/base/GHC/IO/Encoding.hs index 014b61b8b026..108b0fcf117d 100644 --- a/libraries/base/GHC/IO/Encoding.hs +++ b/libraries/base/GHC/IO/Encoding.hs @@ -30,6 +30,7 @@ module GHC.IO.Encoding ( ) where import GHC.Base +import GHC.Foreign (charIsRepresentable) import GHC.IO.Exception import GHC.IO.Buffer import GHC.IO.Encoding.Failure @@ -235,14 +236,9 @@ mkTextEncoding e = case mb_coding_failure_mode of _ -> Nothing mkTextEncoding' :: CodingFailureMode -> String -> IO TextEncoding -mkTextEncoding' cfm enc - -- First, specifically match on ASCII encodings directly using - -- several possible aliases (specified by RFC 1345 & co), which - -- allows us to handle ASCII conversions without iconv at all (see - -- trac #10298). - | any (== enc) ansiEncNames = return (UTF8.mkUTF8 cfm) - -- Otherwise, handle other encoding needs via iconv. - | otherwise = case [toUpper c | c <- enc, c /= '-'] of +mkTextEncoding' cfm enc = + case [toUpper c | c <- enc, c /= '-'] of + -- UTF-8 and friends we can handle ourselves "UTF8" -> return $ UTF8.mkUTF8 cfm "UTF16" -> return $ UTF16.mkUTF16 cfm "UTF16LE" -> return $ UTF16.mkUTF16le cfm @@ -254,13 +250,32 @@ mkTextEncoding' cfm enc 'C':'P':n | [(cp,"")] <- reads n -> return $ CodePage.mkCodePageEncoding cfm cp _ -> unknownEncodingErr (enc ++ codingFailureModeSuffix cfm) #else - _ -> Iconv.mkIconvEncoding cfm enc -#endif + -- Otherwise, handle other encoding needs via iconv. + + -- Unfortunately there is no good way to determine whether iconv is actually + -- functional without telling it to do something. + _ -> do res <- Iconv.mkIconvEncoding cfm enc + good <- charIsRepresentable res 'a' + let isAscii = any (== enc) ansiEncNames + case good of + True -> return res + -- At this point we know that we can't count on iconv to work + -- (see, for instance, Trac #10298). However, we still want to do + -- what can to work with what we have. For instance, ASCII is + -- easy. We match on ASCII encodings directly using several + -- possible aliases (specified by RFC 1345 & Co) and for this use + -- the 'char8' encodeing + False + | isAscii -> return char8 + | otherwise -> + unknownEncodingErr (enc ++ codingFailureModeSuffix cfm) where ansiEncNames = -- ASCII aliases [ "ANSI_X3.4-1968", "iso-ir-6", "ANSI_X3.4-1986", "ISO_646.irv:1991" , "US-ASCII", "us", "IBM367", "cp367", "csASCII", "ASCII", "ISO646-US" ] +#endif + latin1_encode :: CharBuffer -> Buffer Word8 -> IO (CharBuffer, Buffer Word8) latin1_encode input output = fmap (\(_why,input',output') -> (input',output')) $ Latin1.latin1_encode input output -- unchecked, used for char8 diff --git a/libraries/base/GHC/IO/Encoding/Iconv.hs b/libraries/base/GHC/IO/Encoding/Iconv.hs index 89ca71e084ad..f64d2451cf3f 100644 --- a/libraries/base/GHC/IO/Encoding/Iconv.hs +++ b/libraries/base/GHC/IO/Encoding/Iconv.hs @@ -99,6 +99,8 @@ char_shift | charSize == 2 = 1 iconvEncoding :: String -> IO TextEncoding iconvEncoding = mkIconvEncoding ErrorOnCodingFailure +-- | Construct an iconv-based 'TextEncoding' for the given character set and +-- 'CodingFailureMode'. mkIconvEncoding :: CodingFailureMode -> String -> IO TextEncoding mkIconvEncoding cfm charset = do return (TextEncoding { diff --git a/libraries/base/GHC/TopHandler.hs b/libraries/base/GHC/TopHandler.hs index e725196cdff1..d9010697b387 100644 --- a/libraries/base/GHC/TopHandler.hs +++ b/libraries/base/GHC/TopHandler.hs @@ -176,7 +176,11 @@ disasterHandler exit _ = withCAString "%s" $ \fmt -> withCAString msgStr $ \msg -> errorBelch fmt msg >> exit 1 - where msgStr = "encountered an exception while trying to report an exception" + where + msgStr = + "encountered an exception while trying to report an exception." ++ + "One possible reason for this is that we failed while trying to " ++ + "encode an error message. Check that your locale configured properly." {- Note [Disaster with iconv] ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/testsuite/tests/driver/Makefile b/testsuite/tests/driver/Makefile index 4418ba44ac39..e12f3a5a989a 100644 --- a/testsuite/tests/driver/Makefile +++ b/testsuite/tests/driver/Makefile @@ -551,7 +551,7 @@ T7563: -"$(TEST_HC)" $(TEST_HC_OPTS_NO_RECOMP) -C T7563.hs # Below we set LC_ALL=C to request standard ASCII output in the resulting error -# messagse. Unfortunately, Mac OS X still uses a Unicode encoding even with +# messages. Unfortunately, Mac OS X still uses a Unicode encoding even with # LC_ALL=C, so we expect these tests to fail there. .PHONY: T6037 -- GitLab