Commit 9e939403 authored by thomie's avatar thomie Committed by Austin Seipp

StringBuffer should not contain initial byte-order mark (BOM)

Summary:
Just skipping over a BOM, but leaving it in the Stringbuffer, is not
sufficient. The Lexer calls prevChar when a regular expression starts
with '^' (which is a shorthand for '\n^'). It would never match on the
first line, since instead of '\n', prevChar would still return '\xfeff'.

Test Plan: validate

Reviewers: austin, ezyang

Reviewed By: austin, ezyang

Subscribers: simonmar, ezyang, carter

Differential Revision: https://phabricator.haskell.org/D176

GHC Trac Issues: #6016
parent e81e0280
......@@ -47,9 +47,12 @@ import Encoding
import FastString
import FastTypes
import FastFunctions
import Outputable
import Util
import System.IO ( hGetBuf, hFileSize,IOMode(ReadMode), hClose
, Handle, hTell, openBinaryFile )
import Data.Maybe
import Control.Exception
import System.IO
import System.IO.Unsafe ( unsafePerformIO )
import GHC.Exts
......@@ -89,7 +92,8 @@ hGetStringBuffer :: FilePath -> IO StringBuffer
hGetStringBuffer fname = do
h <- openBinaryFile fname ReadMode
size_i <- hFileSize h
let size = fromIntegral size_i
offset_i <- skipBOM h size_i 0 -- offset is 0 initially
let size = fromIntegral $ size_i - offset_i
buf <- mallocForeignPtrArray (size+3)
withForeignPtr buf $ \ptr -> do
r <- if size == 0 then return 0 else hGetBuf h ptr size
......@@ -101,7 +105,7 @@ hGetStringBuffer fname = do
hGetStringBufferBlock :: Handle -> Int -> IO StringBuffer
hGetStringBufferBlock handle wanted
= do size_i <- hFileSize handle
offset_i <- hTell handle
offset_i <- hTell handle >>= skipBOM handle size_i
let size = min wanted (fromIntegral $ size_i-offset_i)
buf <- mallocForeignPtrArray (size+3)
withForeignPtr buf $ \ptr ->
......@@ -110,19 +114,34 @@ hGetStringBufferBlock handle wanted
then ioError (userError $ "short read of file: "++show(r,size,size_i,handle))
else newUTF8StringBuffer buf ptr size
-- | Skip the byte-order mark if there is one (see #1744 and #6016),
-- and return the new position of the handle in bytes.
--
-- This is better than treating #FEFF as whitespace,
-- because that would mess up layout. We don't have a concept
-- of zero-width whitespace in Haskell: all whitespace codepoints
-- have a width of one column.
skipBOM :: Handle -> Integer -> Integer -> IO Integer
skipBOM h size offset =
-- Only skip BOM at the beginning of a file.
if size > 0 && offset == 0
then do
-- Validate assumption that handle is in binary mode.
ASSERTM( hGetEncoding h >>= return . isNothing )
-- Temporarily select text mode to make `hLookAhead` and
-- `hGetChar` return full Unicode characters.
bracket_ (hSetBinaryMode h False) (hSetBinaryMode h True) $ do
c <- hLookAhead h
if c == '\xfeff'
then hGetChar h >> hTell h
else return offset
else return offset
newUTF8StringBuffer :: ForeignPtr Word8 -> Ptr Word8 -> Int -> IO StringBuffer
newUTF8StringBuffer buf ptr size = do
pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
-- sentinels for UTF-8 decoding
let
sb0 = StringBuffer buf size 0
(first_char, sb1) = nextChar sb0
-- skip the byte-order mark if there is one (see #1744)
-- This is better than treating #FEFF as whitespace,
-- because that would mess up layout. We don't have a concept
-- of zero-width whitespace in Haskell: all whitespace codepoints
-- have a width of one column.
return (if first_char == '\xfeff' then sb1 else sb0)
return $ StringBuffer buf size 0
appendStringBuffers :: StringBuffer -> StringBuffer -> IO StringBuffer
appendStringBuffers sb1 sb2
......
......@@ -1074,6 +1074,8 @@ mk/ghcconfig_*_inplace_bin_ghc-stage2.exe.mk
/tests/parser/should_run/readRun004
/tests/parser/unicode/1744
/tests/parser/unicode/T1744
/tests/parser/unicode/T6016
/tests/parser/unicode/T6016-twoBOMs
/tests/parser/unicode/utf8_024
/tests/patsyn/should_run/bidir-explicit
/tests/patsyn/should_run/bidir-explicit-scope
......
module Main where
import Control.Exception
import Data.Char
import System.IO
import StringBuffer
twoBOMs = "T6016-twoBOMs"
ignoreFirstBOM = do
-- StringBuffer should not contain initial byte-order mark.
--
-- Just skipping over it, but leaving it in the Stringbuffer, is not
-- sufficient. The Lexer calls prevChar when a regular expression
-- starts with '^' (which is a shorthand for '\n^'). It would never
-- match on the first line, since instead of '\n', prevChar would
-- still return '\xfeff'.
s <- hGetStringBuffer twoBOMs
assert (prevChar s '\n' == '\n') return ()
dontIgnoreSecondBOM = do
-- U+FEFF is considered a BOM only if it appears as the first
-- character of a file.
h <- openBinaryFile twoBOMs ReadMode
hSeek h AbsoluteSeek 3
s <- hGetStringBufferBlock h 3
hClose h
assert (currentChar s == '\xfeff') return ()
main = do
writeFile twoBOMs "\xfeff\xfeff"
ignoreFirstBOM
dontIgnoreSecondBOM
......@@ -20,4 +20,5 @@ test('T1744', normal, compile_and_run, [''])
test('T1103', normal, compile, [''])
test('T2302', only_ways(['normal']), compile_fail, [''])
test('T4373', normal, compile, [''])
test('T6016', extra_clean('T6016-twoBOMs'), compile_and_run, ['-package ghc'])
test('T7671', normal, compile, [''])
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment