Commit 1e74d272 authored by Simon Marlow's avatar Simon Marlow

Save and restore the codec state when re-decoding

We previously had an ugly hack to check for a BOM when re-decoding
some binary data in flushCharBuffer.  The hack was there essentially
because codecs like UTF-16 have a state, and we had not restored it.
This patch gives codecs an explicit state, and implemented
saving/restoring of the state as necessary.  Hence, the hack in
flushCharBuffer is replaced by a more general mechanism that works for
any codec with state.

Unfortunately, iconv doesn't give us a way to save and restore the
state, so this is currently only implemented for the built-in codecs.
parent bbbf03ed
......@@ -135,7 +135,7 @@ mkTextEncoding charset = do
newIConv :: String -> String
-> (IConv -> Buffer a -> Buffer b -> IO (Buffer a, Buffer b))
-> IO (BufferCodec a b)
-> IO (BufferCodec a b ())
newIConv from to fn =
withCString from $ \ from_str ->
withCString to $ \ to_str -> do
......@@ -144,7 +144,10 @@ newIConv from to fn =
return ()
return BufferCodec{
encode = fn iconvt,
close = iclose
close = iclose,
-- iconv doesn't supply a way to save/restore the state
getState = return (),
setState = const $ return ()
}
iconvDecode :: IConv -> Buffer Word8 -> Buffer CharBufElem
......
......@@ -42,18 +42,36 @@ latin1 :: TextEncoding
latin1 = TextEncoding { mkTextDecoder = latin1_DF,
mkTextEncoder = latin1_EF }
latin1_DF :: IO TextDecoder
latin1_DF = return (BufferCodec latin1_decode (return ()))
latin1_DF :: IO (TextDecoder ())
latin1_DF =
return (BufferCodec {
encode = latin1_decode,
close = return (),
getState = return (),
setState = const $ return ()
})
latin1_EF :: IO TextEncoder
latin1_EF = return (BufferCodec latin1_encode (return ()))
latin1_EF :: IO (TextEncoder ())
latin1_EF =
return (BufferCodec {
encode = latin1_encode,
close = return (),
getState = return (),
setState = const $ return ()
})
latin1_checked :: TextEncoding
latin1_checked = TextEncoding { mkTextDecoder = latin1_DF,
mkTextEncoder = latin1_checked_EF }
latin1_checked_EF :: IO TextEncoder
latin1_checked_EF = return (BufferCodec latin1_checked_encode (return ()))
latin1_checked_EF :: IO (TextEncoder ())
latin1_checked_EF =
return (BufferCodec {
encode = latin1_checked_encode,
close = return (),
getState = return (),
setState = const $ return ()
})
latin1_decode :: DecodeBuffer
......
......@@ -28,7 +28,7 @@ import GHC.IO.Buffer
-- -----------------------------------------------------------------------------
-- Text encoders/decoders
data BufferCodec from to = BufferCodec {
data BufferCodec from to state = BufferCodec {
encode :: Buffer from -> Buffer to -> IO (Buffer from, Buffer to),
-- ^ The @encode@ function translates elements of the buffer @from@
-- to the buffer @to@. It should translate as many elements as possible
......@@ -45,10 +45,27 @@ data BufferCodec from to = BufferCodec {
-- library in order to report translation errors at the point they
-- actually occur, rather than when the buffer is translated.
--
close :: IO ()
close :: IO (),
-- ^ Resources associated with the encoding may now be released.
-- The @encode@ function may not be called again after calling
-- @close@.
getState :: IO state,
-- ^ Return the current state of the codec.
--
-- Many codecs are not stateful, and in these case the state can be
-- represented as '()'. Other codecs maintain a state. For
-- example, UTF-16 recognises a BOM (byte-order-mark) character at
-- the beginning of the input, and remembers thereafter whether to
-- use big-endian or little-endian mode. In this case, the state
-- of the codec would include two pieces of information: whether we
-- are at the beginning of the stream (the BOM only occurs at the
-- beginning), and if not, whether to use the big or little-endian
-- encoding.
setState :: state -> IO()
-- restore the state of the codec using the state from a previous
-- call to 'getState'.
}
type DecodeBuffer = Buffer Word8 -> Buffer Char
......@@ -57,16 +74,16 @@ type DecodeBuffer = Buffer Word8 -> Buffer Char
type EncodeBuffer = Buffer Char -> Buffer Word8
-> IO (Buffer Char, Buffer Word8)
type TextDecoder = BufferCodec Word8 CharBufElem
type TextEncoder = BufferCodec CharBufElem Word8
type TextDecoder state = BufferCodec Word8 CharBufElem state
type TextEncoder state = BufferCodec CharBufElem Word8 state
-- | A 'TextEncoding' is a specification of a conversion scheme
-- between sequences of bytes and sequences of Unicode characters.
--
-- For example, UTF-8 is an encoding of Unicode characters into a sequence
-- of bytes. The 'TextEncoding' for UTF-8 is 'utf_8'.
-- of bytes. The 'TextEncoding' for UTF-8 is 'utf8'.
data TextEncoding
= TextEncoding {
mkTextDecoder :: IO TextDecoder,
mkTextEncoder :: IO TextEncoder
= forall dstate estate . TextEncoding {
mkTextDecoder :: IO (TextDecoder dstate),
mkTextEncoder :: IO (TextEncoder estate)
}
......@@ -62,15 +62,25 @@ utf16 :: TextEncoding
utf16 = TextEncoding { mkTextDecoder = utf16_DF,
mkTextEncoder = utf16_EF }
utf16_DF :: IO TextDecoder
utf16_DF :: IO (TextDecoder (Maybe DecodeBuffer))
utf16_DF = do
seen_bom <- newIORef Nothing
return (BufferCodec (utf16_decode seen_bom) (return ()))
utf16_EF :: IO TextEncoder
return (BufferCodec {
encode = utf16_decode seen_bom,
close = return (),
getState = readIORef seen_bom,
setState = writeIORef seen_bom
})
utf16_EF :: IO (TextEncoder Bool)
utf16_EF = do
done_bom <- newIORef False
return (BufferCodec (utf16_encode done_bom) (return ()))
return (BufferCodec {
encode = utf16_encode done_bom,
close = return (),
getState = readIORef done_bom,
setState = writeIORef done_bom
})
utf16_encode :: IORef Bool -> EncodeBuffer
utf16_encode done_bom input
......@@ -131,23 +141,45 @@ utf16be :: TextEncoding
utf16be = TextEncoding { mkTextDecoder = utf16be_DF,
mkTextEncoder = utf16be_EF }
utf16be_DF :: IO TextDecoder
utf16be_DF = return (BufferCodec utf16be_decode (return ()))
utf16be_EF :: IO TextEncoder
utf16be_EF = return (BufferCodec utf16be_encode (return ()))
utf16be_DF :: IO (TextDecoder ())
utf16be_DF =
return (BufferCodec {
encode = utf16be_decode,
close = return (),
getState = return (),
setState = const $ return ()
})
utf16be_EF :: IO (TextEncoder ())
utf16be_EF =
return (BufferCodec {
encode = utf16be_encode,
close = return (),
getState = return (),
setState = const $ return ()
})
utf16le :: TextEncoding
utf16le = TextEncoding { mkTextDecoder = utf16le_DF,
mkTextEncoder = utf16le_EF }
utf16le_DF :: IO TextDecoder
utf16le_DF = return (BufferCodec utf16le_decode (return ()))
utf16le_EF :: IO TextEncoder
utf16le_EF = return (BufferCodec utf16le_encode (return ()))
utf16le_DF :: IO (TextDecoder ())
utf16le_DF =
return (BufferCodec {
encode = utf16le_decode,
close = return (),
getState = return (),
setState = const $ return ()
})
utf16le_EF :: IO (TextEncoder ())
utf16le_EF =
return (BufferCodec {
encode = utf16le_encode,
close = return (),
getState = return (),
setState = const $ return ()
})
utf16be_decode :: DecodeBuffer
......
......@@ -51,15 +51,25 @@ utf32 :: TextEncoding
utf32 = TextEncoding { mkTextDecoder = utf32_DF,
mkTextEncoder = utf32_EF }
utf32_DF :: IO TextDecoder
utf32_DF :: IO (TextDecoder (Maybe DecodeBuffer))
utf32_DF = do
seen_bom <- newIORef Nothing
return (BufferCodec (utf32_decode seen_bom) (return ()))
utf32_EF :: IO TextEncoder
return (BufferCodec {
encode = utf32_decode seen_bom,
close = return (),
getState = readIORef seen_bom,
setState = writeIORef seen_bom
})
utf32_EF :: IO (TextEncoder Bool)
utf32_EF = do
done_bom <- newIORef False
return (BufferCodec (utf32_encode done_bom) (return ()))
return (BufferCodec {
encode = utf32_encode done_bom,
close = return (),
getState = readIORef done_bom,
setState = writeIORef done_bom
})
utf32_encode :: IORef Bool -> EncodeBuffer
utf32_encode done_bom input
......@@ -123,23 +133,46 @@ utf32be :: TextEncoding
utf32be = TextEncoding { mkTextDecoder = utf32be_DF,
mkTextEncoder = utf32be_EF }
utf32be_DF :: IO TextDecoder
utf32be_DF = return (BufferCodec utf32be_decode (return ()))
utf32be_EF :: IO TextEncoder
utf32be_EF = return (BufferCodec utf32be_encode (return ()))
utf32be_DF :: IO (TextDecoder ())
utf32be_DF =
return (BufferCodec {
encode = utf32be_decode,
close = return (),
getState = return (),
setState = const $ return ()
})
utf32be_EF :: IO (TextEncoder ())
utf32be_EF =
return (BufferCodec {
encode = utf32be_encode,
close = return (),
getState = return (),
setState = const $ return ()
})
utf32le :: TextEncoding
utf32le = TextEncoding { mkTextDecoder = utf32le_DF,
mkTextEncoder = utf32le_EF }
utf32le_DF :: IO TextDecoder
utf32le_DF = return (BufferCodec utf32le_decode (return ()))
utf32le_EF :: IO TextEncoder
utf32le_EF = return (BufferCodec utf32le_encode (return ()))
utf32le_DF :: IO (TextDecoder ())
utf32le_DF =
return (BufferCodec {
encode = utf32le_decode,
close = return (),
getState = return (),
setState = const $ return ()
})
utf32le_EF :: IO (TextEncoder ())
utf32le_EF =
return (BufferCodec {
encode = utf32le_encode,
close = return (),
getState = return (),
setState = const $ return ()
})
utf32be_decode :: DecodeBuffer
......
......@@ -39,11 +39,23 @@ utf8 :: TextEncoding
utf8 = TextEncoding { mkTextDecoder = utf8_DF,
mkTextEncoder = utf8_EF }
utf8_DF :: IO TextDecoder
utf8_DF = return (BufferCodec utf8_decode (return ()))
utf8_EF :: IO TextEncoder
utf8_EF = return (BufferCodec utf8_encode (return ()))
utf8_DF :: IO (TextDecoder ())
utf8_DF =
return (BufferCodec {
encode = utf8_decode,
close = return (),
getState = return (),
setState = const $ return ()
})
utf8_EF :: IO (TextEncoder ())
utf8_EF =
return (BufferCodec {
encode = utf8_encode,
close = return (),
getState = return (),
setState = const $ return ()
})
utf8_decode :: DecodeBuffer
utf8_decode
......
......@@ -259,8 +259,10 @@ hSetEncoding :: Handle -> TextEncoding -> IO ()
hSetEncoding hdl encoding = do
withHandle "hSetEncoding" hdl $ \h_@Handle__{..} -> do
flushCharBuffer h_
(mb_encoder,mb_decoder) <- getEncoding (Just encoding) haType
return (Handle__{ haDecoder = mb_decoder, haEncoder = mb_encoder, .. },
openTextEncoding (Just encoding) haType $ \ mb_encoder mb_decoder -> do
bbuf <- readIORef haByteBuffer
ref <- newIORef (error "last_decode")
return (Handle__{ haLastDecode = ref, haDecoder = mb_decoder, haEncoder = mb_encoder, .. },
())
-- -----------------------------------------------------------------------------
......@@ -513,15 +515,21 @@ hSetBinaryMode handle bin =
withAllHandles__ "hSetBinaryMode" handle $ \ h_@Handle__{..} ->
do
flushBuffer h_
let mb_te | bin = Nothing
| otherwise = Just localeEncoding
openTextEncoding mb_te haType $ \ mb_encoder mb_decoder -> do
-- should match the default newline mode, whatever that is
let nl | bin = noNewlineTranslation
| otherwise = nativeNewlineMode
(mb_encoder, mb_decoder) <- getEncoding mb_te haType
return Handle__{ haEncoder = mb_encoder,
bbuf <- readIORef haByteBuffer
ref <- newIORef (error "codec_state", bbuf)
return Handle__{ haLastDecode = ref,
haEncoder = mb_encoder,
haDecoder = mb_decoder,
haInputNL = inputNL nl,
haOutputNL = outputNL nl, .. }
......
......@@ -31,7 +31,7 @@ module GHC.IO.Handle.Internals (
wantSeekableHandle,
mkHandle, mkFileHandle, mkDuplexHandle,
getEncoding, initBufferState,
openTextEncoding, initBufferState,
dEFAULT_CHAR_BUFFER_SIZE,
flushBuffer, flushWriteBuffer, flushWriteBuffer_, flushCharReadBuffer,
......@@ -432,7 +432,7 @@ flushCharReadBuffer Handle__{..} = do
-- haLastDecode is the byte buffer just before we did our last batch of
-- decoding. We're going to re-decode the bytes up to the current char,
-- to find out where we should revert the byte buffer to.
bbuf0 <- readIORef haLastDecode
(codec_state, bbuf0) <- readIORef haLastDecode
cbuf0 <- readIORef haCharBuffer
writeIORef haCharBuffer cbuf0{ bufL=0, bufR=0 }
......@@ -453,24 +453,17 @@ flushCharReadBuffer Handle__{..} = do
Just decoder -> do
debugIO ("flushCharReadBuffer re-decode, bbuf=" ++ summaryBuffer bbuf0 ++
" cbuf=" ++ summaryBuffer cbuf0)
-- restore the codec state
setState decoder codec_state
(bbuf1,cbuf1) <- (encode decoder) bbuf0
cbuf0{ bufL=0, bufR=0, bufSize = bufL cbuf0 }
-- tricky case: if the decoded string starts with e BOM, then it was
-- probably ignored last time we decoded these bytes, and we should
-- therefore decode another char.
(c,_) <- readCharBuf (bufRaw cbuf1) (bufL cbuf1)
(bbuf2,_) <- if (c == '\xfeff')
then do debugIO "found BOM, decoding another char"
(encode decoder) bbuf1
cbuf0{ bufL=0, bufR=0, bufSize = 1 }
else return (bbuf1,cbuf1)
debugIO ("finished, bbuf=" ++ summaryBuffer bbuf1 ++
" cbuf=" ++ summaryBuffer cbuf1)
writeIORef haByteBuffer bbuf2
writeIORef haByteBuffer bbuf1
-- When flushing the byte read buffer, we seek backwards by the number
......@@ -508,12 +501,12 @@ mkHandle :: (IODevice dev, BufferedIO dev, Typeable dev) => dev
-> IO Handle
mkHandle dev filepath ha_type buffered mb_codec nl finalizer other_side = do
openTextEncoding mb_codec ha_type $ \ mb_encoder mb_decoder -> do
let buf_state = initBufferState ha_type
bbuf <- Buffered.newBuffer dev buf_state
bbufref <- newIORef bbuf
last_decode <- newIORef bbuf
(mb_encoder, mb_decoder) <- getEncoding mb_codec ha_type
last_decode <- newIORef (error "codec_state", bbuf)
(cbufref,bmode) <-
if buffered then getCharBuffer dev buf_state
......@@ -585,23 +578,25 @@ initBufferState :: HandleType -> BufferState
initBufferState ReadHandle = ReadBuffer
initBufferState _ = WriteBuffer
getEncoding :: Maybe TextEncoding -> HandleType
-> IO (Maybe TextEncoder,
Maybe TextDecoder)
openTextEncoding
:: Maybe TextEncoding
-> HandleType
-> (forall es ds . Maybe (TextEncoder es) -> Maybe (TextDecoder ds) -> IO a)
-> IO a
getEncoding Nothing ha_type = return (Nothing, Nothing)
getEncoding (Just te) ha_type = do
openTextEncoding Nothing ha_type cont = cont Nothing Nothing
openTextEncoding (Just TextEncoding{..}) ha_type cont = do
mb_decoder <- if isReadableHandleType ha_type then do
decoder <- mkTextDecoder te
decoder <- mkTextDecoder
return (Just decoder)
else
return Nothing
mb_encoder <- if isWritableHandleType ha_type then do
encoder <- mkTextEncoder te
encoder <- mkTextEncoder
return (Just encoder)
else
return Nothing
return (mb_encoder, mb_decoder)
cont mb_encoder mb_decoder
-- ---------------------------------------------------------------------------
-- closing Handles
......@@ -737,10 +732,15 @@ readTextDevice h_@Handle__{..} cbuf = do
debugIO ("readTextDevice after reading: bbuf=" ++ summaryBuffer bbuf1)
writeIORef haLastDecode bbuf1
(bbuf2,cbuf') <- case haDecoder of
Nothing -> latin1_decode bbuf1 cbuf
Just decoder -> (encode decoder) bbuf1 cbuf
(bbuf2,cbuf') <-
case haDecoder of
Nothing -> do
writeIORef haLastDecode (error "codec_state", bbuf1)
latin1_decode bbuf1 cbuf
Just decoder -> do
state <- getState decoder
writeIORef haLastDecode (state, bbuf1)
(encode decoder) bbuf1 cbuf
debugIO ("readTextDevice after decoding: cbuf=" ++ summaryBuffer cbuf' ++
" bbuf=" ++ summaryBuffer bbuf2)
......@@ -766,10 +766,15 @@ readTextDevice' h_@Handle__{..} bbuf0 cbuf = do
debugIO ("readTextDevice after reading: bbuf=" ++ summaryBuffer bbuf2)
writeIORef haLastDecode bbuf2
(bbuf3,cbuf') <- case haDecoder of
Nothing -> latin1_decode bbuf2 cbuf
Just decoder -> (encode decoder) bbuf2 cbuf
(bbuf3,cbuf') <-
case haDecoder of
Nothing -> do
writeIORef haLastDecode (error "codec_state", bbuf2)
latin1_decode bbuf2 cbuf
Just decoder -> do
state <- getState decoder
writeIORef haLastDecode (state, bbuf2)
(encode decoder) bbuf2 cbuf
debugIO ("readTextDevice after decoding: cbuf=" ++ summaryBuffer cbuf' ++
" bbuf=" ++ summaryBuffer bbuf3)
......
......@@ -121,17 +121,17 @@ instance Eq Handle where
_ == _ = False
data Handle__
= forall dev . (IODevice dev, BufferedIO dev, Typeable dev) =>
= forall dev enc_state dec_state . (IODevice dev, BufferedIO dev, Typeable dev) =>
Handle__ {
haDevice :: !dev,
haType :: HandleType, -- type (read/write/append etc.)
haByteBuffer :: !(IORef (Buffer Word8)),
haBufferMode :: BufferMode,
haLastDecode :: !(IORef (Buffer Word8)),
haLastDecode :: !(IORef (dec_state, Buffer Word8)),
haCharBuffer :: !(IORef (Buffer CharBufElem)), -- the current buffer
haBuffers :: !(IORef (BufferList CharBufElem)), -- spare buffers
haEncoder :: Maybe TextEncoder,
haDecoder :: Maybe TextDecoder,
haEncoder :: Maybe (TextEncoder enc_state),
haDecoder :: Maybe (TextDecoder dec_state),
haInputNL :: Newline,
haOutputNL :: Newline,
haOtherSide :: Maybe (MVar Handle__) -- ptr to the write side of a
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment