StringBuffer.lhs 7.83 KB
Newer Older
1
%
Simon Marlow's avatar
Simon Marlow committed
2
% (c) The University of Glasgow 2006
3
% (c) The University of Glasgow, 1997-2006
4
5
6
7
8
%

Buffers for scanning string input stored in external arrays.

\begin{code}
9
{-# OPTIONS -w #-}
10
11
12
-- The above warning supression flag is a temporary kludge.
-- While working on this module you are encouraged to remove it and fix
-- any warnings in the module. See
Ian Lynagh's avatar
Ian Lynagh committed
13
--     http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings
14
15
-- for details

16
17
module StringBuffer
       (
18
        StringBuffer(..),
19
	-- non-abstract for vs\/HaskellService
20

21
	 -- * Creation\/destruction
22
        hGetStringBuffer,
23
24
        hGetStringBufferBlock,
        appendStringBuffers,
25
	stringToStringBuffer,
26

27
28
29
30
31
	-- * Inspection
	nextChar,
	currentChar,
	prevChar,
	atEnd,
32

33
34
35
36
	-- * Moving and comparison
	stepOn,
	offsetBytes,
	byteDiff,
37

38
        -- * Conversion
39
40
        lexemeToString,
        lexemeToFastString,
41
42

	 -- * Parsing integers
43
	parseUnsignedInteger,
44
       ) where
45

46
#include "HsVersions.h"
47

48
import Encoding
49
import FastString		( FastString,mkFastString,mkFastStringBytes )
50

51
import Foreign
52
53
import System.IO		( hGetBuf, hFileSize,IOMode(ReadMode), hClose
                                , Handle, hTell )
sof's avatar
sof committed
54

55
import GHC.Exts
56
57
58
import GHC.IOBase		( IO(..) )
import GHC.Base			( unsafeChr )

sof's avatar
sof committed
59
60
61
62
63
#if __GLASGOW_HASKELL__ >= 601
import System.IO		( openBinaryFile )
#else
import IOExts                   ( openFileEx, IOModeEx(..) )
#endif
64

sof's avatar
sof committed
65
66
67
#if __GLASGOW_HASKELL__ < 601
openBinaryFile fp mode = openFileEx fp (BinaryMode mode)
#endif
68

69
70
71
-- -----------------------------------------------------------------------------
-- The StringBuffer type

72
73
74
75
76
77
78
-- |A StringBuffer is an internal pointer to a sized chunk of bytes.
-- The bytes are intended to be *immutable*.  There are pure
-- operations to read the contents of a StringBuffer.
--
-- A StringBuffer may have a finalizer, depending on how it was
-- obtained.
--
79
data StringBuffer
80
81
82
83
84
85
86
87
88
 = StringBuffer {
     buf :: {-# UNPACK #-} !(ForeignPtr Word8),
     len :: {-# UNPACK #-} !Int, 	-- length
     cur :: {-# UNPACK #-} !Int		-- current pos
  }
	-- The buffer is assumed to be UTF-8 encoded, and furthermore
	-- we add three '\0' bytes to the end as sentinels so that the
	-- decoder doesn't have to check for overflow at every single byte
	-- of a multibyte sequence.
89

90
instance Show StringBuffer where
91
92
93
	showsPrec _ s = showString "<stringbuffer(" 
		      . shows (len s) . showString "," . shows (cur s)
		      . showString ">"
94

95
96
97
-- -----------------------------------------------------------------------------
-- Creation / Destruction

98
99
hGetStringBuffer :: FilePath -> IO StringBuffer
hGetStringBuffer fname = do
sof's avatar
sof committed
100
   h <- openBinaryFile fname ReadMode
101
102
103
104
105
106
107
   size_i <- hFileSize h
   let size = fromIntegral size_i
   buf <- mallocForeignPtrArray (size+3)
   withForeignPtr buf $ \ptr -> do
     r <- if size == 0 then return 0 else hGetBuf h ptr size
     hClose h
     if (r /= size)
108
	then ioError (userError "short read of file")
109
	else newUTF8StringBuffer buf ptr size
110

111
112
113
114
115
116
117
118
119
hGetStringBufferBlock :: Handle -> Int -> IO StringBuffer
hGetStringBufferBlock handle wanted
    = do size_i <- hFileSize handle
         offset_i <- hTell handle
         let size = min wanted (fromIntegral $ size_i-offset_i)
         buf <- mallocForeignPtrArray (size+3)
         withForeignPtr buf $ \ptr ->
             do r <- if size == 0 then return 0 else hGetBuf handle ptr size
                if r /= size
120
                   then ioError (userError $ "short read of file: "++show(r,size,size_i,handle))
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
                   else newUTF8StringBuffer buf ptr size

newUTF8StringBuffer :: ForeignPtr Word8 -> Ptr Word8 -> Int -> IO StringBuffer
newUTF8StringBuffer buf ptr size = do
  pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
	 -- sentinels for UTF-8 decoding
  let 
      sb0 = StringBuffer buf size 0
      (first_char, sb1) = nextChar sb0
        -- skip the byte-order mark if there is one (see #1744)
        -- This is better than treating #FEFF as whitespace,
        -- because that would mess up layout.  We don't have a concept
        -- of zero-width whitespace in Haskell: all whitespace codepoints
        -- have a width of one column.
  return (if first_char == '\xfeff' then sb1 else sb0)
136
137
138
139
140
141
142
143
144
145
146
147
148
149

appendStringBuffers :: StringBuffer -> StringBuffer -> IO StringBuffer
appendStringBuffers sb1 sb2
    = do newBuf <- mallocForeignPtrArray (size+3)
         withForeignPtr newBuf $ \ptr ->
          withForeignPtr (buf sb1) $ \sb1Ptr ->
           withForeignPtr (buf sb2) $ \sb2Ptr ->
             do copyArray (sb1Ptr `advancePtr` cur sb1) ptr (calcLen sb1)
                copyArray (sb2Ptr `advancePtr` cur sb2) (ptr `advancePtr` cur sb1) (calcLen sb2)
                pokeArray (ptr `advancePtr` size) [0,0,0]
                return (StringBuffer newBuf size 0)
    where calcLen sb = len sb - cur sb
          size = calcLen sb1 + calcLen sb2

150
stringToStringBuffer :: String -> IO StringBuffer
151
stringToStringBuffer str = do
152
153
154
155
156
157
158
  let size = utf8EncodedLength str
  buf <- mallocForeignPtrArray (size+3)
  withForeignPtr buf $ \ptr -> do
    utf8EncodeString ptr str
    pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
	 -- sentinels for UTF-8 decoding
  return (StringBuffer buf size 0)
159

160
-- -----------------------------------------------------------------------------
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
-- Grab a character

-- Getting our fingers dirty a little here, but this is performance-critical
{-# INLINE nextChar #-}
nextChar :: StringBuffer -> (Char,StringBuffer)
nextChar (StringBuffer buf len (I# cur#)) =
  inlinePerformIO $ do
    withForeignPtr buf $ \(Ptr a#) -> do
	case utf8DecodeChar# (a# `plusAddr#` cur#) of
	  (# c#, b# #) ->
	     let cur' = I# (b# `minusAddr#` a#) in
	     return (C# c#, StringBuffer buf len cur')

currentChar :: StringBuffer -> Char
currentChar = fst . nextChar
176

177
prevChar :: StringBuffer -> Char -> Char
178
179
180
181
182
183
prevChar (StringBuffer buf len 0)   deflt = deflt
prevChar (StringBuffer buf len cur) deflt = 
  inlinePerformIO $ do
    withForeignPtr buf $ \p -> do
      p' <- utf8PrevChar (p `plusPtr` cur)
      return (fst (utf8DecodeChar p'))
184

185
186
-- -----------------------------------------------------------------------------
-- Moving
187

188
stepOn :: StringBuffer -> StringBuffer
189
190
191
192
stepOn s = snd (nextChar s)

offsetBytes :: Int -> StringBuffer -> StringBuffer
offsetBytes i s = s { cur = cur s + i }
193

194
195
byteDiff :: StringBuffer -> StringBuffer -> Int
byteDiff s1 s2 = cur s2 - cur s1
196

197
atEnd :: StringBuffer -> Bool
198
atEnd (StringBuffer _ l c) = l == c
199

200
201
-- -----------------------------------------------------------------------------
-- Conversion
202

203
lexemeToString :: StringBuffer -> Int {-bytes-} -> String
204
lexemeToString _ 0 = ""
205
206
207
208
lexemeToString (StringBuffer buf _ cur) bytes =
  inlinePerformIO $ 
    withForeignPtr buf $ \ptr -> 
      utf8DecodeString (ptr `plusPtr` cur) bytes
209

210
lexemeToFastString :: StringBuffer -> Int {-bytes-} -> FastString
211
lexemeToFastString _ 0 = mkFastString ""
212
213
214
215
lexemeToFastString (StringBuffer buf _ cur) len =
   inlinePerformIO $
     withForeignPtr buf $ \ptr ->
       return $! mkFastStringBytes (ptr `plusPtr` cur) len
216
217
218
219

-- -----------------------------------------------------------------------------
-- Parsing integer strings in various bases

220
221
222
223
224
225
byteOff :: StringBuffer -> Int -> Char
byteOff (StringBuffer buf _ cur) i = 
  inlinePerformIO $ withForeignPtr buf $ \ptr -> do
    w <- peek (ptr `plusPtr` (cur+i))
    return (unsafeChr (fromIntegral (w::Word8)))

226
-- | XXX assumes ASCII digits only (by using byteOff)
227
228
parseUnsignedInteger :: StringBuffer -> Int -> Integer -> (Char->Int) -> Integer
parseUnsignedInteger buf len radix char_to_int 
229
  = go 0 0
230
231
232
233
  where
    go i x | i == len  = x
           | otherwise = go (i+1)
              (x * radix + toInteger (char_to_int (byteOff buf i)))
234
235
236
237
238
239
240
241
242

-- -----------------------------------------------------------------------------
-- under the carpet

-- Just like unsafePerformIO, but we inline it.
{-# INLINE inlinePerformIO #-}
inlinePerformIO :: IO a -> a
inlinePerformIO (IO m) = case m realWorld# of (# _, r #)   -> r

243
\end{code}