Lex.hs 18 KB
Newer Older
1
{-# LANGUAGE Trustworthy #-}
2
{-# LANGUAGE NoImplicitPrelude #-}
3

4 5 6 7 8
-----------------------------------------------------------------------------
-- |
-- Module      :  Text.Read.Lex
-- Copyright   :  (c) The University of Glasgow 2002
-- License     :  BSD-style (see the file libraries/base/LICENSE)
9
--
10 11
-- Maintainer  :  libraries@haskell.org
-- Stability   :  provisional
ross's avatar
ross committed
12
-- Portability :  non-portable (uses Text.ParserCombinators.ReadP)
13 14 15 16
--
-- The cut-down Haskell lexer, used by Text.Read
--
-----------------------------------------------------------------------------
17 18 19

module Text.Read.Lex
  -- lexing types
20
  ( Lexeme(..), Number
Don Stewart's avatar
Don Stewart committed
21

22
  , numberToInteger, numberToFixed, numberToRational, numberToRangedRational
23

24
  -- lexer
25
  , lex, expect
26 27
  , hsLex
  , lexChar
Don Stewart's avatar
Don Stewart committed
28

29 30 31 32
  , readIntP
  , readOctP
  , readDecP
  , readHexP
Geraldus's avatar
Geraldus committed
33 34

  , isSymbolChar
35 36 37 38 39 40
  )
 where

import Text.ParserCombinators.ReadP

import GHC.Base
41
import GHC.Char
42
import GHC.Num( Num(..), Integer )
43
import GHC.Show( Show(..) )
44 45
import GHC.Unicode
  ( GeneralCategory(..), generalCategory, isSpace, isAlpha, isAlphaNum )
46 47
import GHC.Real( Rational, (%), fromIntegral, Integral,
                 toInteger, (^), quot, even )
48
import GHC.List
49
import GHC.Enum( minBound, maxBound )
50
import Data.Maybe
51 52 53 54 55 56 57

-- local copy to break import-cycle
-- | @'guard' b@ is @'return' ()@ if @b@ is 'True',
-- and 'mzero' if @b@ is 'False'.
guard           :: (MonadPlus m) => Bool -> m ()
guard True      =  return ()
guard False     =  mzero
58

59 60
-- -----------------------------------------------------------------------------
-- Lexing types
61

ross's avatar
ross committed
62
-- ^ Haskell lexemes.
63
data Lexeme
Don Stewart's avatar
Don Stewart committed
64 65 66 67 68
  = Char   Char         -- ^ Character literal
  | String String       -- ^ String literal, with escapes interpreted
  | Punc   String       -- ^ Punctuation or reserved symbol, e.g. @(@, @::@
  | Ident  String       -- ^ Haskell identifier, e.g. @foo@, @Baz@
  | Symbol String       -- ^ Haskell symbol, e.g. @>>@, @:%@
69
  | Number Number       -- ^ @since 4.6.0.0
70 71
  | EOF
 deriving (Eq, Show)
72

73
-- | @since 4.7.0.0
74 75 76 77 78 79 80
data Number = MkNumber Int              -- Base
                       Digits           -- Integral part
            | MkDecimal Digits          -- Integral part
                        (Maybe Digits)  -- Fractional part
                        (Maybe Integer) -- Exponent
 deriving (Eq, Show)

81
-- | @since 4.5.1.0
82
numberToInteger :: Number -> Maybe Integer
83 84
numberToInteger (MkNumber base iPart) = Just (val (fromIntegral base) iPart)
numberToInteger (MkDecimal iPart Nothing Nothing) = Just (val 10 iPart)
85 86
numberToInteger _ = Nothing

87
-- | @since 4.7.0.0
88
numberToFixed :: Integer -> Number -> Maybe (Integer, Integer)
89 90
numberToFixed _ (MkNumber base iPart) = Just (val (fromIntegral base) iPart, 0)
numberToFixed _ (MkDecimal iPart Nothing Nothing) = Just (val 10 iPart, 0)
91
numberToFixed p (MkDecimal iPart (Just fPart) Nothing)
92 93
    = let i = val 10 iPart
          f = val 10 (integerTake p (fPart ++ repeat 0))
94 95 96 97 98 99 100 101 102 103
          -- Sigh, we really want genericTake, but that's above us in
          -- the hierarchy, so we define our own version here (actually
          -- specialised to Integer)
          integerTake             :: Integer -> [a] -> [a]
          integerTake n _ | n <= 0 = []
          integerTake _ []        =  []
          integerTake n (x:xs)    =  x : integerTake (n-1) xs
      in Just (i, f)
numberToFixed _ _ = Nothing

104 105 106 107 108 109 110 111 112 113 114
-- This takes a floatRange, and if the Rational would be outside of
-- the floatRange then it may return Nothing. Not that it will not
-- /necessarily/ return Nothing, but it is good enough to fix the
-- space problems in #5688
-- Ways this is conservative:
-- * the floatRange is in base 2, but we pretend it is in base 10
-- * we pad the floateRange a bit, just in case it is very small
--   and we would otherwise hit an edge case
-- * We only worry about numbers that have an exponent. If they don't
--   have an exponent then the Rational won't be much larger than the
--   Number, so there is no problem
115
-- | @since 4.5.1.0
116 117 118
numberToRangedRational :: (Int, Int) -> Number
                       -> Maybe Rational -- Nothing = Inf
numberToRangedRational (neg, pos) n@(MkDecimal iPart mFPart (Just exp))
119 120 121 122 123 124
    -- if exp is out of integer bounds,
    -- then the number is definitely out of range
    | exp > fromIntegral (maxBound :: Int) ||
      exp < fromIntegral (minBound :: Int)
    = Nothing
    | otherwise
125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
    = let mFirstDigit = case dropWhile (0 ==) iPart of
                        iPart'@(_ : _) -> Just (length iPart')
                        [] -> case mFPart of
                              Nothing -> Nothing
                              Just fPart ->
                                  case span (0 ==) fPart of
                                  (_, []) -> Nothing
                                  (zeroes, _) ->
                                      Just (negate (length zeroes))
      in case mFirstDigit of
         Nothing -> Just 0
         Just firstDigit ->
             let firstDigit' = firstDigit + fromInteger exp
             in if firstDigit' > (pos + 3)
                then Nothing
                else if firstDigit' < (neg - 3)
                then Just 0
                else Just (numberToRational n)
numberToRangedRational _ n = Just (numberToRational n)

145
-- | @since 4.6.0.0
146
numberToRational :: Number -> Rational
147
numberToRational (MkNumber base iPart) = val (fromIntegral base) iPart % 1
148
numberToRational (MkDecimal iPart mFPart mExp)
149
 = let i = val 10 iPart
150 151 152 153 154 155 156 157 158 159 160 161
   in case (mFPart, mExp) of
      (Nothing, Nothing)     -> i % 1
      (Nothing, Just exp)
       | exp >= 0            -> (i * (10 ^ exp)) % 1
       | otherwise           -> i % (10 ^ (- exp))
      (Just fPart, Nothing)  -> fracExp 0   i fPart
      (Just fPart, Just exp) -> fracExp exp i fPart
      -- fracExp is a bit more efficient in calculating the Rational.
      -- Instead of calculating the fractional part alone, then
      -- adding the integral part and finally multiplying with
      -- 10 ^ exp if an exponent was given, do it all at once.

162 163
-- -----------------------------------------------------------------------------
-- Lexing
164

165 166 167
lex :: ReadP Lexeme
lex = skipSpaces >> lexToken

168
-- | @since 4.7.0.0
169
expect :: Lexeme -> ReadP ()
170
expect lexeme = do { skipSpaces
171 172 173
                   ; thing <- lexToken
                   ; if thing == lexeme then return () else pfail }

174 175
hsLex :: ReadP String
-- ^ Haskell lexer: returns the lexed string, rather than the lexeme
176
hsLex = do skipSpaces
Don Stewart's avatar
Don Stewart committed
177 178
           (s,_) <- gather lexToken
           return s
179 180 181

lexToken :: ReadP Lexeme
lexToken = lexEOF     +++
182 183 184 185 186
           lexLitChar +++
           lexString  +++
           lexPunc    +++
           lexSymbol  +++
           lexId      +++
Don Stewart's avatar
Don Stewart committed
187
           lexNumber
188

189

190
-- ----------------------------------------------------------------------
191 192 193
-- End of file
lexEOF :: ReadP Lexeme
lexEOF = do s <- look
Don Stewart's avatar
Don Stewart committed
194 195
            guard (null s)
            return EOF
196 197 198

-- ---------------------------------------------------------------------------
-- Single character lexemes
199

200 201 202 203
lexPunc :: ReadP Lexeme
lexPunc =
  do c <- satisfy isPuncChar
     return (Punc [c])
204 205 206 207

-- | The @special@ character class as defined in the Haskell Report.
isPuncChar :: Char -> Bool
isPuncChar c = c `elem` ",;()[]{}`"
208 209 210 211 212

-- ----------------------------------------------------------------------
-- Symbols

lexSymbol :: ReadP Lexeme
213 214
lexSymbol =
  do s <- munch1 isSymbolChar
215
     if s `elem` reserved_ops then
Don Stewart's avatar
Don Stewart committed
216
        return (Punc s)         -- Reserved-ops count as punctuation
217
      else
Don Stewart's avatar
Don Stewart committed
218
        return (Symbol s)
Geraldus's avatar
Geraldus committed
219 220 221 222 223 224 225 226 227 228 229 230 231
  where
    reserved_ops   = ["..", "::", "=", "\\", "|", "<-", "->", "@", "~", "=>"]

isSymbolChar :: Char -> Bool
isSymbolChar c = not (isPuncChar c) && case generalCategory c of
    MathSymbol              -> True
    CurrencySymbol          -> True
    ModifierSymbol          -> True
    OtherSymbol             -> True
    DashPunctuation         -> True
    OtherPunctuation        -> not (c `elem` "'\"")
    ConnectorPunctuation    -> c /= '_'
    _                       -> False
232
-- ----------------------------------------------------------------------
233 234
-- identifiers

235
lexId :: ReadP Lexeme
236 237 238
lexId = do c <- satisfy isIdsChar
           s <- munch isIdfChar
           return (Ident (c:s))
239
  where
Don Stewart's avatar
Don Stewart committed
240
          -- Identifiers can start with a '_'
241 242
    isIdsChar c = isAlpha c || c == '_'
    isIdfChar c = isAlphaNum c || c `elem` "_'"
243

244 245
-- ---------------------------------------------------------------------------
-- Lexing character literals
246

247
lexLitChar :: ReadP Lexeme
248
lexLitChar =
249
  do _ <- char '\''
250
     (c,esc) <- lexCharE
Don Stewart's avatar
Don Stewart committed
251
     guard (esc || c /= '\'')   -- Eliminate '' possibility
252
     _ <- char '\''
253 254
     return (Char c)

255 256 257 258 259
lexChar :: ReadP Char
lexChar = do { (c,_) <- lexCharE; return c }

lexCharE :: ReadP (Char, Bool)  -- "escaped or not"?
lexCharE =
Ian Lynagh's avatar
Ian Lynagh committed
260 261 262 263
  do c1 <- get
     if c1 == '\\'
       then do c2 <- lexEsc; return (c2, True)
       else do return (c1, False)
264
 where
265 266 267 268 269
  lexEsc =
    lexEscChar
      +++ lexNumeric
        +++ lexCntrlChar
          +++ lexAscii
270

271 272 273 274 275 276 277 278 279 280 281 282 283 284
  lexEscChar =
    do c <- get
       case c of
         'a'  -> return '\a'
         'b'  -> return '\b'
         'f'  -> return '\f'
         'n'  -> return '\n'
         'r'  -> return '\r'
         't'  -> return '\t'
         'v'  -> return '\v'
         '\\' -> return '\\'
         '\"' -> return '\"'
         '\'' -> return '\''
         _    -> pfail
285

286
  lexNumeric =
287
    do base <- lexBaseChar <++ return 10
288 289 290
       n    <- lexInteger base
       guard (n <= toInteger (ord maxBound))
       return (chr (fromInteger n))
291

292
  lexCntrlChar =
293
    do _ <- char '^'
294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331
       c <- get
       case c of
         '@'  -> return '\^@'
         'A'  -> return '\^A'
         'B'  -> return '\^B'
         'C'  -> return '\^C'
         'D'  -> return '\^D'
         'E'  -> return '\^E'
         'F'  -> return '\^F'
         'G'  -> return '\^G'
         'H'  -> return '\^H'
         'I'  -> return '\^I'
         'J'  -> return '\^J'
         'K'  -> return '\^K'
         'L'  -> return '\^L'
         'M'  -> return '\^M'
         'N'  -> return '\^N'
         'O'  -> return '\^O'
         'P'  -> return '\^P'
         'Q'  -> return '\^Q'
         'R'  -> return '\^R'
         'S'  -> return '\^S'
         'T'  -> return '\^T'
         'U'  -> return '\^U'
         'V'  -> return '\^V'
         'W'  -> return '\^W'
         'X'  -> return '\^X'
         'Y'  -> return '\^Y'
         'Z'  -> return '\^Z'
         '['  -> return '\^['
         '\\' -> return '\^\'
         ']'  -> return '\^]'
         '^'  -> return '\^^'
         '_'  -> return '\^_'
         _    -> pfail

  lexAscii =
    do choice
332
         [ (string "SOH" >> return '\SOH') <++
333
           (string "SO"  >> return '\SO')
Don Stewart's avatar
Don Stewart committed
334 335
                -- \SO and \SOH need maximal-munch treatment
                -- See the Haskell report Sect 2.6
336

337
         , string "NUL" >> return '\NUL'
338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370
         , string "STX" >> return '\STX'
         , string "ETX" >> return '\ETX'
         , string "EOT" >> return '\EOT'
         , string "ENQ" >> return '\ENQ'
         , string "ACK" >> return '\ACK'
         , string "BEL" >> return '\BEL'
         , string "BS"  >> return '\BS'
         , string "HT"  >> return '\HT'
         , string "LF"  >> return '\LF'
         , string "VT"  >> return '\VT'
         , string "FF"  >> return '\FF'
         , string "CR"  >> return '\CR'
         , string "SI"  >> return '\SI'
         , string "DLE" >> return '\DLE'
         , string "DC1" >> return '\DC1'
         , string "DC2" >> return '\DC2'
         , string "DC3" >> return '\DC3'
         , string "DC4" >> return '\DC4'
         , string "NAK" >> return '\NAK'
         , string "SYN" >> return '\SYN'
         , string "ETB" >> return '\ETB'
         , string "CAN" >> return '\CAN'
         , string "EM"  >> return '\EM'
         , string "SUB" >> return '\SUB'
         , string "ESC" >> return '\ESC'
         , string "FS"  >> return '\FS'
         , string "GS"  >> return '\GS'
         , string "RS"  >> return '\RS'
         , string "US"  >> return '\US'
         , string "SP"  >> return '\SP'
         , string "DEL" >> return '\DEL'
         ]

371 372

-- ---------------------------------------------------------------------------
373 374
-- string literal

375
lexString :: ReadP Lexeme
376
lexString =
377
  do _ <- char '"'
378 379 380 381 382 383
     body id
 where
  body f =
    do (c,esc) <- lexStrItem
       if c /= '"' || esc
         then body (f.(c:))
384
         else let s = f "" in
Don Stewart's avatar
Don Stewart committed
385
              return (String s)
386

387
  lexStrItem = (lexEmpty >> lexStrItem)
Don Stewart's avatar
Don Stewart committed
388
               +++ lexCharE
389

390
  lexEmpty =
391
    do _ <- char '\\'
392 393 394
       c <- get
       case c of
         '&'           -> do return ()
395
         _ | isSpace c -> do skipSpaces; _ <- char '\\'; return ()
396 397
         _             -> do pfail

398 399
-- ---------------------------------------------------------------------------
--  Lexing numbers
400 401 402 403

type Base   = Int
type Digits = [Int]

404
lexNumber :: ReadP Lexeme
405
lexNumber
Don Stewart's avatar
Don Stewart committed
406 407 408
  = lexHexOct  <++      -- First try for hex or octal 0x, 0o etc
                        -- If that fails, try for a decimal number
    lexDecNumber        -- Start with ordinary digits
409

410 411
lexHexOct :: ReadP Lexeme
lexHexOct
412
  = do  _ <- char '0'
Don Stewart's avatar
Don Stewart committed
413 414
        base <- lexBaseChar
        digits <- lexDigits base
415
        return (Number (MkNumber base digits))
416 417

lexBaseChar :: ReadP Int
418 419
-- Lex a single character indicating the base; fail if not there
lexBaseChar = do { c <- get;
Don Stewart's avatar
Don Stewart committed
420 421 422 423 424
                   case c of
                        'o' -> return 8
                        'O' -> return 8
                        'x' -> return 16
                        'X' -> return 16
425
                        _   -> pfail }
426 427 428 429 430 431

lexDecNumber :: ReadP Lexeme
lexDecNumber =
  do xs    <- lexDigits 10
     mFrac <- lexFrac <++ return Nothing
     mExp  <- lexExp  <++ return Nothing
432
     return (Number (MkDecimal xs mFrac mExp))
433 434 435 436

lexFrac :: ReadP (Maybe Digits)
-- Read the fractional part; fail if it doesn't
-- start ".d" where d is a digit
437
lexFrac = do _ <- char '.'
Ian Lynagh's avatar
Ian Lynagh committed
438 439
             fraction <- lexDigits 10
             return (Just fraction)
440 441

lexExp :: ReadP (Maybe Integer)
442
lexExp = do _ <- char 'e' +++ char 'E'
443
            exp <- signedExp +++ lexInteger 10
Don Stewart's avatar
Don Stewart committed
444
            return (Just exp)
445
 where
446
   signedExp
447 448 449
     = do c <- char '-' +++ char '+'
          n <- lexInteger 10
          return (if c == '-' then -n else n)
450 451

lexDigits :: Int -> ReadP Digits
452
-- Lex a non-empty sequence of digits in specified base
453 454 455 456 457 458 459
lexDigits base =
  do s  <- look
     xs <- scan s id
     guard (not (null xs))
     return xs
 where
  scan (c:cs) f = case valDig base c of
460
                    Just n  -> do _ <- get; scan cs (f.(n:))
461 462 463 464 465 466
                    Nothing -> do return (f [])
  scan []     f = do return (f [])

lexInteger :: Base -> ReadP Integer
lexInteger base =
  do xs <- lexDigits base
467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509
     return (val (fromIntegral base) xs)

val :: Num a => a -> Digits -> a
val = valSimple
{-# RULES
"val/Integer" val = valInteger
  #-}
{-# INLINE [1] val #-}

-- The following algorithm is only linear for types whose Num operations
-- are in constant time.
valSimple :: (Num a, Integral d) => a -> [d] -> a
valSimple base = go 0
  where
    go r [] = r
    go r (d : ds) = r' `seq` go r' ds
      where
        r' = r * base + fromIntegral d
{-# INLINE valSimple #-}

-- A sub-quadratic algorithm for Integer. Pairs of adjacent radix b
-- digits are combined into a single radix b^2 digit. This process is
-- repeated until we are left with a single digit. This algorithm
-- performs well only on large inputs, so we use the simple algorithm
-- for smaller inputs.
valInteger :: Integer -> Digits -> Integer
valInteger b0 ds0 = go b0 (length ds0) $ map fromIntegral ds0
  where
    go _ _ []  = 0
    go _ _ [d] = d
    go b l ds
        | l > 40 = b' `seq` go b' l' (combine b ds')
        | otherwise = valSimple b ds
      where
        -- ensure that we have an even number of digits
        -- before we call combine:
        ds' = if even l then ds else 0 : ds
        b' = b * b
        l' = (l + 1) `quot` 2
    combine b (d1 : d2 : ds) = d `seq` (d : combine b ds)
      where
        d = d1 * b + d2
    combine _ []  = []
Eric Seidel's avatar
Eric Seidel committed
510
    combine _ [_] = errorWithoutStackTrace "this should not happen"
511

512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527
-- Calculate a Rational from the exponent [of 10 to multiply with],
-- the integral part of the mantissa and the digits of the fractional
-- part. Leaving the calculation of the power of 10 until the end,
-- when we know the effective exponent, saves multiplications.
-- More importantly, this way we need at most one gcd instead of three.
--
-- frac was never used with anything but Integer and base 10, so
-- those are hardcoded now (trivial to change if necessary).
fracExp :: Integer -> Integer -> Digits -> Rational
fracExp exp mant []
  | exp < 0     = mant % (10 ^ (-exp))
  | otherwise   = fromInteger (mant * 10 ^ exp)
fracExp exp mant (d:ds) = exp' `seq` mant' `seq` fracExp exp' mant' ds
  where
    exp'  = exp - 1
    mant' = mant * 10 + fromIntegral d
528

Ian Lynagh's avatar
Ian Lynagh committed
529
valDig :: (Eq a, Num a) => a -> Char -> Maybe Int
530 531 532 533
valDig 8 c
  | '0' <= c && c <= '7' = Just (ord c - ord '0')
  | otherwise            = Nothing

534
valDig 10 c = valDecDig c
535 536 537 538 539 540 541

valDig 16 c
  | '0' <= c && c <= '9' = Just (ord c - ord '0')
  | 'a' <= c && c <= 'f' = Just (ord c - ord 'a' + 10)
  | 'A' <= c && c <= 'F' = Just (ord c - ord 'A' + 10)
  | otherwise            = Nothing

Eric Seidel's avatar
Eric Seidel committed
542
valDig _ _ = errorWithoutStackTrace "valDig: Bad base"
Ian Lynagh's avatar
Ian Lynagh committed
543 544

valDecDig :: Char -> Maybe Int
545 546 547 548
valDecDig c
  | '0' <= c && c <= '9' = Just (ord c - ord '0')
  | otherwise            = Nothing

549
-- ----------------------------------------------------------------------
550 551 552 553 554
-- other numeric lexing functions

readIntP :: Num a => a -> (Char -> Bool) -> (Char -> Int) -> ReadP a
readIntP base isDigit valDigit =
  do s <- munch1 isDigit
555 556 557
     return (val base (map valDigit s))
{-# SPECIALISE readIntP
        :: Integer -> (Char -> Bool) -> (Char -> Int) -> ReadP Integer #-}
558

Ian Lynagh's avatar
Ian Lynagh committed
559
readIntP' :: (Eq a, Num a) => a -> ReadP a
560 561 562 563
readIntP' base = readIntP base isDigit valDigit
 where
  isDigit  c = maybe False (const True) (valDig base c)
  valDigit c = maybe 0     id           (valDig base c)
564
{-# SPECIALISE readIntP' :: Integer -> ReadP Integer #-}
565

Ian Lynagh's avatar
Ian Lynagh committed
566
readOctP, readDecP, readHexP :: (Eq a, Num a) => ReadP a
567 568 569
readOctP = readIntP' 8
readDecP = readIntP' 10
readHexP = readIntP' 16
570 571 572
{-# SPECIALISE readOctP :: ReadP Integer #-}
{-# SPECIALISE readDecP :: ReadP Integer #-}
{-# SPECIALISE readHexP :: ReadP Integer #-}