Lexer.x 128 KB
Newer Older
1
-----------------------------------------------------------------------------
2
-- (c) The University of Glasgow, 2006
3
--
4
-- GHC's lexer for Haskell 2010 [1].
5
--
6 7
-- This is a combination of an Alex-generated lexer [2] from a regex
-- definition, with some hand-coded bits. [3]
8 9
--
-- Completely accurate information about token-spans within the source
Ian Lynagh's avatar
Ian Lynagh committed
10 11
-- file is maintained.  Every token has a start and end RealSrcLoc
-- attached to it.
12
--
13 14 15
-- References:
-- [1] https://www.haskell.org/onlinereport/haskell2010/haskellch2.html
-- [2] http://www.haskell.org/alex/
16
-- [3] https://gitlab.haskell.org/ghc/ghc/wikis/commentary/compiler/parser
17
--
18 19 20 21 22 23 24 25 26 27
-----------------------------------------------------------------------------

--   ToDo / known bugs:
--    - parsing integers is a bit slow
--    - readRational is a bit slow
--
--   Known bugs, that were also in the previous version:
--    - M... should be 3 tokens, not 1.
--    - pragma-end should be only valid in a pragma

28
--   qualified operator NOTES.
Ian Lynagh's avatar
Ian Lynagh committed
29
--
30 31 32 33 34 35 36 37 38
--   - If M.(+) is a single lexeme, then..
--     - Probably (+) should be a single lexeme too, for consistency.
--       Otherwise ( + ) would be a prefix operator, but M.( + ) would not be.
--     - But we have to rule out reserved operators, otherwise (..) becomes
--       a different lexeme.
--     - Should we therefore also rule out reserved operators in the qualified
--       form?  This is quite difficult to achieve.  We don't do it for
--       qualified varids.

39 40 41 42

-- -----------------------------------------------------------------------------
-- Alex "Haskell code fragment top"

43
{
44
{-# LANGUAGE CPP #-}
45
{-# LANGUAGE BangPatterns #-}
46
{-# LANGUAGE LambdaCase #-}
47
{-# LANGUAGE MultiWayIf #-}
48

49
{-# OPTIONS_GHC -funbox-strict-fields #-}
50
{-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-}
51

52
module Lexer (
53
   Token(..), lexer, lexerDbg, pragState, mkPState, mkPStatePure, PState(..),
Vladislav Zavialov's avatar
Vladislav Zavialov committed
54 55 56 57
   P(..), ParseResult(..), mkParserFlags, mkParserFlags', ParserFlags(..),
   appendWarning,
   appendError,
   allocateComments,
58
   MonadP(..),
59
   getRealSrcLoc, getPState, withThisPackage,
60 61
   failLocMsgP, srcParseFail,
   getErrorMessages, getMessages,
62
   popContext, pushModuleContext, setLastToken, setSrcLoc,
vivian's avatar
vivian committed
63
   activeContext, nextIsEOF,
simonpj@microsoft.com's avatar
simonpj@microsoft.com committed
64
   getLexState, popLexState, pushLexState,
65
   ExtBits(..),
Vladislav Zavialov's avatar
Vladislav Zavialov committed
66
   xtest,
Alan Zimmerman's avatar
Alan Zimmerman committed
67
   lexTokenStream,
Vladislav Zavialov's avatar
Vladislav Zavialov committed
68 69
   AddAnn(..),mkParensApiAnn,
   addAnnsAt,
70
   commentToAnnotation
71 72
  ) where

73 74
import GhcPrelude

rodlogic's avatar
rodlogic committed
75 76
-- base
import Control.Monad
77
import Control.Monad.Fail as MonadFail
rodlogic's avatar
rodlogic committed
78
import Data.Bits
79
import Data.Char
rodlogic's avatar
rodlogic committed
80 81 82 83
import Data.List
import Data.Maybe
import Data.Word

84 85
import EnumSet (EnumSet)
import qualified EnumSet
86

87 88 89
-- ghc-boot
import qualified GHC.LanguageExtensions as LangExt

rodlogic's avatar
rodlogic committed
90 91 92 93 94 95 96 97
-- bytestring
import Data.ByteString (ByteString)

-- containers
import Data.Map (Map)
import qualified Data.Map as Map

-- compiler/utils
98
import Bag
99 100 101 102
import Outputable
import StringBuffer
import FastString
import UniqFM
103
import Util             ( readRational, readHexRational )
rodlogic's avatar
rodlogic committed
104 105 106

-- compiler/main
import ErrUtils
107
import DynFlags
rodlogic's avatar
rodlogic committed
108 109 110

-- compiler/basicTypes
import SrcLoc
111
import Module
112 113
import BasicTypes     ( InlineSpec(..), RuleMatchInfo(..),
                        IntegralLit(..), FractionalLit(..),
114
                        SourceText(..) )
115

rodlogic's avatar
rodlogic committed
116 117
-- compiler/parser
import Ctype
Alan Zimmerman's avatar
Alan Zimmerman committed
118 119

import ApiAnnotation
120 121
}

122 123 124
-- -----------------------------------------------------------------------------
-- Alex "Character set macros"

125 126
-- NB: The logic behind these definitions is also reflected in basicTypes/Lexeme.hs
-- Any changes here should likely be reflected there.
127
$unispace    = \x05 -- Trick Alex into handling Unicode. See [Unicode in Alex].
128 129 130
$nl          = [\n\r\f]
$whitechar   = [$nl\v\ $unispace]
$white_no_nl = $whitechar # \n -- TODO #8424
131
$tab         = \t
132 133

$ascdigit  = 0-9
134
$unidigit  = \x03 -- Trick Alex into handling Unicode. See [Unicode in Alex].
135
$decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
136 137 138
$digit     = [$ascdigit $unidigit]

$special   = [\(\)\,\;\[\]\`\{\}]
139
$ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~\:]
140
$unisymbol = \x04 -- Trick Alex into handling Unicode. See [Unicode in Alex].
141
$symbol    = [$ascsymbol $unisymbol] # [$special \_\"\']
142

143
$unilarge  = \x01 -- Trick Alex into handling Unicode. See [Unicode in Alex].
144
$asclarge  = [A-Z]
145 146
$large     = [$asclarge $unilarge]

147
$unismall  = \x02 -- Trick Alex into handling Unicode. See [Unicode in Alex].
148
$ascsmall  = [a-z]
149 150
$small     = [$ascsmall $unismall \_]

151
$unigraphic = \x06 -- Trick Alex into handling Unicode. See [Unicode in Alex].
152
$graphic   = [$small $large $symbol $digit $special $unigraphic \"\']
153

154
$binit     = 0-1
Ian Lynagh's avatar
Ian Lynagh committed
155
$octit     = 0-7
156
$hexit     = [$decdigit A-F a-f]
157

158
$uniidchar = \x07 -- Trick Alex into handling Unicode. See [Unicode in Alex].
159
$idchar    = [$small $large $digit $uniidchar \']
160

161 162
$pragmachar = [$small $large $digit]

163 164
$docsym    = [\| \^ \* \$]

165

166 167 168 169 170 171 172 173
-- -----------------------------------------------------------------------------
-- Alex "Regular expression macros"

@varid     = $small $idchar*          -- variable identifiers
@conid     = $large $idchar*          -- constructor identifiers

@varsym    = ($symbol # \:) $symbol*  -- variable (operator) symbol
@consym    = \: $symbol*              -- constructor (operator) symbol
174

175 176 177 178 179 180 181 182
-- See Note [Lexing NumericUnderscores extension] and #14473
@numspc       = _*                   -- numeric spacer (#14473)
@decimal      = $decdigit(@numspc $decdigit)*
@binary       = $binit(@numspc $binit)*
@octal        = $octit(@numspc $octit)*
@hexadecimal  = $hexit(@numspc $hexit)*
@exponent     = @numspc [eE] [\-\+]? @decimal
@bin_exponent = @numspc [pP] [\-\+]? @decimal
183 184

@qual = (@conid \.)+
185 186 187 188
@qvarid = @qual @varid
@qconid = @qual @conid
@qvarsym = @qual @varsym
@qconsym = @qual @consym
189

190 191
@floating_point = @numspc @decimal \. @decimal @exponent? | @numspc @decimal @exponent
@hex_floating_point = @numspc @hexadecimal \. @hexadecimal @bin_exponent? | @numspc @hexadecimal @bin_exponent
192

193 194 195 196 197
-- normal signed numerical literals can only be explicitly negative,
-- not explicitly positive (contrast @exponent)
@negative = \-
@signed = @negative ?

198 199 200 201

-- -----------------------------------------------------------------------------
-- Alex "Identifier"

202 203
haskell :-

204 205 206 207 208

-- -----------------------------------------------------------------------------
-- Alex "Rules"

-- everywhere: skip whitespace
Ian Lynagh's avatar
Ian Lynagh committed
209
$white_no_nl+ ;
210
$tab          { warnTab }
211 212 213 214 215

-- Everywhere: deal with nested comments.  We explicitly rule out
-- pragmas, "{-#", so that we don't accidentally treat them as comments.
-- (this can happen even though pragmas will normally take precedence due to
-- longest-match, because pragmas aren't valid in every state, but comments
216 217 218 219
-- are). We also rule out nested Haddock comments, if the -haddock flag is
-- set.

"{-" / { isNormalComment } { nested_comment lexToken }
220 221 222 223

-- Single-line comments are a bit tricky.  Haskell 98 says that two or
-- more dashes followed by a symbol should be parsed as a varsym, so we
-- have to exclude those.
224 225

-- Since Haddock comments aren't valid in every state, we need to rule them
Ian Lynagh's avatar
Ian Lynagh committed
226
-- out here.
227 228 229 230 231 232 233

-- The following two rules match comments that begin with two dashes, but
-- continue with a different character. The rules test that this character
-- is not a symbol (in which case we'd have a varsym), and that it's not a
-- space followed by a Haddock comment symbol (docsym) (in which case we'd
-- have a Haddock comment). The rules then munch the rest of the line.

234
"-- " ~$docsym .* { lineCommentToken }
235
"--" [^$symbol \ ] .* { lineCommentToken }
236 237 238

-- Next, match Haddock comments if no -haddock flag

239
"-- " $docsym .* / { alexNotPred (ifExtension HaddockBit) } { lineCommentToken }
240 241 242 243 244 245 246

-- Now, when we've matched comments that begin with 2 dashes and continue
-- with a different character, we need to match comments that begin with three
-- or more dashes (which clearly can't be Haddock comments). We only need to
-- make sure that the first non-dash character isn't a symbol, and munch the
-- rest of the line.

247
"---"\-* ~$symbol .* { lineCommentToken }
248 249 250 251

-- Since the previous rules all match dashes followed by at least one
-- character, we also need to match a whole line filled with just dashes.

Jedai's avatar
Jedai committed
252
"--"\-* / { atEOL } { lineCommentToken }
253 254 255 256

-- We need this rule since none of the other single line comment rules
-- actually match this case.

Jedai's avatar
Jedai committed
257
"-- " / { atEOL } { lineCommentToken }
258 259 260 261 262 263 264 265 266 267 268

-- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
-- blank lines) until we find a non-whitespace character, then do layout
-- processing.
--
-- One slight wibble here: what if the line begins with {-#? In
-- theory, we have to lex the pragma to see if it's one we recognise,
-- and if it is, then we backtrack and do_bol, otherwise we treat it
-- as a nested comment.  We don't bother with this: if the line begins
-- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
<bol> {
Ian Lynagh's avatar
Ian Lynagh committed
269
  \n                                    ;
Adam Gundry's avatar
Adam Gundry committed
270 271
  ^\# line                              { begin line_prag1 }
  ^\# / { followedByDigit }             { begin line_prag1 }
Ian Lynagh's avatar
Ian Lynagh committed
272 273 274
  ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
  ^\# \! .* \n                          ; -- #!, for scripts
  ()                                    { do_bol }
275 276 277 278 279
}

-- after a layout keyword (let, where, do, of), we begin a new layout
-- context if the curly brace is missing.
-- Careful! This stuff is quite delicate.
280
<layout, layout_do, layout_if> {
Ian Lynagh's avatar
Ian Lynagh committed
281 282 283 284
  \{ / { notFollowedBy '-' }            { hopefully_open_brace }
        -- we might encounter {-# here, but {- has been handled already
  \n                                    ;
  ^\# (line)?                           { begin line_prag1 }
285 286
}

287 288
-- after an 'if', a vertical bar starts a layout context for MultiWayIf
<layout_if> {
289
  \| / { notFollowedBySymbol }          { new_layout_context True dontGenerateSemic ITvbar }
290 291 292
  ()                                    { pop }
}

293
-- do is treated in a subtly different way, see new_layout_context
294 295
<layout>    ()                          { new_layout_context True  generateSemic ITvocurly }
<layout_do> ()                          { new_layout_context False generateSemic ITvocurly }
296 297 298 299

-- after a new layout context which was found to be to the left of the
-- previous context, we have generated a '{' token, and we now need to
-- generate a matching '}' token.
Ian Lynagh's avatar
Ian Lynagh committed
300
<layout_left>  ()                       { do_layout_left }
301

Ian Lynagh's avatar
Ian Lynagh committed
302
<0,option_prags> \n                     { begin bol }
303

304 305
"{-#" $whitechar* $pragmachar+ / { known_pragma linePrags }
                                { dispatch_pragmas linePrags }
306 307 308

-- single-line line pragmas, of the form
--    # <line> "<file>" <extra-stuff> \n
309 310 311 312 313
<line_prag1> {
  @decimal $white_no_nl+ \" [$graphic \ ]* \"  { setLineAndFile line_prag1a }
  ()                                           { failLinePrag1 }
}
<line_prag1a> .*                               { popLinePrag1 }
314 315 316

-- Haskell-style line pragmas, of the form
--    {-# LINE <line> "<file>" #-}
317 318 319 320
<line_prag2> {
  @decimal $white_no_nl+ \" [$graphic \ ]* \"  { setLineAndFile line_prag2a }
}
<line_prag2a> "#-}"|"-}"                       { pop }
321 322
   -- NOTE: accept -} at the end of a LINE pragma, for compatibility
   -- with older versions of GHC which generated these.
323

Rufflewind's avatar
Rufflewind committed
324 325 326 327
-- Haskell-style column pragmas, of the form
--    {-# COLUMN <column> #-}
<column_prag> @decimal $whitechar* "#-}" { setColumn }

328
<0,option_prags> {
Ian Lynagh's avatar
Ian Lynagh committed
329
  "{-#" $whitechar* $pragmachar+
330 331 332 333 334
        $whitechar+ $pragmachar+ / { known_pragma twoWordPrags }
                                 { dispatch_pragmas twoWordPrags }

  "{-#" $whitechar* $pragmachar+ / { known_pragma oneWordPrags }
                                 { dispatch_pragmas oneWordPrags }
335

336
  -- We ignore all these pragmas, but don't generate a warning for them
337 338
  "{-#" $whitechar* $pragmachar+ / { known_pragma ignoredPrags }
                                 { dispatch_pragmas ignoredPrags }
339 340

  -- ToDo: should only be valid inside a pragma:
Ian Lynagh's avatar
Ian Lynagh committed
341
  "#-}"                          { endPrag }
342 343
}

344
<option_prags> {
345 346
  "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
                                   { dispatch_pragmas fileHeaderPrags }
David Waern's avatar
David Waern committed
347 348
}

349 350
<0> {
  -- In the "0" mode we ignore these pragmas
351
  "{-#"  $whitechar* $pragmachar+ / { known_pragma fileHeaderPrags }
352 353 354
                     { nested_comment lexToken }
}

355
<0,option_prags> {
356 357
  "{-#"  { warnThen Opt_WarnUnrecognisedPragmas (text "Unrecognised pragma")
                    (nested_comment lexToken) }
358 359
}

360 361
-- '0' state: ordinary lexemes

362 363
-- Haddock comments

364
<0,option_prags> {
365 366
  "-- " $docsym      / { ifExtension HaddockBit } { multiline_doc_comment }
  "{-" \ ? $docsym   / { ifExtension HaddockBit } { nested_doc_comment }
367 368
}

369 370
-- "special" symbols

371
<0> {
372 373 374 375 376 377 378 379 380 381 382
  "[|"        / { ifExtension ThQuotesBit } { token (ITopenExpQuote NoE NormalSyntax) }
  "[||"       / { ifExtension ThQuotesBit } { token (ITopenTExpQuote NoE) }
  "[e|"       / { ifExtension ThQuotesBit } { token (ITopenExpQuote HasE NormalSyntax) }
  "[e||"      / { ifExtension ThQuotesBit } { token (ITopenTExpQuote HasE) }
  "[p|"       / { ifExtension ThQuotesBit } { token ITopenPatQuote }
  "[d|"       / { ifExtension ThQuotesBit } { layout_token ITopenDecQuote }
  "[t|"       / { ifExtension ThQuotesBit } { token ITopenTypQuote }
  "|]"        / { ifExtension ThQuotesBit } { token (ITcloseQuote NormalSyntax) }
  "||]"       / { ifExtension ThQuotesBit } { token ITcloseTExpQuote }

  "[" @varid "|"  / { ifExtension QqBit }   { lex_quasiquote_tok }
383 384

  -- qualified quasi-quote (#5555)
385
  "[" @qvarid "|"  / { ifExtension QqBit }  { lex_qquasiquote_tok }
386 387 388

  $unigraphic -- ⟦
    / { ifCurrentChar '⟦' `alexAndPred`
389 390
        ifExtension UnicodeSyntaxBit `alexAndPred`
        ifExtension ThQuotesBit }
391 392 393
    { token (ITopenExpQuote NoE UnicodeSyntax) }
  $unigraphic -- ⟧
    / { ifCurrentChar '⟧' `alexAndPred`
394 395
        ifExtension UnicodeSyntaxBit `alexAndPred`
        ifExtension ThQuotesBit }
396
    { token (ITcloseQuote UnicodeSyntax) }
397 398
}

399
<0> {
400 401 402 403 404 405 406
  "(|"
    / { ifExtension ArrowsBit `alexAndPred`
        notFollowedBySymbol }
    { special (IToparenbar NormalSyntax) }
  "|)"
    / { ifExtension ArrowsBit }
    { special (ITcparenbar NormalSyntax) }
407 408 409

  $unigraphic -- ⦇
    / { ifCurrentChar '⦇' `alexAndPred`
410 411
        ifExtension UnicodeSyntaxBit `alexAndPred`
        ifExtension ArrowsBit }
412 413 414
    { special (IToparenbar UnicodeSyntax) }
  $unigraphic -- ⦈
    / { ifCurrentChar '⦈' `alexAndPred`
415 416
        ifExtension UnicodeSyntaxBit `alexAndPred`
        ifExtension ArrowsBit }
417
    { special (ITcparenbar UnicodeSyntax) }
418 419
}

420
<0> {
421
  \? @varid / { ifExtension IpBit } { skip_one_varid ITdupipvarid }
422 423
}

Adam Gundry's avatar
Adam Gundry committed
424
<0> {
425
  "#" @varid / { ifExtension OverloadedLabelsBit } { skip_one_varid ITlabelvarid }
Adam Gundry's avatar
Adam Gundry committed
426 427
}

428
<0> {
429 430
  "(#" / { ifExtension UnboxedTuplesBit `alexOrPred`
           ifExtension UnboxedSumsBit }
Ian Lynagh's avatar
Ian Lynagh committed
431
         { token IToubxparen }
432 433
  "#)" / { ifExtension UnboxedTuplesBit `alexOrPred`
           ifExtension UnboxedSumsBit }
Ian Lynagh's avatar
Ian Lynagh committed
434 435 436
         { token ITcubxparen }
}

437
<0,option_prags> {
Ian Lynagh's avatar
Ian Lynagh committed
438 439 440 441 442 443 444 445 446 447
  \(                                    { special IToparen }
  \)                                    { special ITcparen }
  \[                                    { special ITobrack }
  \]                                    { special ITcbrack }
  \,                                    { special ITcomma }
  \;                                    { special ITsemi }
  \`                                    { special ITbackquote }

  \{                                    { open_brace }
  \}                                    { close_brace }
448 449
}

450
<0,option_prags> {
451 452
  @qvarid                       { idtoken qvarid }
  @qconid                       { idtoken qconid }
Ian Lynagh's avatar
Ian Lynagh committed
453
  @varid                        { varid }
454
  @conid                        { idtoken conid }
455 456
}

457
<0> {
458 459 460 461
  @qvarid "#"+      / { ifExtension MagicHashBit } { idtoken qvarid }
  @qconid "#"+      / { ifExtension MagicHashBit } { idtoken qconid }
  @varid "#"+       / { ifExtension MagicHashBit } { varid }
  @conid "#"+       / { ifExtension MagicHashBit } { idtoken conid }
462 463
}

464 465 466 467 468 469 470 471 472
-- Operators classified into prefix, suffix, tight infix, and loose infix.
-- See Note [Whitespace-sensitive operator parsing]
<0> {
  @varsym / { precededByClosingToken `alexAndPred` followedByOpeningToken } { varsym_tight_infix }
  @varsym / { followedByOpeningToken }  { varsym_prefix }
  @varsym / { precededByClosingToken }  { varsym_suffix }
  @varsym                               { varsym_loose_infix }
}

473 474
-- ToDo: - move `var` and (sym) into lexical syntax?
--       - remove backquote from $special?
475
<0> {
476 477
  @qvarsym                                         { idtoken qvarsym }
  @qconsym                                         { idtoken qconsym }
478
  @consym                                          { consym }
479 480
}

481 482
-- For the normal boxed literals we need to be careful
-- when trying to be close to Haskell98
483 484 485 486 487 488 489 490 491 492

-- Note [Lexing NumericUnderscores extension] (#14473)
--
-- NumericUnderscores extension allows underscores in numeric literals.
-- Multiple underscores are represented with @numspc macro.
-- To be simpler, we have only the definitions with underscores.
-- And then we have a separate function (tok_integral and tok_frac)
-- that validates the literals.
-- If extensions are not enabled, check that there are no underscores.
--
493
<0> {
494
  -- Normal integral literals (:: Num a => a, from Integer)
495 496 497 498 499 500 501 502 503
  @decimal                                                                   { tok_num positive 0 0 decimal }
  0[bB] @numspc @binary                / { ifExtension BinaryLiteralsBit }   { tok_num positive 2 2 binary }
  0[oO] @numspc @octal                                                       { tok_num positive 2 2 octal }
  0[xX] @numspc @hexadecimal                                                 { tok_num positive 2 2 hexadecimal }
  @negative @decimal                   / { ifExtension NegativeLiteralsBit } { tok_num negative 1 1 decimal }
  @negative 0[bB] @numspc @binary      / { ifExtension NegativeLiteralsBit `alexAndPred`
                                           ifExtension BinaryLiteralsBit }   { tok_num negative 3 3 binary }
  @negative 0[oO] @numspc @octal       / { ifExtension NegativeLiteralsBit } { tok_num negative 3 3 octal }
  @negative 0[xX] @numspc @hexadecimal / { ifExtension NegativeLiteralsBit } { tok_num negative 3 3 hexadecimal }
504 505

  -- Normal rational literals (:: Fractional a => a, from Rational)
506 507 508 509 510 511
  @floating_point                                                            { tok_frac 0 tok_float }
  @negative @floating_point            / { ifExtension NegativeLiteralsBit } { tok_frac 0 tok_float }
  0[xX] @numspc @hex_floating_point    / { ifExtension HexFloatLiteralsBit } { tok_frac 0 tok_hex_float }
  @negative 0[xX] @numspc @hex_floating_point
                                       / { ifExtension HexFloatLiteralsBit `alexAndPred`
                                           ifExtension NegativeLiteralsBit } { tok_frac 0 tok_hex_float }
512 513
}

514
<0> {
Ian Lynagh's avatar
Ian Lynagh committed
515
  -- Unboxed ints (:: Int#) and words (:: Word#)
516 517
  -- It's simpler (and faster?) to give separate cases to the negatives,
  -- especially considering octal/hexadecimal prefixes.
518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534
  @decimal                          \# / { ifExtension MagicHashBit }        { tok_primint positive 0 1 decimal }
  0[bB] @numspc @binary             \# / { ifExtension MagicHashBit `alexAndPred`
                                           ifExtension BinaryLiteralsBit }   { tok_primint positive 2 3 binary }
  0[oO] @numspc @octal              \# / { ifExtension MagicHashBit }        { tok_primint positive 2 3 octal }
  0[xX] @numspc @hexadecimal        \# / { ifExtension MagicHashBit }        { tok_primint positive 2 3 hexadecimal }
  @negative @decimal                \# / { ifExtension MagicHashBit }        { tok_primint negative 1 2 decimal }
  @negative 0[bB] @numspc @binary   \# / { ifExtension MagicHashBit `alexAndPred`
                                           ifExtension BinaryLiteralsBit }   { tok_primint negative 3 4 binary }
  @negative 0[oO] @numspc @octal    \# / { ifExtension MagicHashBit }        { tok_primint negative 3 4 octal }
  @negative 0[xX] @numspc @hexadecimal \#
                                       / { ifExtension MagicHashBit }        { tok_primint negative 3 4 hexadecimal }

  @decimal                       \# \# / { ifExtension MagicHashBit }        { tok_primword 0 2 decimal }
  0[bB] @numspc @binary          \# \# / { ifExtension MagicHashBit `alexAndPred`
                                           ifExtension BinaryLiteralsBit }   { tok_primword 2 4 binary }
  0[oO] @numspc @octal           \# \# / { ifExtension MagicHashBit }        { tok_primword 2 4 octal }
  0[xX] @numspc @hexadecimal     \# \# / { ifExtension MagicHashBit }        { tok_primword 2 4 hexadecimal }
Ian Lynagh's avatar
Ian Lynagh committed
535

536 537
  -- Unboxed floats and doubles (:: Float#, :: Double#)
  -- prim_{float,double} work with signed literals
538 539
  @signed @floating_point           \# / { ifExtension MagicHashBit }        { tok_frac 1 tok_primfloat }
  @signed @floating_point        \# \# / { ifExtension MagicHashBit }        { tok_frac 2 tok_primdouble }
540 541 542 543 544 545
}

-- Strings and chars are lexed by hand-written code.  The reason is
-- that even if we recognise the string or char here in the regex
-- lexer, we would still have to parse the string afterward in order
-- to convert it to a String.
546
<0> {
Ian Lynagh's avatar
Ian Lynagh committed
547 548
  \'                            { lex_char_tok }
  \"                            { lex_string_tok }
549 550
}

551 552 553 554 555 556 557 558 559 560 561
-- Note [Whitespace-sensitive operator parsing]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-- In accord with GHC Proposal #229 https://github.com/ghc-proposals/ghc-proposals/blob/master/proposals/0229-whitespace-bang-patterns.rst
-- we classify operator occurrences into four categories:
--
--     a ! b   -- a loose infix occurrence
--     a!b     -- a tight infix occurrence
--     a !b    -- a prefix occurrence
--     a! b    -- a suffix occurrence
--
-- The rules are a bit more elaborate than simply checking for whitespace, in
Brian Wignall's avatar
Brian Wignall committed
562
-- order to accommodate the following use cases:
563 564 565 566 567 568 569 570 571 572 573 574 575 576
--
--     f (!a) = ...    -- prefix occurrence
--     g (a !)         -- loose infix occurrence
--     g (! a)         -- loose infix occurrence
--
-- The precise rules are as follows:
--
--  * Identifiers, literals, and opening brackets (, (#, [, [|, [||, [p|, [e|,
--    [t|, {, are considered "opening tokens". The function followedByOpeningToken
--    tests whether the next token is an opening token.
--
--  * Identifiers, literals, and closing brackets ), #), ], |], },
--    are considered "closing tokens". The function precededByClosingToken tests
--    whether the previous token is a closing token.
eir@cis.upenn.edu's avatar
eir@cis.upenn.edu committed
577
--
578 579
--  * Whitespace, comments, separators, and other tokens, are considered
--    neither opening nor closing.
eir@cis.upenn.edu's avatar
eir@cis.upenn.edu committed
580
--
581 582
--  * Any unqualified operator occurrence is classified as prefix, suffix, or
--    tight/loose infix, based on preceding and following tokens:
eir@cis.upenn.edu's avatar
eir@cis.upenn.edu committed
583
--
584 585 586 587 588 589 590
--       precededByClosingToken | followedByOpeningToken | Occurrence
--      ------------------------+------------------------+------------
--       False                  | True                   | prefix
--       True                   | False                  | suffix
--       True                   | True                   | tight infix
--       False                  | False                  | loose infix
--      ------------------------+------------------------+------------
eir@cis.upenn.edu's avatar
eir@cis.upenn.edu committed
591
--
592 593
-- A loose infix occurrence is always considered an operator. Other types of
-- occurrences may be assigned a special per-operator meaning override:
eir@cis.upenn.edu's avatar
eir@cis.upenn.edu committed
594
--
595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627
--   Operator |  Occurrence   | Token returned
--  ----------+---------------+------------------------------------------
--    !       |  prefix       | ITbang
--            |               |   strictness annotation or bang pattern,
--            |               |   e.g.  f !x = rhs, data T = MkT !a
--            |  not prefix   | ITvarsym "!"
--            |               |   ordinary operator or type operator,
--            |               |   e.g.  xs ! 3, (! x), Int ! Bool
--  ----------+---------------+------------------------------------------
--    ~       |  prefix       | ITtilde
--            |               |   laziness annotation or lazy pattern,
--            |               |   e.g.  f ~x = rhs, data T = MkT ~a
--            |  not prefix   | ITvarsym "~"
--            |               |   ordinary operator or type operator,
--            |               |   e.g.  xs ~ 3, (~ x), Int ~ Bool
--  ----------+---------------+------------------------------------------
--    $  $$   |  prefix       | ITdollar, ITdollardollar
--            |               |   untyped or typed Template Haskell splice,
--            |               |   e.g.  $(f x), $$(f x), $$"str"
--            |  not prefix   | ITvarsym "$", ITvarsym "$$"
--            |               |   ordinary operator or type operator,
--            |               |   e.g.  f $ g x, a $$ b
--  ----------+---------------+------------------------------------------
--    @       |  prefix       | ITtypeApp
--            |               |   type application, e.g.  fmap @Maybe
--            |  tight infix  | ITat
--            |               |   as-pattern, e.g.  f p@(a,b) = rhs
--            |  suffix       | parse error
--            |               |   e.g. f p@ x = rhs
--            |  loose infix  | ITvarsym "@"
--            |               |   ordinary operator or type operator,
--            |               |   e.g.  f @ g, (f @)
--  ----------+---------------+------------------------------------------
eir@cis.upenn.edu's avatar
eir@cis.upenn.edu committed
628
--
629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658
-- Also, some of these overrides are guarded behind language extensions.
-- According to the specification, we must determine the occurrence based on
-- surrounding *tokens* (see the proposal for the exact rules). However, in
-- the implementation we cheat a little and do the classification based on
-- characters, for reasons of both simplicity and efficiency (see
-- 'followedByOpeningToken' and 'precededByClosingToken')
--
-- When an operator is subject to a meaning override, it is mapped to special
-- token: ITbang, ITtilde, ITat, ITdollar, ITdollardollar. Otherwise, it is
-- returned as ITvarsym.
--
-- For example, this is how we process the (!):
--
--    precededByClosingToken | followedByOpeningToken | Token
--   ------------------------+------------------------+-------------
--    False                  | True                   | ITbang
--    True                   | False                  | ITvarsym "!"
--    True                   | True                   | ITvarsym "!"
--    False                  | False                  | ITvarsym "!"
--   ------------------------+------------------------+-------------
--
-- And this is how we process the (@):
--
--    precededByClosingToken | followedByOpeningToken | Token
--   ------------------------+------------------------+-------------
--    False                  | True                   | ITtypeApp
--    True                   | False                  | parse error
--    True                   | True                   | ITat
--    False                  | False                  | ITvarsym "@"
--   ------------------------+------------------------+-------------
659 660 661 662

-- -----------------------------------------------------------------------------
-- Alex "Haskell code fragment bottom"

663
{
664

665 666 667
-- -----------------------------------------------------------------------------
-- The token type

668
data Token
Ian Lynagh's avatar
Ian Lynagh committed
669
  = ITas                        -- Haskell keywords
670 671 672 673 674 675 676 677
  | ITcase
  | ITclass
  | ITdata
  | ITdefault
  | ITderiving
  | ITdo
  | ITelse
  | IThiding
678
  | ITforeign
679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694
  | ITif
  | ITimport
  | ITin
  | ITinfix
  | ITinfixl
  | ITinfixr
  | ITinstance
  | ITlet
  | ITmodule
  | ITnewtype
  | ITof
  | ITqualified
  | ITthen
  | ITtype
  | ITwhere

695
  | ITforall            IsUnicodeSyntax -- GHC extension keywords
696 697 698 699
  | ITexport
  | ITlabel
  | ITdynamic
  | ITsafe
700
  | ITinterruptible
701 702 703
  | ITunsafe
  | ITstdcallconv
  | ITccallconv
704
  | ITcapiconv
705
  | ITprimcallconv
thoughtpolice's avatar
thoughtpolice committed
706
  | ITjavascriptcallconv
707
  | ITmdo
708
  | ITfamily
709
  | ITrole
710 711 712
  | ITgroup
  | ITby
  | ITusing
cactus's avatar
cactus committed
713
  | ITpattern
714
  | ITstatic
Ryan Scott's avatar
Ryan Scott committed
715 716
  | ITstock
  | ITanyclass
Ryan Scott's avatar
Ryan Scott committed
717
  | ITvia
718

Edward Z. Yang's avatar
Edward Z. Yang committed
719 720 721 722 723 724
  -- Backpack tokens
  | ITunit
  | ITsignature
  | ITdependency
  | ITrequires

Alan Zimmerman's avatar
Alan Zimmerman committed
725 726 727 728 729 730 731 732
  -- Pragmas, see  note [Pragma source text] in BasicTypes
  | ITinline_prag       SourceText InlineSpec RuleMatchInfo
  | ITspec_prag         SourceText                -- SPECIALISE
  | ITspec_inline_prag  SourceText Bool    -- SPECIALISE INLINE (or NOINLINE)
  | ITsource_prag       SourceText
  | ITrules_prag        SourceText
  | ITwarning_prag      SourceText
  | ITdeprecated_prag   SourceText
733 734
  | ITline_prag         SourceText  -- not usually produced, see 'UsePosPragsBit'
  | ITcolumn_prag       SourceText  -- not usually produced, see 'UsePosPragsBit'
Alan Zimmerman's avatar
Alan Zimmerman committed
735 736 737 738 739 740
  | ITscc_prag          SourceText
  | ITgenerated_prag    SourceText
  | ITcore_prag         SourceText         -- hdaume: core annotations
  | ITunpack_prag       SourceText
  | ITnounpack_prag     SourceText
  | ITann_prag          SourceText
741
  | ITcomplete_prag     SourceText
742
  | ITclose_prag
743 744 745
  | IToptions_prag String
  | ITinclude_prag String
  | ITlanguage_prag
Alan Zimmerman's avatar
Alan Zimmerman committed
746 747 748 749 750 751
  | ITminimal_prag      SourceText
  | IToverlappable_prag SourceText  -- instance overlap mode
  | IToverlapping_prag  SourceText  -- instance overlap mode
  | IToverlaps_prag     SourceText  -- instance overlap mode
  | ITincoherent_prag   SourceText  -- instance overlap mode
  | ITctype             SourceText
752
  | ITcomment_line_prag         -- See Note [Nested comment line pragmas]
753

Ian Lynagh's avatar
Ian Lynagh committed
754
  | ITdotdot                    -- reserved symbols
755
  | ITcolon
756
  | ITdcolon            IsUnicodeSyntax
757 758
  | ITequal
  | ITlam
759
  | ITlcase
760
  | ITvbar
761 762 763
  | ITlarrow            IsUnicodeSyntax
  | ITrarrow            IsUnicodeSyntax
  | ITdarrow            IsUnicodeSyntax
764
  | ITminus
765 766 767 768
  | ITbang     -- Prefix (!) only, e.g. f !x = rhs
  | ITtilde    -- Prefix (~) only, e.g. f ~x = rhs
  | ITat       -- Tight infix (@) only, e.g. f x@pat = rhs
  | ITtypeApp  -- Prefix (@) only, e.g. f @t
769
  | ITstar              IsUnicodeSyntax
770 771
  | ITdot

Ian Lynagh's avatar
Ian Lynagh committed
772
  | ITbiglam                    -- GHC-extension symbols
773

Ian Lynagh's avatar
Ian Lynagh committed
774
  | ITocurly                    -- special symbols
775 776 777 778
  | ITccurly
  | ITvocurly
  | ITvccurly
  | ITobrack
Ian Lynagh's avatar
Ian Lynagh committed
779 780
  | ITopabrack                  -- [:, for parallel arrays with -XParallelArrays
  | ITcpabrack                  -- :], for parallel arrays with -XParallelArrays
781 782 783 784 785 786 787 788 789
  | ITcbrack
  | IToparen
  | ITcparen
  | IToubxparen
  | ITcubxparen
  | ITsemi
  | ITcomma
  | ITunderscore
  | ITbackquote
dreixel's avatar
dreixel committed
790
  | ITsimpleQuote               --  '
791

Ian Lynagh's avatar
Ian Lynagh committed
792
  | ITvarid   FastString        -- identifiers
793 794 795 796 797 798 799 800
  | ITconid   FastString
  | ITvarsym  FastString
  | ITconsym  FastString
  | ITqvarid  (FastString,FastString)
  | ITqconid  (FastString,FastString)
  | ITqvarsym (FastString,FastString)
  | ITqconsym (FastString,FastString)

Ian Lynagh's avatar
Ian Lynagh committed
801
  | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
Adam Gundry's avatar
Adam Gundry committed
802
  | ITlabelvarid   FastString   -- Overloaded label: #x
803

804 805
  | ITchar     SourceText Char       -- Note [Literal source text] in BasicTypes
  | ITstring   SourceText FastString -- Note [Literal source text] in BasicTypes
806
  | ITinteger  IntegralLit           -- Note [Literal source text] in BasicTypes
Alan Zimmerman's avatar
Alan Zimmerman committed
807
  | ITrational FractionalLit
808

809 810 811 812
  | ITprimchar   SourceText Char     -- Note [Literal source text] in BasicTypes
  | ITprimstring SourceText ByteString -- Note [Literal source text] @BasicTypes
  | ITprimint    SourceText Integer  -- Note [Literal source text] in BasicTypes
  | ITprimword   SourceText Integer  -- Note [Literal source text] in BasicTypes
813 814
  | ITprimfloat  FractionalLit
  | ITprimdouble FractionalLit
815

816
  -- Template Haskell extension tokens
817 818 819 820 821 822 823
  | ITopenExpQuote HasE IsUnicodeSyntax --  [| or [e|
  | ITopenPatQuote                      --  [p|
  | ITopenDecQuote                      --  [d|
  | ITopenTypQuote                      --  [t|
  | ITcloseQuote IsUnicodeSyntax        --  |]
  | ITopenTExpQuote HasE                --  [|| or [e||
  | ITcloseTExpQuote                    --  ||]
824 825
  | ITdollar                            --  prefix $
  | ITdollardollar                      --  prefix $$
826
  | ITtyQuote                           --  ''
827 828 829 830 831 832 833 834
  | ITquasiQuote (FastString,FastString,RealSrcSpan)
    -- ITquasiQuote(quoter, quote, loc)
    -- represents a quasi-quote of the form
    -- [quoter| quote |]
  | ITqQuasiQuote (FastString,FastString,FastString,RealSrcSpan)
    -- ITqQuasiQuote(Qual, quoter, quote, loc)
    -- represents a qualified quasi-quote of the form
    -- [Qual.quoter| quote |]
835 836 837 838

  -- Arrow notation extension
  | ITproc
  | ITrec
839 840 841 842 843 844 845 846 847
  | IToparenbar  IsUnicodeSyntax -- ^ @(|@
  | ITcparenbar  IsUnicodeSyntax -- ^ @|)@
  | ITlarrowtail IsUnicodeSyntax -- ^ @-<@
  | ITrarrowtail IsUnicodeSyntax -- ^ @>-@
  | ITLarrowtail IsUnicodeSyntax -- ^ @-<<@
  | ITRarrowtail IsUnicodeSyntax -- ^ @>>-@

  | ITunknown String             -- ^ Used when the lexer can't make sense of it
  | ITeof                        -- ^ end of file token
848 849

  -- Documentation annotations
850 851 852 853 854 855 856
  | ITdocCommentNext  String     -- ^ something beginning @-- |@
  | ITdocCommentPrev  String     -- ^ something beginning @-- ^@
  | ITdocCommentNamed String     -- ^ something beginning @-- $@
  | ITdocSection      Int String -- ^ a section heading
  | ITdocOptions      String     -- ^ doc options (prune, ignore-exports, etc)
  | ITlineComment     String     -- ^ comment starting by "--"
  | ITblockComment    String     -- ^ comment in {- -}
857

858
  deriving Show
859

Alan Zimmerman's avatar
Alan Zimmerman committed
860 861 862
instance Outputable Token where
  ppr x = text (show x)

Alan Zimmerman's avatar
Alan Zimmerman committed
863

864 865 866 867 868 869 870
-- the bitmap provided as the third component indicates whether the
-- corresponding extension keyword is valid under the extension options
-- provided to the compiler; if the extension corresponding to *any* of the
-- bits set in the bitmap is enabled, the keyword is valid (this setup
-- facilitates using a keyword in two different extensions that can be
-- activated independently)
--
871
reservedWordsFM :: UniqFM (Token, ExtsBitmap)
872
reservedWordsFM = listToUFM $
Ian Lynagh's avatar
Ian Lynagh committed
873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899
    map (\(x, y, z) -> (mkFastString x, (y, z)))
        [( "_",              ITunderscore,    0 ),
         ( "as",             ITas,            0 ),
         ( "case",           ITcase,          0 ),
         ( "class",          ITclass,         0 ),
         ( "data",           ITdata,          0 ),
         ( "default",        ITdefault,       0 ),
         ( "deriving",       ITderiving,      0 ),
         ( "do",             ITdo,            0 ),
         ( "else",           ITelse,          0 ),
         ( "hiding",         IThiding,        0 ),
         ( "if",             ITif,            0 ),
         ( "import",         ITimport,        0 ),
         ( "in",             ITin,            0 ),
         ( "infix",          ITinfix,         0 ),
         ( "infixl",         ITinfixl,        0 ),
         ( "infixr",         ITinfixr,        0 ),
         ( "instance",       ITinstance,      0 ),
         ( "let",            ITlet,           0 ),
         ( "module",         ITmodule,        0 ),
         ( "newtype",        ITnewtype,       0 ),
         ( "of",             ITof,            0 ),
         ( "qualified",      ITqualified,     0 ),
         ( "then",           ITthen,          0 ),
         ( "type",           ITtype,          0 ),
         ( "where",          ITwhere,         0 ),

900
         ( "forall",         ITforall NormalSyntax, 0),
901
         ( "mdo",            ITmdo,           xbit RecursiveDoBit),
902 903 904
             -- See Note [Lexing type pseudo-keywords]
         ( "family",         ITfamily,        0 ),
         ( "role",           ITrole,          0 ),
905
         ( "pattern",        ITpattern,       xbit PatternSynonymsBit),
906
         ( "static",         ITstatic,        xbit StaticPointersBit ),
Ryan Scott's avatar
Ryan Scott committed
907 908
         ( "stock",          ITstock,         0 ),
         ( "anyclass",       ITanyclass,      0 ),
Ryan Scott's avatar
Ryan Scott committed
909
         ( "via",            ITvia,           0 ),
910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927
         ( "group",          ITgroup,         xbit TransformComprehensionsBit),
         ( "by",             ITby,            xbit TransformComprehensionsBit),
         ( "using",          ITusing,         xbit TransformComprehensionsBit),

         ( "foreign",        ITforeign,       xbit FfiBit),
         ( "export",         ITexport,        xbit FfiBit),
         ( "label",          ITlabel,         xbit FfiBit),
         ( "dynamic",        ITdynamic,       xbit FfiBit),
         ( "safe",           ITsafe,          xbit FfiBit .|.
                                              xbit SafeHaskellBit),
         ( "interruptible",  ITinterruptible, xbit InterruptibleFfiBit),
         ( "unsafe",         ITunsafe,        xbit FfiBit),
         ( "stdcall",        ITstdcallconv,   xbit FfiBit),
         ( "ccall",          ITccallconv,     xbit FfiBit),
         ( "capi",           ITcapiconv,      xbit CApiFfiBit),
         ( "prim",           ITprimcallconv,  xbit FfiBit),
         ( "javascript",     ITjavascriptcallconv, xbit FfiBit),

Edward Z. Yang's avatar
Edward Z. Yang committed
928 929 930 931
         ( "unit",           ITunit,          0 ),
         ( "dependency",     ITdependency,       0 ),
         ( "signature",      ITsignature,     0 ),

932 933 934
         ( "rec",            ITrec,           xbit ArrowsBit .|.
                                              xbit RecursiveDoBit),
         ( "proc",           ITproc,          xbit ArrowsBit)
935 936
     ]

937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952
{-----------------------------------
Note [Lexing type pseudo-keywords]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

One might think that we wish to treat 'family' and 'role' as regular old
varids whenever -XTypeFamilies and -XRoleAnnotations are off, respectively.
But, there is no need to do so. These pseudo-keywords are not stolen syntax:
they are only used after the keyword 'type' at the top-level, where varids are
not allowed. Furthermore, checks further downstream (TcTyClsDecls) ensure that
type families and role annotations are never declared without their extensions
on. In fact, by unconditionally lexing these pseudo-keywords as special, we
can get better error messages.

Also, note that these are included in the `varid` production in the parser --
a key detail to make all this work.
-------------------------------------}
953

954
reservedSymsFM :: UniqFM (Token, IsUnicodeSyntax, ExtsBitmap)
955
reservedSymsFM = listToUFM $
956 957
    map (\ (x,w,y,z) -> (mkFastString x,(w,y,z)))
      [ ("..",  ITdotdot,                   NormalSyntax,  0 )
Ian Lynagh's avatar
Ian Lynagh committed
958
        -- (:) is a reserved op, meaning only list cons
959 960 961 962 963 964 965 966 967 968 969
       ,(":",   ITcolon,                    NormalSyntax,  0 )
       ,("::",  ITdcolon NormalSyntax,      NormalSyntax,  0 )
       ,("=",   ITequal,                    NormalSyntax,  0 )
       ,("\\",  ITlam,                      NormalSyntax,  0 )
       ,("|",   ITvbar,                     NormalSyntax,  0 )
       ,("<-",  ITlarrow NormalSyntax,      NormalSyntax,  0 )
       ,("->",  ITrarrow NormalSyntax,      NormalSyntax,  0 )
       ,("=>",  ITdarrow NormalSyntax,      NormalSyntax,  0 )
       ,("-",   ITminus,                    NormalSyntax,  0 )

       ,("*",   ITstar NormalSyntax,        NormalSyntax,  xbit StarIsTypeBit)
970

Ian Lynagh's avatar
Ian Lynagh committed
971
        -- For 'forall a . t'
972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990
       ,(".",   ITdot,                      NormalSyntax,  0 )

       ,("-<",  ITlarrowtail NormalSyntax,  NormalSyntax,  xbit ArrowsBit)
       ,(">-",  ITrarrowtail NormalSyntax,  NormalSyntax,  xbit ArrowsBit)
       ,("-<<", ITLarrowtail NormalSyntax,  NormalSyntax,  xbit ArrowsBit)
       ,(">>-", ITRarrowtail NormalSyntax,  NormalSyntax,  xbit ArrowsBit)

       ,("∷",   ITdcolon UnicodeSyntax,     UnicodeSyntax, 0 )
       ,("⇒",   ITdarrow UnicodeSyntax,     UnicodeSyntax, 0 )
       ,("∀",   ITforall UnicodeSyntax,     UnicodeSyntax, 0 )
       ,("→",   ITrarrow UnicodeSyntax,     UnicodeSyntax, 0 )
       ,("←",   ITlarrow UnicodeSyntax,     UnicodeSyntax, 0 )

       ,("⤙",   ITlarrowtail UnicodeSyntax, UnicodeSyntax, xbit ArrowsBit)
       ,("⤚",   ITrarrowtail UnicodeSyntax, UnicodeSyntax, xbit ArrowsBit)
       ,("⤛",   ITLarrowtail UnicodeSyntax, UnicodeSyntax, xbit ArrowsBit)
       ,("⤜",   ITRarrowtail UnicodeSyntax, UnicodeSyntax, xbit ArrowsBit)

       ,("★",   ITstar UnicodeSyntax,       UnicodeSyntax, xbit StarIsTypeBit)
991

Simon Marlow's avatar
Simon Marlow committed
992 993 994
        -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
        -- form part of a large operator.  This would let us have a better
        -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
995 996 997 998 999
       ]

-- -----------------------------------------------------------------------------
-- Lexer actions

Ian Lynagh's avatar
Ian Lynagh committed
1000
type Action = RealSrcSpan -> StringBuffer -> Int -> P (RealLocated Token)
1001

1002
special :: Token -> Action
twanvl's avatar
twanvl committed
1003
special tok span _buf _len = return (L span tok)
1004

1005
token, layout_token :: Token -> Action
twanvl's avatar
twanvl committed
1006 1007
token t span _buf _len = return (L span t)
layout_token t span _buf _len = pushLexState layout >> return (L span t)
1008

1009 1010
idtoken :: (StringBuffer -> Int -> Token) -> Action
idtoken f span buf len = return (L span $! (f buf len))
1011

1012
skip_one_varid :: (FastString -> Token) -> Action
Ian Lynagh's avatar
Ian Lynagh committed
1013
skip_one_varid f span buf len
1014
  = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
1015

1016 1017 1018 1019
skip_two_varid :: (FastString -> Token) -> Action
skip_two_varid f span buf len
  = return (L span $! f (lexemeToFastString (stepOn (stepOn buf)) (len-2)))

1020
strtoken :: (String -> Token) -> Action
Ian Lynagh's avatar
Ian Lynagh committed
1021
strtoken f span buf len =
1022
  return (L span $! (f $! lexemeToString buf len))
1023 1024

begin :: Int -> Action
1025
begin code _span _str _len = do pushLexState code; lexToken
1026 1027

pop :: Action
Ian Lynagh's avatar
Ian Lynagh committed
1028 1029
pop _span _buf _len = do _ <- popLexState
                         lexToken
1030 1031 1032
-- See Note [Nested comment line pragmas]
failLinePrag1 :: Action
failLinePrag1 span _buf _len = do
1033
  b <- getBit InNestedCommentBit
1034 1035 1036 1037 1038 1039
  if b then return (L span ITcomment_line_prag)
       else lexError "lexical error in pragma"

-- See Note [Nested comment line pragmas]
popLinePrag1 :: Action
popLinePrag1 span _buf _len = do
1040
  b <- getBit InNestedCommentBit
1041 1042 1043
  if b then return (L span ITcomment_line_prag) else do
    _ <- popLexState
    lexToken
1044

1045 1046
hopefully_open_brace :: Action
hopefully_open_brace span buf len
1047
 = do relaxed <- getBit RelaxedLayoutBit
1048 1049 1050 1051 1052
      ctx <- getContext
      (AI l _) <- getInput
      let offset = srcLocCol l
          isOK = relaxed ||
                 case ctx of
1053 1054
                 Layout prev_off _ : _ -> prev_off < offset
                 _                     -> True
1055
      if isOK then pop_and open_brace span buf len
1056
              else addFatalError (RealSrcSpan span) (text "Missing block")
1057

1058
pop_and :: Action -> Action
Ian Lynagh's avatar
Ian Lynagh committed
1059 1060
pop_and act span buf len = do _ <- popLexState
                              act span buf len
1061

1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087
-- See Note [Whitespace-sensitive operator parsing]
followedByOpeningToken :: AlexAccPred ExtsBitmap
followedByOpeningToken _ _ _ (AI _ buf)
  | atEnd buf = False
  | otherwise =
      case nextChar buf of
        ('{', buf') -> nextCharIsNot buf' (== '-')
        ('(', _) -> True
        ('[', _) -> True
        ('\"', _) -> True
        ('\'', _) -> True
        ('_', _) -> True
        (c, _) -> isAlphaNum c

-- See Note [Whitespace-sensitive operator parsing]
precededByClosingToken :: AlexAccPred ExtsBitmap
precededByClosingToken _ (AI _ buf) _ _ =
  case prevChar buf '\n' of
    '}' -> decodePrevNChars 1 buf /= "-"
    ')' -> True
    ']' -> True
    '\"' -> True
    '\'' -> True
    '_' -> True
    c -> isAlphaNum c

1088
{-# INLINE nextCharIs #-}
Ian Lynagh's avatar
Ian Lynagh committed
1089
nextCharIs :: StringBuffer -> (Char -> Bool) -> Bool
1090 1091
nextCharIs buf p = not (atEnd buf) && p (currentChar buf)

1092 1093 1094 1095
{-# INLINE nextCharIsNot #-}
nextCharIsNot :: StringBuffer -> (Char -> Bool) -> Bool
nextCharIsNot buf p = not (nextCharIs buf p)

1096
notFollowedBy :: Char -> AlexAccPred ExtsBitmap
Ian Lynagh's avatar
Ian Lynagh committed
1097
notFollowedBy char _ _ _ (AI _ buf)
1098
  = nextCharIsNot buf (== char)
1099

1100
notFollowedBySymbol :: AlexAccPred ExtsBitmap
1101
notFollowedBySymbol _ _ _ (AI _ buf)
1102
  = nextCharIsNot buf (`elem` "!#$%&*+./<=>?@\\^|-~")
1103

Adam Gundry's avatar
Adam Gundry committed
1104 1105 1106 1107
followedByDigit :: AlexAccPred ExtsBitmap
followedByDigit _ _ _ (AI _ buf)
  = afterOptionalSpace buf (\b -> nextCharIs b (`elem` ['0'..'9']))

1108 1109 1110 1111
ifCurrentChar :: Char -> AlexAccPred ExtsBitmap
ifCurrentChar char _ (AI _ buf) _ _
  = nextCharIs buf (== char)

1112 1113 1114 1115 1116
-- We must reject doc comments as being ordinary comments everywhere.
-- In some cases the doc comment will be selected as the lexeme due to
-- maximal munch, but not always, because the nested comment rule is
-- valid in all states, but the doc-comment rules are only valid in
-- the non-layout states.
1117
isNormalComment :: AlexAccPred ExtsBitmap
1118
isNormalComment bits _ _ (AI _ buf)
1119 1120
  | HaddockBit `xtest` bits = notFollowedByDocOrPragma
  | otherwise               = nextCharIsNot buf (== '#')
1121 1122
  where
    notFollowedByDocOrPragma
1123
       = afterOptionalSpace buf (\b -> nextCharIsNot b (`elem` "|^*$#"))
1124

1125 1126 1127 1128 1129
afterOptionalSpace :: StringBuffer -> (StringBuffer -> Bool) -> Bool
afterOptionalSpace buf p
    = if nextCharIs buf (== ' ')
      then p (snd (nextChar buf))
      else p buf
1130

1131
atEOL :: AlexAccPred ExtsBitmap
1132
atEOL _ _ _ (AI _ buf) = atEnd buf || currentChar buf == '\n'
1133

1134 1135 1136 1137 1138
ifExtension :: ExtBits -> AlexAccPred ExtsBitmap
ifExtension extBits bits _ _ _ = extBits `xtest` bits

alexNotPred p userState in1 len in2
  = not (p userState in1 len in2)
1139

1140 1141
alexOrPred p1 p2 userState in1 len in2
  = p1 userState in1 len in2 || p2 userState in1 len in2
1142

1143 1144 1145
multiline_doc_comment :: Action
multiline_doc_comment span buf _len = withLexedDocType (worker "")
  where
1146
    worker commentAcc input docType checkNextLine = case alexGetChar' input of
Ian Lynagh's avatar
Ian Lynagh committed
1147
      Just ('\n', input')
1148 1149
        | checkNextLine -> case checkIfCommentLine input' of
          Just input -> worker ('\n':commentAcc) input docType checkNextLine
1150
          Nothing -> docCommentEnd input commentAcc docType buf span
1151 1152
        | otherwise -> docCommentEnd input commentAcc docType buf span
      Just (c, input) -> worker (c:commentAcc) input docType checkNextLine
1153
      Nothing -> docCommentEnd input commentAcc docType buf span
Ian Lynagh's avatar
Ian Lynagh committed
1154

1155 1156 1157 1158 1159 1160 1161 1162
    -- Check if the next line of input belongs to this doc comment as well.
    -- A doc comment continues onto the next line when the following
    -- conditions are met:
    --   * The line starts with "--"
    --   * The line doesn't start with "---".
    --   * The line doesn't start with "-- $", because that would be the
    --     start of a /new/ named haddock chunk (#10398).
    checkIfCommentLine :: AlexInput -> Maybe AlexInput
1163 1164
    checkIfCommentLine input = check (dropNonNewlineSpace input)
      where
1165 1166 1167 1168 1169 1170 1171 1172 1173 1174
        check input = do
          ('-', input) <- alexGetChar' input
          ('-', input) <- alexGetChar' input
          (c, after_c) <- alexGetChar' input
          case c of
            '-' -> Nothing
            ' ' -> case alexGetChar' after_c of
                     Just ('$', _) -> Nothing
                     _ -> Just input
            _   -> Just input
1175

1176
        dropNonNewlineSpace input = case alexGetChar' input of
Ian Lynagh's avatar
Ian Lynagh committed
1177
          Just (c, input')
1178 1179 1180 1181
            | isSpace c && c /= '\n' -> dropNonNewlineSpace input'
            | otherwise -> input
          Nothing -> input

Jedai's avatar
Jedai committed
1182 1183
lineCommentToken :: Action
lineCommentToken span buf len = do
1184
  b <- getBit RawTokenStreamBit
Jedai's avatar
Jedai committed
1185 1186
  if b then strtoken ITlineComment span buf len else lexToken

1187 1188 1189 1190
{-
  nested comments require traversing by hand, they can't be parsed
  using regular expressions.
-}
Ian Lynagh's avatar
Ian Lynagh committed
1191
nested_comment :: P (RealLocated Token) -> Action
Alan Zimmerman's avatar
Alan Zimmerman committed
1192
nested_comment cont span buf len = do
1193
  input <- getInput
1194
  go (reverse $ lexemeToString buf len) (1::Int) input
1195
  where
Alan Zimmerman's avatar
Alan Zimmerman committed
1196 1197
    go commentAcc 0 input = do
      setInput input
1198
      b <- getBit RawTokenStreamBit
Alan Zimmerman's avatar
Alan Zimmerman committed
1199 1200 1201
      if b
        then docCommentEnd input commentAcc ITblockComment buf span
        else cont
1202
    go commentAcc n input = case alexGetChar' input of
1203
      Nothing -> errBrace input span
1204
      Just ('-',input) -> case alexGetChar' input of
1205
        Nothing  -> errBrace input span
1206
        Just ('\125',input) -> go ('\125':'-':commentAcc) (n-1) input -- '}'
Jedai's avatar
Jedai committed
1207
        Just (_,_)          -> go ('-':commentAcc) n input
1208
      Just ('\123',input) -> case alexGetChar' input of  -- '{' char
1209
        Nothing  -> errBrace input span
Jedai's avatar
Jedai committed
1210 1211
        Just ('-',input) -> go ('-':'\123':commentAcc) (n+1) input
        Just (_,_)       -> go ('\123':commentAcc) n input
1212 1213 1214 1215 1216 1217
      -- See Note [Nested comment line pragmas]
      Just ('\n',input) -> case alexGetChar' input of
        Nothing  -> errBrace input span
        Just ('#',_) -> do (parsedAcc,input) <- parseNestedPragma input
                           go (parsedAcc ++ '\n':commentAcc) n input
        Just (_,_)   -> go ('\n':commentAcc) n input
Jedai's avatar
Jedai committed
1218
      Just (c,input) -> go (c:commentAcc) n input
1219 1220 1221 1222

nested_doc_comment :: Action
nested_doc_comment span buf _len = withLexedDocType (go "")
  where
1223
    go commentAcc input docType _ = case alexGetChar' input of
1224
      Nothing -> errBrace input span
1225
      Just ('-',input) -> case alexGetChar' input of
1226
        Nothing -> errBrace input span
twanvl's avatar
twanvl committed
1227
        Just ('\125',input) ->
1228
          docCommentEnd input commentAcc docType buf span
twanvl's avatar
twanvl committed
1229
        Just (_,_) -> go ('-':commentAcc) input docType False
1230
      Just ('\123', input) -> case alexGetChar' input of
1231 1232 1233 1234 1235
        Nothing  -> errBrace input span
        Just ('-',input) -> do
          setInput input
          let cont = do input <- getInput; go commentAcc input docType False
          nested_comment cont span buf _len
twanvl's avatar
twanvl committed
1236
        Just (_,_) -> go ('\123':commentAcc) input docType False
1237 1238 1239 1240 1241 1242
      -- See Note [Nested comment line pragmas]
      Just ('\n',input) -> case alexGetChar' input of
        Nothing  -> errBrace input span
        Just ('#',_) -> do (parsedAcc,input) <- parseNestedPragma input
                           go (parsedAcc ++ '\n':commentAcc) input docType False
        Just (_,_)   -> go ('\n':commentAcc) input docType False
1243 1244
      Just (c,input) -> go (c:commentAcc) input docType False

1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256
-- See Note [Nested comment line pragmas]
parseNestedPragma :: AlexInput -> P (String,AlexInput)
parseNestedPragma input@(AI _ buf) = do
  origInput <- getInput
  setInput input
  setExts (.|. xbit InNestedCommentBit)
  pushLexState bol
  lt <- lexToken
  _ <- popLexState
  setExts (.&. complement (xbit InNestedCommentBit))
  postInput@(AI _ postBuf) <- getInput
  setInput origInput