From 64c9898f7b5239435f131f5444d62bda23dfc9ef Mon Sep 17 00:00:00 2001 From: Thomas Miedema Date: Mon, 1 Sep 2014 15:13:00 -0500 Subject: [PATCH] Make Lexer.x more like the 2010 report Summary: I tried reading the lexer and the 2010 report side-by-side. Althought I didn't quite finish, here are some small discrepancies that I found. This revision may be low priority for reviewers, but having these commits just in my local repository does no good either. Changes: * $nl was defined, but not used anywhere * formfeed is a newline character * add \: to $ascsymbol For simplification reason, the colon (':') was added to the character set $ascsymbol in the 2010 report. Here we make the same change. * introduce the macros `qvarid`, `qconid`, `qvarsym` and `qconsym` * foreign is a Haskell keyword * add/update comments Test Plan: Harbormaster (is awesome) Reviewers: simonmar, hvr, austin Reviewed By: austin Subscribers: hvr, simonmar, ezyang, carter Differential Revision: https://phabricator.haskell.org/D180 --- compiler/parser/Lexer.x | 115 +++++++++++++++++++++++++--------------- 1 file changed, 73 insertions(+), 42 deletions(-) diff --git a/compiler/parser/Lexer.x b/compiler/parser/Lexer.x index 88a0f07d90..cfe795585b 100644 --- a/compiler/parser/Lexer.x +++ b/compiler/parser/Lexer.x @@ -1,15 +1,20 @@ ----------------------------------------------------------------------------- -- (c) The University of Glasgow, 2006 -- --- GHC's lexer. +-- GHC's lexer for Haskell 2010 [1]. -- --- This is a combination of an Alex-generated lexer from a regex --- definition, with some hand-coded bits. +-- This is a combination of an Alex-generated lexer [2] from a regex +-- definition, with some hand-coded bits. [3] -- -- Completely accurate information about token-spans within the source -- file is maintained. Every token has a start and end RealSrcLoc -- attached to it. -- +-- References: +-- [1] https://www.haskell.org/onlinereport/haskell2010/haskellch2.html +-- [2] http://www.haskell.org/alex/ +-- [3] https://ghc.haskell.org/trac/ghc/wiki/Commentary/Compiler/Parser +-- ----------------------------------------------------------------------------- -- ToDo / known bugs: @@ -31,6 +36,10 @@ -- form? This is quite difficult to achieve. We don't do it for -- qualified varids. + +-- ----------------------------------------------------------------------------- +-- Alex "Haskell code fragment top" + { -- XXX The above flags turn off warnings in the generated code: {-# LANGUAGE BangPatterns #-} @@ -91,48 +100,55 @@ import Data.Ratio import Data.Word } -$unispace = \x05 -- Trick Alex into handling Unicode. See alexGetChar. -$whitechar = [\ \n\r\f\v $unispace] -$white_no_nl = $whitechar # \n + +-- ----------------------------------------------------------------------------- +-- Alex "Character set macros" + +$unispace = \x05 -- Trick Alex into handling Unicode. See alexGetByte. +$nl = [\n\r\f] +$whitechar = [$nl\v\ $unispace] +$white_no_nl = $whitechar # \n -- TODO #8424 $tab = \t $ascdigit = 0-9 -$unidigit = \x03 -- Trick Alex into handling Unicode. See alexGetChar. +$unidigit = \x03 -- Trick Alex into handling Unicode. See alexGetByte. $decdigit = $ascdigit -- for now, should really be $digit (ToDo) $digit = [$ascdigit $unidigit] $special = [\(\)\,\;\[\]\`\{\}] -$ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~] -$unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetChar. -$symbol = [$ascsymbol $unisymbol] # [$special \_\:\"\'] +$ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~\:] +$unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetByte. +$symbol = [$ascsymbol $unisymbol] # [$special \_\"\'] -$unilarge = \x01 -- Trick Alex into handling Unicode. See alexGetChar. +$unilarge = \x01 -- Trick Alex into handling Unicode. See alexGetByte. $asclarge = [A-Z] $large = [$asclarge $unilarge] -$unismall = \x02 -- Trick Alex into handling Unicode. See alexGetChar. +$unismall = \x02 -- Trick Alex into handling Unicode. See alexGetByte. $ascsmall = [a-z] $small = [$ascsmall $unismall \_] -$unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetChar. -$graphic = [$small $large $symbol $digit $special $unigraphic \:\"\'] +$unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetByte. +$graphic = [$small $large $symbol $digit $special $unigraphic \"\'] $binit = 0-1 $octit = 0-7 $hexit = [$decdigit A-F a-f] -$symchar = [$symbol \:] -$nl = [\n\r] $idchar = [$small $large $digit \'] $pragmachar = [$small $large $digit] $docsym = [\| \^ \* \$] -@varid = $small $idchar* -@conid = $large $idchar* -@varsym = $symbol $symchar* -@consym = \: $symchar* +-- ----------------------------------------------------------------------------- +-- Alex "Regular expression macros" + +@varid = $small $idchar* -- variable identifiers +@conid = $large $idchar* -- constructor identifiers + +@varsym = ($symbol # \:) $symbol* -- variable (operator) symbol +@consym = \: $symbol* -- constructor (operator) symbol @decimal = $decdigit+ @binary = $binit+ @@ -140,8 +156,11 @@ $docsym = [\| \^ \* \$] @hexadecimal = $hexit+ @exponent = [eE] [\-\+]? @decimal --- we support the hierarchical module name extension: @qual = (@conid \.)+ +@qvarid = @qual @varid +@qconid = @qual @conid +@qvarsym = @qual @varsym +@qconsym = @qual @consym @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent @@ -150,9 +169,17 @@ $docsym = [\| \^ \* \$] @negative = \- @signed = @negative ? + +-- ----------------------------------------------------------------------------- +-- Alex "Identifier" + haskell :- --- everywhere: skip whitespace and comments + +-- ----------------------------------------------------------------------------- +-- Alex "Rules" + +-- everywhere: skip whitespace $white_no_nl+ ; $tab+ { warn Opt_WarnTabs (text "Tab character") } @@ -179,7 +206,7 @@ $tab+ { warn Opt_WarnTabs (text "Tab character") } -- have a Haddock comment). The rules then munch the rest of the line. "-- " ~[$docsym \#] .* { lineCommentToken } -"--" [^$symbol : \ ] .* { lineCommentToken } +"--" [^$symbol \ ] .* { lineCommentToken } -- Next, match Haddock comments if no -haddock flag @@ -191,7 +218,7 @@ $tab+ { warn Opt_WarnTabs (text "Tab character") } -- make sure that the first non-dash character isn't a symbol, and munch the -- rest of the line. -"---"\-* [^$symbol :] .* { lineCommentToken } +"---"\-* ~$symbol .* { lineCommentToken } -- Since the previous rules all match dashes followed by at least one -- character, we also need to match a whole line filled with just dashes. @@ -252,13 +279,13 @@ $tab+ { warn Opt_WarnTabs (text "Tab character") } -- single-line line pragmas, of the form -- # "" \n - $decdigit+ { setLine line_prag1a } + @decimal { setLine line_prag1a } \" [$graphic \ ]* \" { setFile line_prag1b } .* { pop } -- Haskell-style line pragmas, of the form -- {-# LINE "" #-} - $decdigit+ { setLine line_prag2a } + @decimal { setLine line_prag2a } \" [$graphic \ ]* \" { setFile line_prag2b } "#-}"|"-}" { pop } -- NOTE: accept -} at the end of a LINE pragma, for compatibility @@ -341,8 +368,8 @@ $tab+ { warn Opt_WarnTabs (text "Tab character") } { lex_quasiquote_tok } -- qualified quasi-quote (#5555) - "[" @qual @varid "|" / { ifExtension qqEnabled } - { lex_qquasiquote_tok } + "[" @qvarid "|" / { ifExtension qqEnabled } + { lex_qquasiquote_tok } } <0> { @@ -376,15 +403,15 @@ $tab+ { warn Opt_WarnTabs (text "Tab character") } } <0,option_prags> { - @qual @varid { idtoken qvarid } - @qual @conid { idtoken qconid } + @qvarid { idtoken qvarid } + @qconid { idtoken qconid } @varid { varid } @conid { idtoken conid } } <0> { - @qual @varid "#"+ / { ifExtension magicHashEnabled } { idtoken qvarid } - @qual @conid "#"+ / { ifExtension magicHashEnabled } { idtoken qconid } + @qvarid "#"+ / { ifExtension magicHashEnabled } { idtoken qvarid } + @qconid "#"+ / { ifExtension magicHashEnabled } { idtoken qconid } @varid "#"+ / { ifExtension magicHashEnabled } { varid } @conid "#"+ / { ifExtension magicHashEnabled } { idtoken conid } } @@ -392,8 +419,8 @@ $tab+ { warn Opt_WarnTabs (text "Tab character") } -- ToDo: - move `var` and (sym) into lexical syntax? -- - remove backquote from $special? <0> { - @qual @varsym { idtoken qvarsym } - @qual @consym { idtoken qconsym } + @qvarsym { idtoken qvarsym } + @qconsym { idtoken qconsym } @varsym { varsym } @consym { consym } } @@ -453,6 +480,10 @@ $tab+ { warn Opt_WarnTabs (text "Tab character") } \" { lex_string_tok } } + +-- ----------------------------------------------------------------------------- +-- Alex "Haskell code fragment bottom" + { -- ----------------------------------------------------------------------------- -- The token type @@ -467,6 +498,7 @@ data Token | ITdo | ITelse | IThiding + | ITforeign | ITif | ITimport | ITin @@ -484,7 +516,6 @@ data Token | ITwhere | ITforall -- GHC extension keywords - | ITforeign | ITexport | ITlabel | ITdynamic @@ -1738,13 +1769,13 @@ alexGetByte (AI loc s) loc' = advanceSrcLoc loc c byte = fromIntegral $ ord adj_c - non_graphic = '\x0' - upper = '\x1' - lower = '\x2' - digit = '\x3' - symbol = '\x4' - space = '\x5' - other_graphic = '\x6' + non_graphic = '\x00' + upper = '\x01' + lower = '\x02' + digit = '\x03' + symbol = '\x04' + space = '\x05' + other_graphic = '\x06' adj_c | c <= '\x06' = non_graphic -- GitLab