diff --git a/compiler/main/DynFlags.hs b/compiler/main/DynFlags.hs index 114a523c2af906a235db7ede84891f6a7403809f..ec9bca88e98e4bb248ea50f4c5c26781835263ef 100644 --- a/compiler/main/DynFlags.hs +++ b/compiler/main/DynFlags.hs @@ -296,6 +296,7 @@ data DynFlag | Opt_KeepSFiles | Opt_KeepRawSFiles | Opt_KeepTmpFiles + | Opt_KeepRawTokenStream deriving (Eq, Show) diff --git a/compiler/main/GHC.hs b/compiler/main/GHC.hs index 472f587e56dd2e063710dc1ac1d81c190de44575..766ed011f29e4a2c99d706d8711989dc883fcedb 100644 --- a/compiler/main/GHC.hs +++ b/compiler/main/GHC.hs @@ -198,6 +198,11 @@ module GHC ( -- * Exceptions GhcException(..), showGhcException, + -- * Token stream manipulations + Token, + getTokenStream, getRichTokenStream, + showRichTokenStream, addSourceToTokens, + -- * Miscellaneous --sessionHscEnv, cyclicModuleErr, @@ -269,13 +274,14 @@ import Bag ( unitBag, listToBag, emptyBag, isEmptyBag ) import ErrUtils import MonadUtils import Util -import StringBuffer ( StringBuffer, hGetStringBuffer ) +import StringBuffer ( StringBuffer, hGetStringBuffer, nextChar ) import Outputable import BasicTypes import Maybes ( expectJust, mapCatMaybes ) import HaddockParse import HaddockLex ( tokenise ) import FastString +import Lexer import Control.Concurrent import System.Directory ( getModificationTime, doesFileExist, @@ -1717,7 +1723,7 @@ topSortModuleGraph -- -- True: eliminate the hi-boot nodes, and instead pretend -- the a source-import of Foo is an import of Foo --- The resulting graph has no hi-boot nodes, but can by cyclic +-- The resulting graph has no hi-boot nodes, but can be cyclic topSortModuleGraph drop_hs_boot_nodes summaries mb_root_mod = map (fmap summaryNodeSummary) $ stronglyConnCompG initial_graph @@ -2440,12 +2446,85 @@ pprParenSymName a = parenSymOcc (getOccName a) (ppr (getName a)) -- :browse will use either lm_toplev or inspect lm_interface, depending -- on whether the module is interpreted or not. --- This is for reconstructing refactored source code --- Calls the lexer repeatedly. --- ToDo: add comment tokens to token stream -getTokenStream :: Session -> Module -> IO [Located Token] #endif +-- Extract the filename, stringbuffer content and dynflags associed to a module +-- +-- XXX: Explain pre-conditions +getModuleSourceAndFlags :: GhcMonad m => Module -> m (String, StringBuffer, DynFlags) +getModuleSourceAndFlags mod = do + m <- getModSummary (moduleName mod) + case ml_hs_file $ ms_location m of + Nothing -> throw $ mkApiErr (text "No source available for module " <+> ppr mod) + Just sourceFile -> do + source <- liftIO $ hGetStringBuffer sourceFile + return (sourceFile, source, ms_hspp_opts m) + + +-- | Return module source as token stream, including comments. +-- +-- The module must be in the module graph and its source must be available. +-- Throws a 'HscTypes.SourceError' on parse error. +getTokenStream :: GhcMonad m => Module -> m [Located Token] +getTokenStream mod = do + (sourceFile, source, flags) <- getModuleSourceAndFlags mod + let startLoc = mkSrcLoc (mkFastString sourceFile) 0 0 + case lexTokenStream source startLoc flags of + POk _ ts -> return ts + PFailed span err -> throw $ mkSrcErr (unitBag $ mkPlainErrMsg span err) + +-- | Give even more information on the source than 'getTokenStream' +-- This function allows reconstructing the source completely with +-- 'showRichTokenStream'. +getRichTokenStream :: GhcMonad m => Module -> m [(Located Token, String)] +getRichTokenStream mod = do + (sourceFile, source, flags) <- getModuleSourceAndFlags mod + let startLoc = mkSrcLoc (mkFastString sourceFile) 0 0 + case lexTokenStream source startLoc flags of + POk _ ts -> return $ addSourceToTokens startLoc source ts + PFailed span err -> throw $ mkSrcErr (unitBag $ mkPlainErrMsg span err) + +-- | Given a source location and a StringBuffer corresponding to this +-- location, return a rich token stream with the source associated to the +-- tokens. +addSourceToTokens :: SrcLoc -> StringBuffer -> [Located Token] + -> [(Located Token, String)] +addSourceToTokens _ _ [] = [] +addSourceToTokens loc buf (t@(L span _) : ts) + | not (isGoodSrcSpan span) = (t,"") : addSourceToTokens loc buf ts + | otherwise = (t,str) : addSourceToTokens newLoc newBuf ts + where + (newLoc, newBuf, str) = go "" loc buf + start = srcSpanStart span + end = srcSpanEnd span + go acc loc buf | loc < start = go acc nLoc nBuf + | start <= loc && loc < end = go (ch:acc) nLoc nBuf + | otherwise = (loc, buf, reverse acc) + where (ch, nBuf) = nextChar buf + nLoc = advanceSrcLoc loc ch + + +-- | Take a rich token stream such as produced from 'getRichTokenStream' and +-- return source code almost identical to the original code (except for +-- insignificant whitespace.) +showRichTokenStream :: [(Located Token, String)] -> String +showRichTokenStream ts = go startLoc ts "" + where sourceFile = srcSpanFile (getLoc . fst . head $ ts) + startLoc = mkSrcLoc sourceFile 0 0 + go _ [] = id + go loc ((L span _, str):ts) + | not (isGoodSrcSpan span) = go loc ts + | locLine == tokLine = ((replicate (tokCol - locCol) ' ') ++) + . (str ++) + . go tokEnd ts + | otherwise = ((replicate (tokLine - locLine) '\n') ++) + . ((replicate tokCol ' ') ++) + . (str ++) + . go tokEnd ts + where (locLine, locCol) = (srcLocLine loc, srcLocCol loc) + (tokLine, tokCol) = (srcSpanStartLine span, srcSpanStartCol span) + tokEnd = srcSpanEnd span + -- ----------------------------------------------------------------------------- -- Interactive evaluation diff --git a/compiler/parser/Lexer.x b/compiler/parser/Lexer.x index 66f4fe51366d36d60087e99ef2b32eba25c2f8cb..613848ade970d5c77e824f9c10b21b83779c8961 100644 --- a/compiler/parser/Lexer.x +++ b/compiler/parser/Lexer.x @@ -41,7 +41,8 @@ module Lexer ( popContext, pushCurrentContext, setLastToken, setSrcLoc, getLexState, popLexState, pushLexState, extension, standaloneDerivingEnabled, bangPatEnabled, - addWarning + addWarning, + lexTokenStream ) where import Bag @@ -148,12 +149,12 @@ $tab+ { warn Opt_WarnTabs (text "Tab character") } -- space followed by a Haddock comment symbol (docsym) (in which case we'd -- have a Haddock comment). The rules then munch the rest of the line. -"-- " ~[$docsym \#] .* ; -"--" [^$symbol : \ ] .* ; +"-- " ~[$docsym \#] .* { lineCommentToken } +"--" [^$symbol : \ ] .* { lineCommentToken } -- Next, match Haddock comments if no -haddock flag -"-- " [$docsym \#] .* / { ifExtension (not . haddockEnabled) } ; +"-- " [$docsym \#] .* / { ifExtension (not . haddockEnabled) } { lineCommentToken } -- Now, when we've matched comments that begin with 2 dashes and continue -- with a different character, we need to match comments that begin with three @@ -161,17 +162,17 @@ $tab+ { warn Opt_WarnTabs (text "Tab character") } -- make sure that the first non-dash character isn't a symbol, and munch the -- rest of the line. -"---"\-* [^$symbol :] .* ; +"---"\-* [^$symbol :] .* { lineCommentToken } -- Since the previous rules all match dashes followed by at least one -- character, we also need to match a whole line filled with just dashes. -"--"\-* / { atEOL } ; +"--"\-* / { atEOL } { lineCommentToken } -- We need this rule since none of the other single line comment rules -- actually match this case. -"-- " / { atEOL } ; +"-- " / { atEOL } { lineCommentToken } -- 'bol' state: beginning of a line. Slurp up all the whitespace (including -- blank lines) until we find a non-whitespace character, then do layout @@ -277,7 +278,7 @@ $tab+ { warn Opt_WarnTabs (text "Tab character") } } <0> { - "-- #" .* ; + "-- #" .* { lineCommentToken } } <0,option_prags> { @@ -575,6 +576,8 @@ data Token | ITdocSection Int String -- a section heading | ITdocOptions String -- doc options (prune, ignore-exports, etc) | ITdocOptionsOld String -- doc options declared "-- # ..."-style + | ITlineComment String -- comment starting by "--" + | ITblockComment String -- comment in {- -} #ifdef DEBUG deriving Show -- debugging @@ -802,6 +805,11 @@ multiline_doc_comment span buf _len = withLexedDocType (worker "") | otherwise -> input Nothing -> input +lineCommentToken :: Action +lineCommentToken span buf len = do + b <- extension rawTokenStreamEnabled + if b then strtoken ITlineComment span buf len else lexToken + {- nested comments require traversing by hand, they can't be parsed using regular expressions. @@ -809,20 +817,24 @@ multiline_doc_comment span buf _len = withLexedDocType (worker "") nested_comment :: P (Located Token) -> Action nested_comment cont span _str _len = do input <- getInput - go (1::Int) input + go "" (1::Int) input where - go 0 input = do setInput input; cont - go n input = case alexGetChar input of + go commentAcc 0 input = do setInput input + b <- extension rawTokenStreamEnabled + if b + then docCommentEnd input commentAcc ITblockComment _str span + else cont + go commentAcc n input = case alexGetChar input of Nothing -> errBrace input span Just ('-',input) -> case alexGetChar input of Nothing -> errBrace input span - Just ('\125',input) -> go (n-1) input - Just (_,_) -> go n input + Just ('\125',input) -> go commentAcc (n-1) input + Just (_,_) -> go ('-':commentAcc) n input Just ('\123',input) -> case alexGetChar input of Nothing -> errBrace input span - Just ('-',input) -> go (n+1) input - Just (_,_) -> go n input - Just (_,input) -> go n input + Just ('-',input) -> go ('-':'\123':commentAcc) (n+1) input + Just (_,_) -> go ('\123':commentAcc) n input + Just (c,input) -> go (c:commentAcc) n input nested_doc_comment :: Action nested_doc_comment span buf _len = withLexedDocType (go "") @@ -1596,6 +1608,7 @@ standaloneDerivingBit = 16 -- standalone instance deriving declarations transformComprehensionsBit = 17 qqBit = 18 -- enable quasiquoting inRulePragBit = 19 +rawTokenStreamBit = 20 -- producing a token stream with all comments included genericsEnabled, ffiEnabled, parrEnabled :: Int -> Bool always _ = True @@ -1618,6 +1631,7 @@ standaloneDerivingEnabled flags = testBit flags standaloneDerivingBit transformComprehensionsEnabled flags = testBit flags transformComprehensionsBit qqEnabled flags = testBit flags qqBit inRulePrag flags = testBit flags inRulePragBit +rawTokenStreamEnabled flags = testBit flags rawTokenStreamBit -- PState for parsing options pragmas -- @@ -1679,7 +1693,8 @@ mkPState buf loc flags = .|. unicodeSyntaxBit `setBitIf` dopt Opt_UnicodeSyntax flags .|. unboxedTuplesBit `setBitIf` dopt Opt_UnboxedTuples flags .|. standaloneDerivingBit `setBitIf` dopt Opt_StandaloneDeriving flags - .|. transformComprehensionsBit `setBitIf` dopt Opt_TransformListComp flags + .|. transformComprehensionsBit `setBitIf` dopt Opt_TransformListComp flags + .|. rawTokenStreamBit `setBitIf` dopt Opt_KeepRawTokenStream flags -- setBitIf :: Int -> Bool -> Int b `setBitIf` cond | cond = bit b @@ -1795,4 +1810,13 @@ reportLexError loc1 loc2 buf str if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar# then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)") else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c) + +lexTokenStream :: StringBuffer -> SrcLoc -> DynFlags -> ParseResult [Located Token] +lexTokenStream buf loc dflags = unP go initState + where initState = mkPState buf loc (dopt_set (dopt_unset dflags Opt_Haddock) Opt_KeepRawTokenStream) + go = do + ltok <- lexer return + case ltok of + L _ ITeof -> return [] + _ -> liftM (ltok:) go }