Commit 36104d7a authored by Jedai

RichTokenStream support

This patch adds support for raw token streams, which contain more
information than normal token streams (at the least, they contain
comments). The "lexTokenStream" function brings this support to the
Lexer module. In addition, functions have been added to the GHC
module to make it easier to recover the token stream of a module
("getTokenStream").

Building on that, I added what could be called a "rich token
stream": a token stream in which each token is paired with the
source string it corresponds to. The function "addSourceToTokens"
takes a StringBuffer, a starting SrcLoc and a token stream, and
builds this rich token stream. "getRichTokenStream" is a
convenience function to get the rich token stream of a module.
"showRichTokenStream" uses the SrcLoc information in such a token
stream to produce a string similar to the original source (except
for insignificant whitespace). Thus "putStrLn .
showRichTokenStream =<< getRichTokenStream mod" should print a
valid module source, the interesting part being, of course, to
modify the token stream between the get and the show.
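
A minimal sketch of the intended round trip (assuming the usual GHC API
entry points and the ghc-paths package for libdir; "printSource", the
module Foo and the file Foo.hs are made up for the example):

import GHC
import GHC.Paths ( libdir )   -- assumption: the ghc-paths package
import MonadUtils ( liftIO )

-- Load Foo.hs, then print its source as reconstructed from the
-- rich token stream (error handling omitted).
printSource :: IO ()
printSource = runGhc (Just libdir) $ do
  dflags <- getSessionDynFlags
  _ <- setSessionDynFlags dflags
  target <- guessTarget "Foo.hs" Nothing
  setTargets [target]
  _ <- load LoadAllTargets
  mod <- findModule (mkModuleName "Foo") Nothing
  ts <- getRichTokenStream mod
  liftIO (putStrLn (showRichTokenStream ts))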
parent c9bf1a2c
@@ -296,6 +296,7 @@ data DynFlag
| Opt_KeepSFiles
| Opt_KeepRawSFiles
| Opt_KeepTmpFiles
| Opt_KeepRawTokenStream
deriving (Eq, Show)
@@ -198,6 +198,11 @@ module GHC (
-- * Exceptions
GhcException(..), showGhcException,
-- * Token stream manipulations
Token,
getTokenStream, getRichTokenStream,
showRichTokenStream, addSourceToTokens,
-- * Miscellaneous
--sessionHscEnv,
cyclicModuleErr,
@@ -269,13 +274,14 @@ import Bag ( unitBag, listToBag, emptyBag, isEmptyBag )
import ErrUtils
import MonadUtils
import Util
-import StringBuffer ( StringBuffer, hGetStringBuffer )
+import StringBuffer ( StringBuffer, hGetStringBuffer, nextChar )
import Outputable
import BasicTypes
import Maybes ( expectJust, mapCatMaybes )
import HaddockParse
import HaddockLex ( tokenise )
import FastString
import Lexer
import Control.Concurrent
import System.Directory ( getModificationTime, doesFileExist,
@@ -1717,7 +1723,7 @@ topSortModuleGraph
--
-- True: eliminate the hi-boot nodes, and instead pretend
-- the a source-import of Foo is an import of Foo
--- The resulting graph has no hi-boot nodes, but can by cyclic
+-- The resulting graph has no hi-boot nodes, but can be cyclic
topSortModuleGraph drop_hs_boot_nodes summaries mb_root_mod
= map (fmap summaryNodeSummary) $ stronglyConnCompG initial_graph
@@ -2440,12 +2446,85 @@ pprParenSymName a = parenSymOcc (getOccName a) (ppr (getName a))
-- :browse will use either lm_toplev or inspect lm_interface, depending
-- on whether the module is interpreted or not.
-- This is for reconstructing refactored source code
-- Calls the lexer repeatedly.
-- ToDo: add comment tokens to token stream
getTokenStream :: Session -> Module -> IO [Located Token]
#endif
-- Extract the filename, stringbuffer content and dynflags associated with a module
--
-- XXX: Explain pre-conditions
getModuleSourceAndFlags :: GhcMonad m => Module -> m (String, StringBuffer, DynFlags)
getModuleSourceAndFlags mod = do
m <- getModSummary (moduleName mod)
case ml_hs_file $ ms_location m of
Nothing -> throw $ mkApiErr (text "No source available for module " <+> ppr mod)
Just sourceFile -> do
source <- liftIO $ hGetStringBuffer sourceFile
return (sourceFile, source, ms_hspp_opts m)
-- | Return the module source as a token stream, including comments.
--
-- The module must be in the module graph and its source must be available.
-- Throws a 'HscTypes.SourceError' on parse error.
getTokenStream :: GhcMonad m => Module -> m [Located Token]
getTokenStream mod = do
(sourceFile, source, flags) <- getModuleSourceAndFlags mod
let startLoc = mkSrcLoc (mkFastString sourceFile) 0 0
case lexTokenStream source startLoc flags of
POk _ ts -> return ts
PFailed span err -> throw $ mkSrcErr (unitBag $ mkPlainErrMsg span err)
-- | Give even more information on the source than 'getTokenStream'.
-- This function allows reconstructing the source completely with
-- 'showRichTokenStream'.
getRichTokenStream :: GhcMonad m => Module -> m [(Located Token, String)]
getRichTokenStream mod = do
(sourceFile, source, flags) <- getModuleSourceAndFlags mod
let startLoc = mkSrcLoc (mkFastString sourceFile) 0 0
case lexTokenStream source startLoc flags of
POk _ ts -> return $ addSourceToTokens startLoc source ts
PFailed span err -> throw $ mkSrcErr (unitBag $ mkPlainErrMsg span err)
-- | Given a source location and a StringBuffer corresponding to this
-- location, return a rich token stream with the source associated with
-- the tokens.
addSourceToTokens :: SrcLoc -> StringBuffer -> [Located Token]
-> [(Located Token, String)]
addSourceToTokens _ _ [] = []
addSourceToTokens loc buf (t@(L span _) : ts)
| not (isGoodSrcSpan span) = (t,"") : addSourceToTokens loc buf ts
| otherwise = (t,str) : addSourceToTokens newLoc newBuf ts
where
(newLoc, newBuf, str) = go "" loc buf
start = srcSpanStart span
end = srcSpanEnd span
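-- Walk the buffer one character at a time: skip characters that come
-- before the token's span, accumulate (in reverse) those inside it,
-- and stop at its end, returning the position and buffer reached
-- together with the token's source text.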
go acc loc buf | loc < start = go acc nLoc nBuf
| start <= loc && loc < end = go (ch:acc) nLoc nBuf
| otherwise = (loc, buf, reverse acc)
where (ch, nBuf) = nextChar buf
nLoc = advanceSrcLoc loc ch
-- | Take a rich token stream such as produced by 'getRichTokenStream' and
-- return source code almost identical to the original code (except for
-- insignificant whitespace).
showRichTokenStream :: [(Located Token, String)] -> String
showRichTokenStream ts = go startLoc ts ""
where sourceFile = srcSpanFile (getLoc . fst . head $ ts)
startLoc = mkSrcLoc sourceFile 0 0
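-- Pad with newlines and spaces to move from the current position to
-- the token's start, then emit the token's source text; building a
-- String -> String function keeps the appends cheap.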
go _ [] = id
go loc ((L span _, str):ts)
| not (isGoodSrcSpan span) = go loc ts
| locLine == tokLine = ((replicate (tokCol - locCol) ' ') ++)
. (str ++)
. go tokEnd ts
| otherwise = ((replicate (tokLine - locLine) '\n') ++)
. ((replicate tokCol ' ') ++)
. (str ++)
. go tokEnd ts
where (locLine, locCol) = (srcLocLine loc, srcLocCol loc)
(tokLine, tokCol) = (srcSpanStartLine span, srcSpanStartCol span)
tokEnd = srcSpanEnd span
-- -----------------------------------------------------------------------------
-- Interactive evaluation
@@ -41,7 +41,8 @@ module Lexer (
popContext, pushCurrentContext, setLastToken, setSrcLoc,
getLexState, popLexState, pushLexState,
extension, standaloneDerivingEnabled, bangPatEnabled,
-addWarning
+addWarning,
+lexTokenStream
) where
import Bag
@@ -148,12 +149,12 @@ $tab+ { warn Opt_WarnTabs (text "Tab character") }
-- space followed by a Haddock comment symbol (docsym) (in which case we'd
-- have a Haddock comment). The rules then munch the rest of the line.
"-- " ~[$docsym \#] .* ;
"--" [^$symbol : \ ] .* ;
"-- " ~[$docsym \#] .* { lineCommentToken }
"--" [^$symbol : \ ] .* { lineCommentToken }
-- Next, match Haddock comments if no -haddock flag
"-- " [$docsym \#] .* / { ifExtension (not . haddockEnabled) } ;
"-- " [$docsym \#] .* / { ifExtension (not . haddockEnabled) } { lineCommentToken }
-- Now, when we've matched comments that begin with 2 dashes and continue
-- with a different character, we need to match comments that begin with three
@@ -161,17 +162,17 @@ $tab+ { warn Opt_WarnTabs (text "Tab character") }
-- make sure that the first non-dash character isn't a symbol, and munch the
-- rest of the line.
"---"\-* [^$symbol :] .* ;
"---"\-* [^$symbol :] .* { lineCommentToken }
-- Since the previous rules all match dashes followed by at least one
-- character, we also need to match a whole line filled with just dashes.
"--"\-* / { atEOL } ;
"--"\-* / { atEOL } { lineCommentToken }
-- We need this rule since none of the other single line comment rules
-- actually match this case.
"-- " / { atEOL } ;
"-- " / { atEOL } { lineCommentToken }
-- 'bol' state: beginning of a line. Slurp up all the whitespace (including
-- blank lines) until we find a non-whitespace character, then do layout
@@ -277,7 +278,7 @@ $tab+ { warn Opt_WarnTabs (text "Tab character") }
}
<0> {
"-- #" .* ;
"-- #" .* { lineCommentToken }
}
<0,option_prags> {
@@ -575,6 +576,8 @@ data Token
| ITdocSection Int String -- a section heading
| ITdocOptions String -- doc options (prune, ignore-exports, etc)
| ITdocOptionsOld String -- doc options declared "-- # ..."-style
| ITlineComment String -- comment starting with "--"
| ITblockComment String -- comment in {- -}
#ifdef DEBUG
deriving Show -- debugging
@@ -802,6 +805,11 @@ multiline_doc_comment span buf _len = withLexedDocType (worker "")
| otherwise -> input
Nothing -> input
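-- Make a token from the line comment just matched, but only when the
-- raw token stream is kept; otherwise resume lexing at the next token.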
lineCommentToken :: Action
lineCommentToken span buf len = do
b <- extension rawTokenStreamEnabled
if b then strtoken ITlineComment span buf len else lexToken
{-
nested comments require traversing by hand, they can't be parsed
using regular expressions.
@@ -809,20 +817,24 @@ multiline_doc_comment span buf _len = withLexedDocType (worker "")
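-- 'go' now threads an accumulator holding the comment text seen so far
-- (in reverse); when raw token streams are enabled the finished comment
-- is returned as an ITblockComment token instead of being discarded.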
nested_comment :: P (Located Token) -> Action
nested_comment cont span _str _len = do
input <- getInput
-go (1::Int) input
+go "" (1::Int) input
where
-go 0 input = do setInput input; cont
-go n input = case alexGetChar input of
+go commentAcc 0 input = do setInput input
+                           b <- extension rawTokenStreamEnabled
+                           if b
+                             then docCommentEnd input commentAcc ITblockComment _str span
+                             else cont
+go commentAcc n input = case alexGetChar input of
Nothing -> errBrace input span
Just ('-',input) -> case alexGetChar input of
Nothing -> errBrace input span
-Just ('\125',input) -> go (n-1) input
-Just (_,_) -> go n input
+Just ('\125',input) -> go commentAcc (n-1) input
+Just (_,_) -> go ('-':commentAcc) n input
Just ('\123',input) -> case alexGetChar input of
Nothing -> errBrace input span
-Just ('-',input) -> go (n+1) input
-Just (_,_) -> go n input
-Just (_,input) -> go n input
+Just ('-',input) -> go ('-':'\123':commentAcc) (n+1) input
+Just (_,_) -> go ('\123':commentAcc) n input
+Just (c,input) -> go (c:commentAcc) n input
nested_doc_comment :: Action
nested_doc_comment span buf _len = withLexedDocType (go "")
@@ -1596,6 +1608,7 @@ standaloneDerivingBit = 16 -- standalone instance deriving declarations
transformComprehensionsBit = 17
qqBit = 18 -- enable quasiquoting
inRulePragBit = 19
rawTokenStreamBit = 20 -- producing a token stream with all comments included
genericsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
always _ = True
@@ -1618,6 +1631,7 @@ standaloneDerivingEnabled flags = testBit flags standaloneDerivingBit
transformComprehensionsEnabled flags = testBit flags transformComprehensionsBit
qqEnabled flags = testBit flags qqBit
inRulePrag flags = testBit flags inRulePragBit
rawTokenStreamEnabled flags = testBit flags rawTokenStreamBit
-- PState for parsing options pragmas
--
@@ -1679,7 +1693,8 @@ mkPState buf loc flags =
.|. unicodeSyntaxBit `setBitIf` dopt Opt_UnicodeSyntax flags
.|. unboxedTuplesBit `setBitIf` dopt Opt_UnboxedTuples flags
.|. standaloneDerivingBit `setBitIf` dopt Opt_StandaloneDeriving flags
-.|. transformComprehensionsBit `setBitIf` dopt Opt_TransformListComp flags
+.|. transformComprehensionsBit `setBitIf` dopt Opt_TransformListComp flags
+.|. rawTokenStreamBit `setBitIf` dopt Opt_KeepRawTokenStream flags
--
setBitIf :: Int -> Bool -> Int
b `setBitIf` cond | cond = bit b
@@ -1795,4 +1810,13 @@ reportLexError loc1 loc2 buf str
if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
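-- | Lex a whole buffer to a token list. Haddock processing is switched
-- off and Opt_KeepRawTokenStream switched on, so comments appear in the
-- stream as ITlineComment and ITblockComment tokens.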
lexTokenStream :: StringBuffer -> SrcLoc -> DynFlags -> ParseResult [Located Token]
lexTokenStream buf loc dflags = unP go initState
where initState = mkPState buf loc (dopt_set (dopt_unset dflags Opt_Haddock) Opt_KeepRawTokenStream)
go = do
ltok <- lexer return
case ltok of
L _ ITeof -> return []
_ -> liftM (ltok:) go
}
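
Driving the new entry point directly might look like the sketch below
("lexFile" is a hypothetical helper; the DynFlags are assumed to come
from an existing GHC session):

import DynFlags ( DynFlags )
import FastString ( mkFastString )
import Lexer
import SrcLoc ( Located, mkSrcLoc )
import StringBuffer ( hGetStringBuffer )

-- Lex a source file to a raw token stream, comments included.
lexFile :: DynFlags -> FilePath -> IO [Located Token]
lexFile dflags file = do
  buf <- hGetStringBuffer file
  case lexTokenStream buf (mkSrcLoc (mkFastString file) 0 0) dflags of
    POk _ ts    -> return ts
    PFailed _ _ -> fail ("lexing failed: " ++ file)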