--
-- Haddock - A Haskell Documentation Tool
--
-- (c) Simon Marlow 2002
--
-- This file was modified and integrated into GHC by David Waern 2006
--

{
10
{-# OPTIONS -Wwarn #-}
-- The above warning suppression flag is a temporary kludge.
-- While working on this module you are encouraged to remove it and fix
-- any warnings in the module. See
--     http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings
-- for details

17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
module HaddockLex (
	Token(..),
	tokenise
 ) where

import Lexer hiding (Token)
import Parser ( parseIdentifier )
import StringBuffer
import RdrName
import SrcLoc
import DynFlags

import Char
import Numeric
import System.IO.Unsafe
}

-- Character-set macros used by the rules below.

-- horizontal whitespace: any whitespace character except newline
$ws    = $white # \n
$digit = [0-9]
$hexdigit = [0-9a-fA-F]
-- characters that are always lexed as a TokSpecial of their own
$special =  [\"\@]
$alphanum = [A-Za-z0-9]
-- characters accepted between the quotes of an identifier reference
$ident    = [$alphanum \'\_\.\!\#\$\%\&\*\+\/\<\=\>\?\@\\\\\^\|\-\~]

:-

-- beginning of a paragraph
--
-- Start codes 0 and para share these rules.  Each rule recognises one way a
-- paragraph can open and then switches to the start code that lexes what
-- follows.
<0,para> {
 -- whitespace-only line: skipped, no token
 $ws* \n		;
 -- '>' opens a bird-track line; the rest of the line is lexed in <birdtrack>
 $ws* \>		{ begin birdtrack }
 -- '*' or '-' is a bullet marker
 $ws* [\*\-]		{ token TokBullet `andBegin` string }
 -- '[' opens the label of a definition paragraph (closed by ']' in <def>)
 $ws* \[		{ token TokDefStart `andBegin` def }
 -- "(digits)" is a numbered-item marker; the digits are not kept in the token
 $ws* \( $digit+ \) 	{ token TokNumber `andBegin` string }
 -- anything else: drop leading whitespace and lex ordinary text
 $ws*			{ begin string }		
}

-- beginning of a line
--
-- Entered after a bird-track line or after a string token that ended in a
-- newline.
<line> {
  -- '>' continues (or starts) a run of bird-track lines
  $ws* \>		{ begin birdtrack }
  -- a whitespace-only line ends the paragraph
  $ws* \n		{ token TokPara `andBegin` para }
  -- Here, we really want to be able to say
  -- $ws* (\n | <eof>) 	{ token TokPara `andBegin` para}
  -- because otherwise a trailing line of whitespace will result in 
  -- a spurious TokString at the end of a docstring.  We don't have <eof>,
  -- though (NOW I realise what it was for :-).  To get around this, we always
  -- append \n to the end of a docstring.
  -- empty match: consume nothing and carry on lexing ordinary text
  () 			{ begin string }
}

-- A bird-track line.  The leading '>' has already been consumed by the rule
-- that switched to this start code, so the token carries the rest of the line
-- (including the newline, if any, with CRs stripped by strtokenNL).  Lexing
-- then resumes at the beginning of the next line.
<birdtrack> .*	\n?	{ strtokenNL TokBirdTrack `andBegin` line }

-- Ordinary text, shared by the body of a paragraph (<string>) and the label
-- of a definition paragraph (<def>).
<string,def> {
  -- '"' and '@' are always tokens of their own
  $special			{ strtoken $ \s -> TokSpecial (head s) }
  -- <<picture>>, <url>, #anchor# and /emphasis/: the delimiters are stripped
  \<\<.*\>\>                    { strtoken $ \s -> TokPic (init $ init $ tail $ tail s) }
  \<.*\>			{ strtoken $ \s -> TokURL (init (tail s)) }
  \#.*\#			{ strtoken $ \s -> TokAName (init (tail s)) }
  \/ [^\/]* \/                  { strtoken $ \s -> TokEmphasis (init (tail s)) }
  -- 'identifier' or `identifier`; falls back to plain text if it doesn't parse
  [\'\`] $ident+ [\'\`]		{ ident }
  -- backslash escape: keep only the escaped character
  \\ .				{ strtoken (TokString . tail) }
  -- Numeric character references, decimal and hexadecimal.  The value is read
  -- as an Integer and checked against the largest code point (0x10FFFF)
  -- before calling chr, because chr raises an error outside that range.  A
  -- reference that is out of range is passed through as literal text.
  "&#" $digit+ \;		{ strtoken $ \s -> let n = read (init (drop 2 s)) :: Integer in if n <= 0x10FFFF then TokString [chr (fromInteger n)] else TokString s }
  "&#" [xX] $hexdigit+ \;	{ strtoken $ \s -> case readHex (init (drop 3 s)) of [(n,_)] | n <= (0x10FFFF :: Integer) -> TokString [chr (fromInteger n)] ; _ -> TokString s }
  -- allow special characters through if they don't fit one of the previous
  -- patterns.
  [\/\'\`\<\#\&\\]			{ strtoken TokString }
  -- plain text up to and including the end of the line
  [^ $special \/ \< \# \n \'\` \& \\ \]]* \n { strtokenNL TokString `andBegin` line }
  -- plain text that stops before the next character needing special treatment
  [^ $special \/ \< \# \n \'\` \& \\ \]]+    { strtoken TokString }
}

-- Inside the [...] label of a definition paragraph, ']' closes the label and
-- lexing continues with the ordinary text of the paragraph.
<def> {
  \]				{ token TokDefEnd `andBegin` string }
}

-- ']' doesn't have any special meaning outside of the [...] at the beginning
-- of a definition paragraph.
<string> {
  \]				{ strtoken TokString }
}

{
-- | The tokens produced by 'tokenise'.
data Token
  = TokPara              -- ^ a whitespace-only line at the beginning of a line
  | TokNumber            -- ^ @(digits)@ at the start of a paragraph; the digits are dropped
  | TokBullet            -- ^ @*@ or @-@ at the start of a paragraph
  | TokDefStart          -- ^ @[@ at the start of a paragraph
  | TokDefEnd            -- ^ the @]@ that closes a definition label
  | TokSpecial Char      -- ^ one of the special characters @\"@ and @\@@
  | TokIdent [RdrName]   -- ^ a quoted identifier that parsed as a Haskell name
  | TokString String     -- ^ plain text
  | TokURL String        -- ^ the text between @\<@ and @\>@
  | TokPic String        -- ^ the text between @\<\<@ and @\>\>@
  | TokEmphasis String   -- ^ the text between two @\/@
  | TokAName String      -- ^ the text between two @#@
  | TokBirdTrack String  -- ^ the remainder of a line that started with @\>@
--  deriving Show

-- -----------------------------------------------------------------------------
-- Alex support stuff

-- | An Alex start code: the number identifying a lexer state.
type StartCode = Int
-- | A rule action.  It receives the matched text, the current start code and
-- a continuation that lexes the remaining input from a given start code, and
-- returns the full list of tokens from this point on.
type Action = String -> StartCode -> (StartCode -> [Token]) -> [Token]

-- | Lexer input: the character consumed most recently, paired with the text
-- that is still to be lexed.
type AlexInput = (Char,String)

-- | Take the next character off the input.  Returns 'Nothing' at the end of
-- the input; otherwise the character together with the advanced input, whose
-- \"previous character\" is the one just taken.
alexGetChar :: AlexInput -> Maybe (Char, AlexInput)
alexGetChar (_, [])   = Nothing
alexGetChar (_, c:cs) = Just (c, (c,cs))

-- | The character that was consumed immediately before the current position.
alexInputPrevChar :: AlexInput -> Char
alexInputPrevChar (c,_) = c

-- | Split a documentation string into 'Token's.
--
-- Lexing starts in the @para@ start code with a newline as the notional
-- previous character, and the input is passed through 'eofHack' first.  Each
-- action receives the matched text, the current start code and a continuation
-- that resumes lexing after the match.  Calls 'error' if the generated
-- scanner reports a lexical error.
tokenise :: String -> [Token]
tokenise str = go ('\n', eofHack str) para
  where
    go inp@(_, rest) sc =
      case alexScan inp sc of
        AlexEOF                -> []
        AlexError _            -> error "lexical error"
        AlexSkip  inp' _       -> go inp' sc
        AlexToken inp' len act -> act (take len rest) sc (go inp')

-- | Append a final newline to the input.
--
-- NB. the lexer has no way to match end-of-input, so 'tokenise' relies on
-- every docstring ending in \n (see the comment in the rules for the
-- beginning of a line, above).
eofHack :: String -> String
eofHack str = str ++ "\n"

-- | Run an action as if the current start code were @new_sc@.  The start code
-- that was actually current is ignored, so whatever the wrapped action passes
-- on to its continuation is based on @new_sc@.
andBegin  :: Action -> StartCode -> Action
andBegin act new_sc str _ cont = act str new_sc cont

-- | An action that emits the fixed token @t@, ignoring the matched text, and
-- continues lexing in the current start code.
token :: Token -> Action
token t _ sc cont = t : cont sc

-- | Actions that build a token from the matched text with @t@ and continue
-- lexing in the current start code.
--
-- 'strtokenNL' additionally removes every CR from the matched text first: we
-- only want LF line endings in our internal doc string format.
strtoken, strtokenNL :: (String -> Token) -> Action
strtoken   t str sc cont = t str : cont sc
strtokenNL t str sc cont = t (filter (/= '\r') str) : cont sc

-- | An action that emits no token and continues lexing in start code @sc@,
-- ignoring both the matched text and the start code that was current.
begin :: StartCode -> Action
begin sc _ _ cont = cont sc

-- -----------------------------------------------------------------------------
-- Lex a string as a Haskell identifier

-- | Action for a quoted identifier.  The matched text includes the opening
-- and closing quote characters; the text between them is handed to
-- 'strToHsQNames'.  If it parses, a 'TokIdent' is emitted; otherwise the
-- whole match, quotes included, is emitted as plain text.  The start code is
-- left unchanged.
ident :: Action
ident quoted sc cont =
  case strToHsQNames unquoted of
    Nothing    -> TokString quoted : cont sc
    Just names -> TokIdent names : cont sc
  where
    -- drop the first and last character, i.e. the two quotes
    unquoted = init (tail quoted)

-- | Try to parse a string as a single Haskell identifier by running
-- 'parseIdentifier' on a buffer made from it.  Returns the name in a
-- singleton list on success and 'Nothing' on any other parser result.
strToHsQNames :: String -> Maybe [RdrName]
strToHsQNames str0 =
  case unP parseIdentifier initState of
    POk _ name -> Just [unLoc name]
    _          -> Nothing
  where
    -- NOTE(review): unsafePerformIO is presumably safe here because
    -- stringToStringBuffer only builds a fresh buffer from its argument;
    -- confirm against StringBuffer.
    buffer    = unsafePerformIO (stringToStringBuffer str0)
    initState = mkPState buffer noSrcLoc defaultDynFlags
}