Skip to content
Snippets Groups Projects
Commit 541c415f authored by Oleg Grenrus's avatar Oleg Grenrus
Browse files

Use alex in latin1 mode

At the moment we have very lax "name" lexer, which is essentially
*everything else* than otherwise specified characters: parens, space,
symbols (but not hyphen -)

This removes `TokNum` and `SecArgNum`, both are merged into `TokSym` and
`SecArgSym` respectively.
parent e98e16e9
No related branches found
No related tags found
No related merge requests found
......@@ -338,7 +338,7 @@ parseName pos args = case args of
[SecArgName _pos secName] ->
pure $ fromUTF8BS secName
[SecArgStr _pos secName] ->
pure secName
pure $ fromUTF8BS secName
[] -> do
parseFailure pos $ "name required"
pure ""
......
......@@ -91,7 +91,6 @@ parser = condOr
-- Number token can have many dots in it: SecArgNum (Position 65 15) "7.6.1"
ident = tokenPrim $ \t -> case t of
SecArgName _ s -> Just $ fromUTF8BS s
SecArgNum _ s -> Just $ fromUTF8BS s
_ -> Nothing
boolLiteral' = tokenPrim $ \t -> case t of
......
This diff is collapsed.
......@@ -90,10 +90,9 @@ getTokenWithPos getTok = tokenPrim (\(L _ t) -> describeToken t) updatePos getTo
describeToken :: Token -> String
describeToken t = case t of
TokSym s -> "name " ++ show s
TokStr s -> "string " ++ show s
TokNum s -> "number " ++ show s
TokOther s -> "symbol " ++ show s
TokSym s -> "symbol " ++ show s
TokStr s -> "string " ++ show s
TokOther s -> "operator " ++ show s
Indent _ -> "new line"
TokFieldLine _ -> "field content"
Colon -> "\":\""
......@@ -103,16 +102,15 @@ describeToken t = case t of
EOF -> "end of file"
LexicalError is -> "character in input " ++ show (B8.head is)
tokName :: Parser (Name Position)
tokName', tokStr, tokNum, tokOther :: Parser (SectionArg Position)
tokSym :: Parser (Name Position)
tokSym', tokStr, tokOther :: Parser (SectionArg Position)
tokIndent :: Parser Int
tokColon, tokOpenBrace, tokCloseBrace :: Parser ()
tokFieldLine :: Parser (FieldLine Position)
tokName = getTokenWithPos $ \t -> case t of L pos (TokSym x) -> Just (mkName pos x); _ -> Nothing
tokName' = getTokenWithPos $ \t -> case t of L pos (TokSym x) -> Just (SecArgName pos x); _ -> Nothing
tokSym = getTokenWithPos $ \t -> case t of L pos (TokSym x) -> Just (mkName pos x); _ -> Nothing
tokSym' = getTokenWithPos $ \t -> case t of L pos (TokSym x) -> Just (SecArgName pos x); _ -> Nothing
tokStr = getTokenWithPos $ \t -> case t of L pos (TokStr x) -> Just (SecArgStr pos x); _ -> Nothing
tokNum = getTokenWithPos $ \t -> case t of L pos (TokNum x) -> Just (SecArgNum pos x); _ -> Nothing
tokOther = getTokenWithPos $ \t -> case t of L pos (TokOther x) -> Just (SecArgOther pos x); _ -> Nothing
tokIndent = getToken $ \t -> case t of Indent x -> Just x; _ -> Nothing
tokColon = getToken $ \t -> case t of Colon -> Just (); _ -> Nothing
......@@ -123,11 +121,10 @@ tokFieldLine = getTokenWithPos $ \t -> case t of L pos (TokFieldLine s) -> Just
colon, openBrace, closeBrace :: Parser ()
sectionArg :: Parser (SectionArg Position)
sectionArg = tokName' <|> tokStr
<|> tokNum <|> tokOther <?> "section parameter"
sectionArg = tokSym' <|> tokStr <|> tokOther <?> "section parameter"
fieldSecName :: Parser (Name Position)
fieldSecName = tokName <?> "field or section name"
fieldSecName = tokSym <?> "field or section name"
colon = tokColon <?> "\":\""
openBrace = tokOpenBrace <?> "\"{\""
......
......@@ -46,11 +46,9 @@ data FieldLine ann = FieldLine !ann !ByteString
-- | Section arguments, e.g. name of the library
data SectionArg ann
= SecArgName !ann !ByteString
-- ^ identifier
| SecArgStr !ann !String
-- ^ identifier, or omething which loos like number. Also many dot numbers, i.e. "7.6.3"
| SecArgStr !ann !ByteString
-- ^ quoted string
| SecArgNum !ann !ByteString
-- ^ Something which loos like number. Also many dot numbers, i.e. "7.6.3"
| SecArgOther !ann !ByteString
-- ^ everything else, mm. operators (e.g. in if-section conditionals)
deriving (Eq, Show, Functor)
......@@ -59,7 +57,6 @@ data SectionArg ann
sectionArgAnn :: SectionArg ann -> ann
sectionArgAnn (SecArgName ann _) = ann
sectionArgAnn (SecArgStr ann _) = ann
sectionArgAnn (SecArgNum ann _) = ann
sectionArgAnn (SecArgOther ann _) = ann
-------------------------------------------------------------------------------
......
......@@ -48,8 +48,8 @@ warningTest wt fp = testCase (show wt) $ do
let res = parseGenericPackageDescription contents
let (warns, errs, x) = runParseResult res
assertBool "parses successfully" $ isJust x
assertBool "parses without errors" $ null errs
assertBool ("should parse successfully: " ++ show errs) $ isJust x
assertBool ("should parse without errors: " ++ show errs) $ null errs
case warns of
[PWarning wt' _ _] -> assertEqual "warning type" wt wt'
......
......@@ -7,7 +7,7 @@ all : exe lib
lexer : $(LEXER_HS)
$(LEXER_HS) : boot/Lexer.x
alex --ghc -o $@ $^
alex --latin1 --ghc -o $@ $^
lib : $(LEXER_HS)
cabal new-build --enable-tests Cabal
......
......@@ -54,15 +54,12 @@ import qualified Data.Text.Encoding.Error as T
-- Various character classes
$space = \ -- single space char
$digit = 0-9 -- digits
$alpha = [a-z A-Z] -- alphabetic characters
$symbol = [\= \< \> \+ \* \- \& \| \! \$ \% \^ \@ \# \? \/ \\ \~]
$ctlchar = [\x0-\x1f \x7f]
$printable = \x0-\x10ffff # $ctlchar -- so no \n \r
$nbsp = \xa0
$printable = \x0-\xff # $ctlchar -- so no \n \r
$symbol' = [ \, \= \< \> \+ \* \& \| \! \$ \% \^ \@ \# \? \/ \\ \~ ]
$symbol = [$symbol' \- \.]
$spacetab = [$space \t]
$bom = \xfeff
$nbspspacetab = [$nbsp $space \t]
$paren = [ \( \) \[ \] ]
$field_layout = [$printable \t]
......@@ -70,16 +67,18 @@ $field_layout' = [$printable] # [$space]
$field_braces = [$printable \t] # [\{ \}]
$field_braces' = [$printable] # [\{ \} $space]
$comment = [$printable \t]
$namecore = [$alpha]
$nameextra = [$namecore $digit \- \_ \. \']
$namecore = [$printable] # [$space \: \" \{ \} $paren $symbol']
$instr = [$printable $space] # [\"]
$instresc = $printable
@nl = \n | \r\n | \r
@name = $nameextra* $namecore $nameextra*
@string = \" ( $instr | \\ $instresc )* \"
@numlike = $digit [$digit \.]*
@oplike = [ \, \. \= \< \> \+ \* \- \& \| \! \$ \% \^ \@ \# \? \/ \\ \~ ]+
@bom = \xef \xbb \xbf
@nbsp = \xc2 \xa0
@nbspspacetab = ($spacetab | @nbsp)
@nl = \n | \r\n | \r
@name = $namecore+
@string = \" ( $instr | \\ $instresc )* \"
@oplike = $symbol+
tokens :-
......@@ -89,14 +88,14 @@ tokens :-
}
<bol_section, bol_field_layout, bol_field_braces> {
$nbspspacetab* @nl { \_pos len inp -> checkWhitespace len inp >> adjustPos retPos >> lexToken }
@nbspspacetab* @nl { \_pos len inp -> checkWhitespace len inp >> adjustPos retPos >> lexToken }
-- no @nl here to allow for comments on last line of the file with no trailing \n
$spacetab* "--" $comment* ; -- TODO: check the lack of @nl works here
-- including counting line numbers
}
<bol_section> {
$nbspspacetab* --TODO prevent or record leading tabs
@nbspspacetab* --TODO prevent or record leading tabs
{ \pos len inp -> checkWhitespace len inp >>
if B.length inp == len
then return (L pos EOF)
......@@ -112,10 +111,7 @@ tokens :-
"--" $comment* ;
@name { toki TokSym }
@string { \p l i -> case reads (B.Char8.unpack (B.take l i)) of
[(str,[])] -> return (L p (TokStr str))
_ -> lexicalError p i }
@numlike { toki TokNum }
@string { \pos len inp -> return $! L pos (TokStr (B.take (len - 2) (B.tail inp))) }
@oplike { toki TokOther }
$paren { toki TokOther }
\: { tok Colon }
......@@ -125,7 +121,7 @@ tokens :-
}
<bol_field_layout> {
$nbspspacetab* --TODO prevent or record leading tabs
@nbspspacetab* --TODO prevent or record leading tabs
{ \pos len inp -> checkWhitespace len inp >>= \len' ->
if B.length inp == len
then return (L pos EOF)
......@@ -154,10 +150,9 @@ tokens :-
{
-- | Tokens of outer cabal file structure. Field values are treated opaquely.
data Token = TokSym !ByteString -- ^ Haskell-like identifier
| TokStr !String -- ^ String in quotes
| TokNum !ByteString -- ^ Integral
| TokOther !ByteString -- ^ Operator like token
data Token = TokSym !ByteString -- ^ Haskell-like identifier, number or operator
| TokStr !ByteString -- ^ String in quotes
| TokOther !ByteString -- ^ Operators and parens
| Indent !Int -- ^ Indentation token
| TokFieldLine !ByteString -- ^ Lines after @:@
| Colon
......@@ -170,10 +165,10 @@ data Token = TokSym !ByteString -- ^ Haskell-like identifier
data LToken = L !Position !Token
deriving Show
toki :: Monad m => (ByteString -> Token) -> Position -> Int -> ByteString -> m LToken
toki :: (ByteString -> Token) -> Position -> Int -> ByteString -> Lex LToken
toki t pos len input = return $! L pos (t (B.take len input))
tok :: Monad m => Token -> Position -> t -> t1 -> m LToken
tok :: Token -> Position -> Int -> ByteString -> Lex LToken
tok t pos _len _input = return $! L pos t
checkWhitespace :: Int -> ByteString -> Lex Int
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment