Use alex in latin1 mode

At the moment we have very lax "name" lexer, which is essentially *everything else* than otherwise specified characters: parens, space, symbols (but not hyphen -) This removes `TokNum` and `SecArgNum`, both are merged into `TokSym` and `SecArgSym` respectively.

Use alex in latin1 mode
At the moment we have very lax "name" lexer, which is essentially *everything else* than otherwise specified characters: parens, space, symbols (but not hyphen -) This removes `TokNum` and `SecArgNum`, both are merged into `TokSym` and `SecArgSym` respectively.
541c415f · Oleg Grenrus · e98e16e9 · 541c415f · 541c415f · 541c415f
Commit 541c415f authored 7 years ago by Oleg Grenrus
--- a/Cabal/Distribution/PackageDescription/Parsec.hs
+++ b/Cabal/Distribution/PackageDescription/Parsec.hs
@@ -338,7 +338,7 @@ parseName pos args = case args of
    [SecArgName _pos secName] ->
         pure $ fromUTF8BS secName
    [SecArgStr _pos secName] ->
-         pure secName
+         pure $ fromUTF8BS secName
    [] -> do
         parseFailure pos $ "name required"
         pure ""

--- a/Cabal/Distribution/Parsec/ConfVar.hs
+++ b/Cabal/Distribution/Parsec/ConfVar.hs
@@ -91,7 +91,6 @@ parser = condOr
    -- Number token can have many dots in it: SecArgNum (Position 65 15) "7.6.1"
    ident = tokenPrim $ \t -> case t of
        SecArgName _ s -> Just $ fromUTF8BS s
-        SecArgNum  _ s -> Just $ fromUTF8BS s
        _              -> Nothing

    boolLiteral' = tokenPrim $ \t -> case t of

--- a/Cabal/Distribution/Parsec/Lexer.hs
+++ b/Cabal/Distribution/Parsec/Lexer.hs
--- a/Cabal/Distribution/Parsec/Parser.hs
+++ b/Cabal/Distribution/Parsec/Parser.hs
@@ -90,10 +90,9 @@ getTokenWithPos getTok = tokenPrim (\(L _ t) -> describeToken t) updatePos getTo

 describeToken :: Token -> String
 describeToken t = case t of
-  TokSym   s      -> "name "   ++ show s
-  TokStr   s      -> "string " ++ show s
-  TokNum   s      -> "number " ++ show s
-  TokOther s      -> "symbol " ++ show s
+  TokSym   s      -> "symbol "   ++ show s
+  TokStr   s      -> "string "   ++ show s
+  TokOther s      -> "operator " ++ show s
  Indent _        -> "new line"
  TokFieldLine _  -> "field content"
  Colon           -> "\":\""
@@ -103,16 +102,15 @@ describeToken t = case t of
  EOF             -> "end of file"
  LexicalError is -> "character in input " ++ show (B8.head is)

-tokName :: Parser (Name Position)
-tokName', tokStr, tokNum, tokOther :: Parser (SectionArg Position)
+tokSym :: Parser (Name Position)
+tokSym', tokStr, tokOther :: Parser (SectionArg Position)
 tokIndent :: Parser Int
 tokColon, tokOpenBrace, tokCloseBrace :: Parser ()
 tokFieldLine :: Parser (FieldLine Position)

-tokName       = getTokenWithPos $ \t -> case t of L pos (TokSym x) -> Just (mkName pos x);  _ -> Nothing
-tokName'      = getTokenWithPos $ \t -> case t of L pos (TokSym x) -> Just (SecArgName pos x);  _ -> Nothing
+tokSym        = getTokenWithPos $ \t -> case t of L pos (TokSym   x) -> Just (mkName pos x);  _ -> Nothing
+tokSym'       = getTokenWithPos $ \t -> case t of L pos (TokSym   x) -> Just (SecArgName pos x);  _ -> Nothing
 tokStr        = getTokenWithPos $ \t -> case t of L pos (TokStr   x) -> Just (SecArgStr pos x);  _ -> Nothing
-tokNum        = getTokenWithPos $ \t -> case t of L pos (TokNum   x) -> Just (SecArgNum pos x);  _ -> Nothing
 tokOther      = getTokenWithPos $ \t -> case t of L pos (TokOther x) -> Just (SecArgOther pos x);  _ -> Nothing
 tokIndent     = getToken $ \t -> case t of Indent   x -> Just x;  _ -> Nothing
 tokColon      = getToken $ \t -> case t of Colon      -> Just (); _ -> Nothing
@@ -123,11 +121,10 @@ tokFieldLine  = getTokenWithPos $ \t -> case t of L pos (TokFieldLine s) -> Just
 colon, openBrace, closeBrace :: Parser ()

 sectionArg :: Parser (SectionArg Position)
-sectionArg   = tokName' <|> tokStr
-            <|> tokNum <|> tokOther <?> "section parameter"
+sectionArg   = tokSym' <|> tokStr <|> tokOther <?> "section parameter"

 fieldSecName :: Parser (Name Position)
-fieldSecName = tokName              <?> "field or section name"
+fieldSecName = tokSym              <?> "field or section name"

 colon        = tokColon      <?> "\":\""
 openBrace    = tokOpenBrace  <?> "\"{\""

--- a/Cabal/Distribution/Parsec/Types/Field.hs
+++ b/Cabal/Distribution/Parsec/Types/Field.hs
@@ -46,11 +46,9 @@ data FieldLine ann  = FieldLine  !ann !ByteString
 -- | Section arguments, e.g. name of the library
 data SectionArg ann
    = SecArgName  !ann !ByteString
-      -- ^ identifier
-    | SecArgStr   !ann !String
+      -- ^ identifier, or omething which loos like number. Also many dot numbers, i.e. "7.6.3"
+    | SecArgStr   !ann !ByteString
      -- ^ quoted string
-    | SecArgNum   !ann !ByteString
-      -- ^ Something which loos like number. Also many dot numbers, i.e. "7.6.3"
    | SecArgOther !ann !ByteString
      -- ^ everything else, mm. operators (e.g. in if-section conditionals)
  deriving (Eq, Show, Functor)
@@ -59,7 +57,6 @@ data SectionArg ann
 sectionArgAnn :: SectionArg ann -> ann
 sectionArgAnn (SecArgName ann _)  = ann
 sectionArgAnn (SecArgStr ann _)   = ann
-sectionArgAnn (SecArgNum ann _)   = ann
 sectionArgAnn (SecArgOther ann _) = ann

 -------------------------------------------------------------------------------

--- a/Cabal/tests/ParserTests.hs
+++ b/Cabal/tests/ParserTests.hs
@@ -48,8 +48,8 @@ warningTest wt fp = testCase (show wt) $ do
    let res =  parseGenericPackageDescription contents
    let (warns, errs, x) = runParseResult res

-    assertBool "parses successfully" $ isJust x
-    assertBool "parses without errors" $ null errs
+    assertBool ("should parse successfully: " ++ show errs) $ isJust x
+    assertBool ("should parse without errors:  " ++  show errs) $ null errs

    case warns of
        [PWarning wt' _ _] -> assertEqual "warning type" wt wt'

--- a/Makefile
+++ b/Makefile
@@ -7,7 +7,7 @@ all : exe lib
 lexer : $(LEXER_HS)

 $(LEXER_HS) : boot/Lexer.x
-	alex --ghc -o $@ $^
+	alex --latin1 --ghc -o $@ $^

 lib : $(LEXER_HS)
 	cabal new-build --enable-tests Cabal

--- a/boot/Lexer.x
+++ b/boot/Lexer.x
@@ -54,15 +54,12 @@ import qualified Data.Text.Encoding.Error as T
 -- Various character classes

 $space           = \          -- single space char
-$digit           = 0-9        -- digits
-$alpha           = [a-z A-Z]  -- alphabetic characters
-$symbol          = [\= \< \> \+ \* \- \& \| \! \$ \% \^ \@ \# \? \/ \\ \~]
 $ctlchar         = [\x0-\x1f \x7f]
-$printable       = \x0-\x10ffff # $ctlchar   -- so no \n \r
-$nbsp            = \xa0
+$printable       = \x0-\xff # $ctlchar   -- so no \n \r
+$symbol'         = [ \, \= \< \> \+ \* \& \| \! \$ \% \^ \@ \# \? \/ \\ \~ ]
+$symbol          = [$symbol' \- \.]
 $spacetab        = [$space \t]
 $bom             = \xfeff
-$nbspspacetab    = [$nbsp $space \t]

 $paren           = [ \( \) \[ \] ]
 $field_layout    = [$printable \t]
@@ -70,16 +67,18 @@ $field_layout'   = [$printable] # [$space]
 $field_braces    = [$printable \t] # [\{ \}]
 $field_braces'   = [$printable] # [\{ \} $space]
 $comment         = [$printable \t]
-$namecore        = [$alpha]
-$nameextra       = [$namecore $digit \- \_ \. \']
+$namecore        = [$printable] # [$space \: \" \{ \} $paren $symbol']
 $instr           = [$printable $space] # [\"]
 $instresc        = $printable

-@nl          = \n | \r\n | \r
-@name        = $nameextra* $namecore $nameextra*
-@string      = \" ( $instr | \\ $instresc )* \"
-@numlike     = $digit [$digit \.]*
-@oplike      = [ \, \. \= \< \> \+ \* \- \& \| \! \$ \% \^ \@ \# \? \/ \\ \~ ]+
+@bom          = \xef \xbb \xbf
+@nbsp         = \xc2 \xa0
+@nbspspacetab = ($spacetab | @nbsp)
+@nl           = \n | \r\n | \r
+@name         = $namecore+
+@string       = \" ( $instr | \\ $instresc )* \"
+@oplike       = $symbol+
+

 tokens :-

@@ -89,14 +88,14 @@ tokens :-
 }

 <bol_section, bol_field_layout, bol_field_braces> {
-  $nbspspacetab* @nl         { \_pos len inp -> checkWhitespace len inp >> adjustPos retPos >> lexToken }
+  @nbspspacetab* @nl         { \_pos len inp -> checkWhitespace len inp >> adjustPos retPos >> lexToken }
  -- no @nl here to allow for comments on last line of the file with no trailing \n
  $spacetab* "--" $comment*  ;  -- TODO: check the lack of @nl works here
                                -- including counting line numbers
 }

 <bol_section> {
-  $nbspspacetab*  --TODO prevent or record leading tabs
+  @nbspspacetab*  --TODO prevent or record leading tabs
                   { \pos len inp -> checkWhitespace len inp >>
                                     if B.length inp == len
                                       then return (L pos EOF)
@@ -112,10 +111,7 @@ tokens :-
  "--" $comment* ;

  @name        { toki TokSym }
-  @string      { \p l i -> case reads (B.Char8.unpack (B.take l i)) of
-                             [(str,[])] -> return (L p (TokStr str))
-                             _          -> lexicalError p i }
-  @numlike     { toki TokNum }
+  @string      { \pos len inp -> return $! L pos (TokStr (B.take (len - 2) (B.tail inp))) }
  @oplike      { toki TokOther }
  $paren       { toki TokOther }
  \:           { tok  Colon }
@@ -125,7 +121,7 @@ tokens :-
 }

 <bol_field_layout> {
-  $nbspspacetab*   --TODO prevent or record leading tabs
+  @nbspspacetab* --TODO prevent or record leading tabs
                { \pos len inp -> checkWhitespace len inp >>= \len' ->
                                  if B.length inp == len
                                    then return (L pos EOF)
@@ -154,10 +150,9 @@ tokens :-
 {

 -- | Tokens of outer cabal file structure. Field values are treated opaquely.
-data Token = TokSym   !ByteString       -- ^ Haskell-like identifier
-           | TokStr   !String           -- ^ String in quotes
-           | TokNum   !ByteString       -- ^ Integral
-           | TokOther !ByteString       -- ^ Operator like token
+data Token = TokSym   !ByteString       -- ^ Haskell-like identifier, number or operator
+           | TokStr   !ByteString       -- ^ String in quotes
+           | TokOther !ByteString       -- ^ Operators and parens
           | Indent   !Int              -- ^ Indentation token
           | TokFieldLine !ByteString   -- ^ Lines after @:@
           | Colon
@@ -170,10 +165,10 @@ data Token = TokSym   !ByteString       -- ^ Haskell-like identifier
 data LToken = L !Position !Token
  deriving Show

-toki :: Monad m => (ByteString -> Token) -> Position -> Int -> ByteString -> m LToken
+toki :: (ByteString -> Token) -> Position -> Int -> ByteString -> Lex LToken
 toki t pos  len  input = return $! L pos (t (B.take len input))

-tok :: Monad m => Token -> Position -> t -> t1 -> m LToken
+tok :: Token -> Position -> Int -> ByteString -> Lex LToken
 tok  t pos _len _input = return $! L pos t

 checkWhitespace :: Int -> ByteString -> Lex Int