-- (c) The GHC Team
--
-- Functions to evaluate whether or not a string is a valid identifier.
-- There is considerable overlap between the logic here and the logic
-- in Lexer.x, but sadly there seems to be no way to merge them.

module Lexeme (
          -- * Lexical characteristics of Haskell names

          -- | Use these functions to figure out what kind of name a
          -- 'FastString' represents; these functions do /not/ check that
          -- the identifier is valid.

        isLexCon, isLexVar, isLexId, isLexSym,
        isLexConId, isLexConSym, isLexVarId, isLexVarSym,
        startsVarSym, startsVarId, startsConSym, startsConId,

          -- * Validating identifiers

          -- | These functions (working over plain old 'String's) check
          -- to make sure that the identifier is valid.
        okVarOcc, okConOcc, okTcOcc,
        okVarIdOcc, okVarSymOcc, okConIdOcc, okConSymOcc

        -- Some of the exports above are not used within GHC, but may
        -- be of value to GHC API users.

  ) where

import GhcPrelude

import FastString

import Data.Char
import qualified Data.Set as Set

import GHC.Lexeme

{-

************************************************************************
*                                                                      *
    Lexical categories
*                                                                      *
************************************************************************

These functions test strings to see if they fit the lexical categories
defined in the Haskell report.

Note [Classification of generated names]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Some names generated for internal use can show up in debugging output,
e.g.  when using -ddump-simpl. These generated names start with a $
but should still be pretty-printed using prefix notation. We make sure
this is the case in isLexVarSym by only classifying a name as a symbol
if all its characters are symbols, not just its first one.
-}

isLexCon,   isLexVar,    isLexId,    isLexSym    :: FastString -> Bool
isLexConId, isLexConSym, isLexVarId, isLexVarSym :: FastString -> Bool

-- Constructor names, whether alphanumeric or symbolic.
isLexCon name = isLexConId name || isLexConSym name

-- Variable names, whether alphanumeric or symbolic.
isLexVar name = isLexVarId name || isLexVarSym name

-- Alphanumeric (prefix) names, whether constructor or variable.
isLexId name = isLexConId name || isLexVarId name

-- Symbolic (infix) names, whether constructor or variable.
isLexSym name = isLexConSym name || isLexVarSym name

-------------
-- Prefix type or data constructors, e.g. "Foo", "[]", "(,)".
-- The empty string is rejected, "[]" is accepted outright, and every other
-- name is classified by its first character alone.
isLexConId name
  | nullFS name = False
  | otherwise   = name == fsLit "[]" || startsConId (headFS name)

-- Ordinary prefix identifiers, e.g. "x", "_x".
-- Only the first character is inspected; the empty string is rejected.
isLexVarId name = not (nullFS name) && startsVarId (headFS name)

-- Infix type or data constructors, e.g. ":-:", ":", "->".
-- The empty string is rejected, "->" is accepted outright, and every other
-- name is classified by its first character alone.
isLexConSym name
  | nullFS name = False
  | otherwise   = name == fsLit "->" || startsConSym (headFS name)

-- Infix identifiers, e.g. "+".
-- "~R#" is accepted outright. Otherwise the name must be non-empty, start
-- with a variable-symbol character, and consist entirely of symbol
-- characters; see Note [Classification of generated names].
isLexVarSym fs
  | fs == fsLit "~R#" = True
  | nullFS fs         = False
  | otherwise         = allSymbolic (unpackFS fs)
  where
    allSymbolic []            = False
    allSymbolic (hd : rest)   = startsVarSym hd && all isVarSymChar rest

{-

************************************************************************
*                                                                      *
    Detecting valid names for Template Haskell
*                                                                      *
************************************************************************

-}

----------------------
-- External interface
----------------------

-- | Is this an acceptable variable name?
--
-- The first character decides whether the name is validated as an
-- alphanumeric identifier or as an operator; the empty string and any
-- other leading character are rejected.
okVarOcc :: String -> Bool
okVarOcc [] = False
okVarOcc name@(c : _)
  | startsVarId  c = okVarIdOcc  name
  | startsVarSym c = okVarSymOcc name
  | otherwise      = False

-- | Is this an acceptable constructor name?
--
-- The first character decides whether the name is validated as an
-- alphanumeric or as a symbolic constructor; the list syntax @[]@ is
-- accepted as a special case.
okConOcc :: String -> Bool
okConOcc [] = False
okConOcc name@(c : _)
  | startsConId  c = okConIdOcc  name
  | startsConSym c = okConSymOcc name
  | otherwise      = name == "[]"

-- | Is this an acceptable type name?
--
-- The built-in type syntax @[]@, @->@ and @~@ is always accepted. Any other
-- name is validated according to its first character: as an alphanumeric
-- constructor, a symbolic constructor, or a symbolic variable (operator).
okTcOcc :: String -> Bool
okTcOcc name
  | name `elem` ["[]", "->", "~"] = True
  | otherwise = case name of
      c : _
        | startsConId  c -> okConIdOcc  name
        | startsConSym c -> okConSymOcc name
        | startsVarSym c -> okVarSymOcc name
      _ -> False

-- | Is this an acceptable alphanumeric variable name, assuming it starts
-- with an acceptable letter?
--
-- The name must have the shape of an identifier (see 'okIdOcc') and must
-- not be a reserved identifier.
okVarIdOcc :: String -> Bool
okVarIdOcc str = okIdOcc str &&
                 -- admit "_" as a valid identifier.  Required to support typed
                 -- holes in Template Haskell.  See #10267
                 (str == "_" || str `Set.notMember` reservedIds)

-- | Is this an acceptable symbolic variable name, assuming it starts
-- with an acceptable character?
--
-- Every character must be a symbol character, the name must not be a
-- reserved operator, and it must not consist solely of two or more dashes.
okVarSymOcc :: String -> Bool
okVarSymOcc str
  | not (all okSymChar str)      = False
  | str `Set.member` reservedOps = False
  | otherwise                    = not (isDashes str)

-- | Is this an acceptable alphanumeric constructor name, assuming it
-- starts with an acceptable letter?
okConIdOcc :: String -> Bool
okConIdOcc str = okIdOcc str ||
                 is_tuple_name1 True  str ||
                   -- Is it a boxed tuple...
                 is_tuple_name1 False str ||
                   -- ...or an unboxed tuple (#12407)...
                 is_sum_name1 str
                   -- ...or an unboxed sum (#12514)?
  where
    -- check for tuple name, starting at the beginning
    -- The Bool is True for boxed tuples "(...)" and False for unboxed
    -- tuples "(#...#)".
    is_tuple_name1 True  ('(' : rest)       = is_tuple_name2 True  rest
    is_tuple_name1 False ('(' : '#' : rest) = is_tuple_name2 False rest
    is_tuple_name1 _     _                  = False

    -- check for tuple tail
    -- Only commas and whitespace may appear before the closing bracket,
    -- and the closing bracket must match the boxity.
    is_tuple_name2 True  ")"          = True
    is_tuple_name2 False "#)"         = True
    is_tuple_name2 boxed (',' : rest) = is_tuple_name2 boxed rest
    is_tuple_name2 boxed (ws  : rest)
      | isSpace ws                    = is_tuple_name2 boxed rest
    is_tuple_name2 _     _            = False

    -- check for sum name, starting at the beginning
    -- The Bool passed on records whether an underscore has been seen yet.
    is_sum_name1 ('(' : '#' : rest) = is_sum_name2 False rest
    is_sum_name1 _                  = False

    -- check for sum tail, only allowing at most one underscore
    -- A second underscore matches none of the accepting equations and so
    -- falls through to the final, rejecting one.
    is_sum_name2 _          "#)"         = True
    is_sum_name2 underscore ('|' : rest) = is_sum_name2 underscore rest
    is_sum_name2 False      ('_' : rest) = is_sum_name2 True rest
    is_sum_name2 underscore (ws  : rest)
      | isSpace ws                       = is_sum_name2 underscore rest
    is_sum_name2 _          _            = False

-- | Is this an acceptable symbolic constructor name, assuming it
-- starts with an acceptable character?
okConSymOcc :: String -> Bool
-- ":" is listed in 'reservedOps', so it has to be admitted explicitly.
okConSymOcc ":" = True
okConSymOcc str = all okSymChar str &&
                  not (str `Set.member` reservedOps)

----------------------
-- Internal functions
----------------------

-- | Is this string an acceptable id, possibly with a suffix of hashes,
-- but not worrying about case or clashing with reserved words?
okIdOcc :: String -> Bool
okIdOcc str
  = let hashes = dropWhile okIdChar str in
    all (== '#') hashes   -- -XMagicHash allows a suffix of hashes
                          -- of course, `all` says "True" to an empty list

-- | Is this character acceptable in an identifier (after the first letter)?
-- See alexGetByte in Lexer.x
--
-- Letters, non-spacing marks and decimal/other numbers are accepted by
-- Unicode general category; apart from those, only the apostrophe and the
-- underscore are allowed.
okIdChar :: Char -> Bool
okIdChar c = case generalCategory c of
  UppercaseLetter -> True
  LowercaseLetter -> True
  TitlecaseLetter -> True
  ModifierLetter  -> True -- See #10196
  OtherLetter     -> True -- See #1103
  NonSpacingMark  -> True -- See #7650
  DecimalNumber   -> True
  OtherNumber     -> True -- See #4373
  _               -> c == '\'' || c == '_'

-- | All reserved identifiers. Taken from section 2.4 of the 2010 Report.
reservedIds :: Set.Set String
reservedIds = Set.fromList
  [ "case", "class", "data", "default"
  , "deriving", "do", "else", "foreign"
  , "if", "import", "in", "infix"
  , "infixl", "infixr", "instance", "let"
  , "module", "newtype", "of", "then"
  , "type", "where"
  , "_"
  ]

-- | All reserved operators. Taken from section 2.4 of the 2010 Report.
reservedOps :: Set.Set String
reservedOps = Set.fromList
  [ "..", ":", "::", "="
  , "\\", "|", "<-", "->"
  , "@", "~", "=>"
  ]

-- | Does this string contain only dashes and has at least 2 of them?
isDashes :: String -> Bool
isDashes str = hasAtLeastTwo && all (== '-') str
  where
    -- True exactly when the string has a second character.
    hasAtLeastTwo = not (null (drop 1 str))