From 358e0a8d4cb49baa29cf6b001eaa9d4ac428bb2d Mon Sep 17 00:00:00 2001
From: Thomas Miedema <thomasmiedema@gmail.com>
Date: Fri, 3 Jul 2015 22:37:18 +0200
Subject: [PATCH] parser: Allow Lm (MODIFIER LETTER) category in identifiers

Easy fix in the parser to stop regressions, due to Unicode 7.0 changing
the classification of some prior code points.

Signed-off-by: Austin Seipp <austin@well-typed.com>

Test Plan: `tests/parser/should_compile/T10196.hs`

Reviewers: hvr, austin, bgamari

Reviewed By: austin, bgamari

Subscribers: thomie, bgamari

Differential Revision: https://phabricator.haskell.org/D969

GHC Trac Issues: #10196
---
 compiler/basicTypes/Lexeme.hs                      | 11 ++++++++++-
 compiler/parser/Lexer.x                            |  8 ++++++--
 compiler/utils/Util.hs                             | 14 ++++++++++++++
 testsuite/tests/parser/should_compile/T10196.hs    | 13 +++++++++++++
 testsuite/tests/parser/should_compile/all.T        |  1 +
 testsuite/tests/parser/should_fail/T10196Fail1.hs  |  4 ++++
 .../tests/parser/should_fail/T10196Fail1.stderr    |  2 ++
 testsuite/tests/parser/should_fail/T10196Fail2.hs  |  4 ++++
 .../tests/parser/should_fail/T10196Fail2.stderr    |  2 ++
 testsuite/tests/parser/should_fail/T10196Fail3.hs  |  6 ++++++
 .../tests/parser/should_fail/T10196Fail3.stderr    |  2 ++
 testsuite/tests/parser/should_fail/all.T           |  3 +++
 12 files changed, 67 insertions(+), 3 deletions(-)
 create mode 100644 testsuite/tests/parser/should_compile/T10196.hs
 create mode 100644 testsuite/tests/parser/should_fail/T10196Fail1.hs
 create mode 100644 testsuite/tests/parser/should_fail/T10196Fail1.stderr
 create mode 100644 testsuite/tests/parser/should_fail/T10196Fail2.hs
 create mode 100644 testsuite/tests/parser/should_fail/T10196Fail2.stderr
 create mode 100644 testsuite/tests/parser/should_fail/T10196Fail3.hs
 create mode 100644 testsuite/tests/parser/should_fail/T10196Fail3.stderr

diff --git a/compiler/basicTypes/Lexeme.hs b/compiler/basicTypes/Lexeme.hs
index c5bda4dbdd36..a2409614d1f4 100644
--- a/compiler/basicTypes/Lexeme.hs
+++ b/compiler/basicTypes/Lexeme.hs
@@ -28,6 +28,7 @@ module Lexeme (
   ) where
 
 import FastString
+import Util ((<||>))
 
 import Data.Char
 import qualified Data.Set as Set
@@ -194,7 +195,8 @@ okConSymOcc str = all okSymChar str &&
 -- but not worrying about case or clashing with reserved words?
 okIdOcc :: String -> Bool
 okIdOcc str
-  = let hashes = dropWhile okIdChar str in
+  -- TODO. #10196. Only allow modifier letters in the suffix of an identifier.
+  = let hashes = dropWhile (okIdChar <||> okIdSuffixChar) str in
     all (== '#') hashes   -- -XMagicHash allows a suffix of hashes
                           -- of course, `all` says "True" to an empty list
 
@@ -210,6 +212,13 @@ okIdChar c = case generalCategory c of
   OtherNumber     -> True
   _               -> c == '\'' || c == '_'
 
+-- | Is this character acceptable in the suffix of an identifier.
+-- See alexGetByte in Lexer.x
+okIdSuffixChar :: Char -> Bool
+okIdSuffixChar c = case generalCategory c of
+  ModifierLetter  -> True  -- See #10196
+  _               -> False
+
 -- | Is this character acceptable in a symbol (after the first char)?
 -- See alexGetByte in Lexer.x
 okSymChar :: Char -> Bool
diff --git a/compiler/parser/Lexer.x b/compiler/parser/Lexer.x
index 432686e5af32..7fbbd132fc25 100644
--- a/compiler/parser/Lexer.x
+++ b/compiler/parser/Lexer.x
@@ -156,7 +156,10 @@ $graphic   = [$small $large $symbol $digit $special $unigraphic \"\']
 $binit     = 0-1
 $octit     = 0-7
 $hexit     = [$decdigit A-F a-f]
-$idchar    = [$small $large $digit \']
+
+$suffix    = \x07 -- Trick Alex into handling Unicode. See alexGetByte.
+-- TODO #10196. Only allow modifier letters in the suffix of an identifier.
+$idchar    = [$small $large $digit $suffix \']
 
 $pragmachar = [$small $large $digit]
 
@@ -1835,6 +1838,7 @@ alexGetByte (AI loc s)
         symbol          = '\x04'
         space           = '\x05'
         other_graphic   = '\x06'
+        suffix          = '\x07'
 
         adj_c
           | c <= '\x06' = non_graphic
@@ -1851,7 +1855,7 @@ alexGetByte (AI loc s)
                   UppercaseLetter       -> upper
                   LowercaseLetter       -> lower
                   TitlecaseLetter       -> upper
-                  ModifierLetter        -> other_graphic
+                  ModifierLetter        -> suffix -- see #10196
                   OtherLetter           -> lower -- see #1103
                   NonSpacingMark        -> other_graphic
                   SpacingCombiningMark  -> other_graphic
diff --git a/compiler/utils/Util.hs b/compiler/utils/Util.hs
index aa3a19b64c27..b2492f169fd5 100644
--- a/compiler/utils/Util.hs
+++ b/compiler/utils/Util.hs
@@ -55,6 +55,7 @@ module Util (
         isEqual, eqListBy, eqMaybeBy,
         thenCmp, cmpList,
         removeSpaces,
+        (<&&>), (<||>),
 
         -- * Edit distance
         fuzzyMatch, fuzzyLookup,
@@ -115,6 +116,10 @@ import Data.List        hiding (group)
 import FastTypes
 #endif
 
+#if __GLASGOW_HASKELL__ < 709
+import Control.Applicative (Applicative)
+#endif
+import Control.Applicative ( liftA2 )
 import Control.Monad    ( liftM )
 import System.IO.Error as IO ( isDoesNotExistError )
 import System.Directory ( doesDirectoryExist, getModificationTime )
@@ -653,6 +658,15 @@ cmpList cmp (a:as) (b:bs)
 removeSpaces :: String -> String
 removeSpaces = dropWhileEndLE isSpace . dropWhile isSpace
 
+-- Boolean operators lifted to Applicative
+(<&&>) :: Applicative f => f Bool -> f Bool -> f Bool
+(<&&>) = liftA2 (&&)
+infixr 3 <&&> -- same as (&&)
+
+(<||>) :: Applicative f => f Bool -> f Bool -> f Bool
+(<||>) = liftA2 (||)
+infixr 2 <||> -- same as (||)
+
 {-
 ************************************************************************
 *                                                                      *
diff --git a/testsuite/tests/parser/should_compile/T10196.hs b/testsuite/tests/parser/should_compile/T10196.hs
new file mode 100644
index 000000000000..f80911829b33
--- /dev/null
+++ b/testsuite/tests/parser/should_compile/T10196.hs
@@ -0,0 +1,13 @@
+module T10196 where
+
+data X = Xᵦ | Xᵤ | Xᵩ | Xᵢ | Xᵪ | Xᵣ
+
+f :: Int
+f =
+  let xᵦ = 1
+      xᵤ = xᵦ
+      xᵩ = xᵤ
+      xᵢ = xᵩ
+      xᵪ = xᵢ
+      xᵣ = xᵪ
+  in xáµ£
diff --git a/testsuite/tests/parser/should_compile/all.T b/testsuite/tests/parser/should_compile/all.T
index 13acedf01413..0bff4085c7fc 100644
--- a/testsuite/tests/parser/should_compile/all.T
+++ b/testsuite/tests/parser/should_compile/all.T
@@ -98,3 +98,4 @@ test('T7118', normal, compile, [''])
 test('T7776', normal, compile, [''])
 test('RdrNoStaticPointers01', when(compiler_lt('ghc', '7.9'), skip), compile, [''])
 test('T5682', normal, compile, [''])
+test('T10196', normal, compile, [''])
diff --git a/testsuite/tests/parser/should_fail/T10196Fail1.hs b/testsuite/tests/parser/should_fail/T10196Fail1.hs
new file mode 100644
index 000000000000..2f1c8f39ea3a
--- /dev/null
+++ b/testsuite/tests/parser/should_fail/T10196Fail1.hs
@@ -0,0 +1,4 @@
+module T10196Fail1 where
+
+-- Constructors are not allowed to start with a modifier letter.
+data Foo = ᵦfoo
diff --git a/testsuite/tests/parser/should_fail/T10196Fail1.stderr b/testsuite/tests/parser/should_fail/T10196Fail1.stderr
new file mode 100644
index 000000000000..3c4a173eefd4
--- /dev/null
+++ b/testsuite/tests/parser/should_fail/T10196Fail1.stderr
@@ -0,0 +1,2 @@
+
+T10196Fail1.hs:4:12: error: lexical error at character '\7526'
diff --git a/testsuite/tests/parser/should_fail/T10196Fail2.hs b/testsuite/tests/parser/should_fail/T10196Fail2.hs
new file mode 100644
index 000000000000..64b3cacd6210
--- /dev/null
+++ b/testsuite/tests/parser/should_fail/T10196Fail2.hs
@@ -0,0 +1,4 @@
+module T10196Fail2 where
+
+-- Variables are not allowed to start with a modifier letter.
+ᵦ = 1
diff --git a/testsuite/tests/parser/should_fail/T10196Fail2.stderr b/testsuite/tests/parser/should_fail/T10196Fail2.stderr
new file mode 100644
index 000000000000..abba8aa04c01
--- /dev/null
+++ b/testsuite/tests/parser/should_fail/T10196Fail2.stderr
@@ -0,0 +1,2 @@
+
+T10196Fail2.hs:4:1: error: lexical error at character '\7526'
diff --git a/testsuite/tests/parser/should_fail/T10196Fail3.hs b/testsuite/tests/parser/should_fail/T10196Fail3.hs
new file mode 100644
index 000000000000..09b80ddeff01
--- /dev/null
+++ b/testsuite/tests/parser/should_fail/T10196Fail3.hs
@@ -0,0 +1,6 @@
+module T10196Fail3 where
+
+-- Modifier letters are not allowed in the middle of an identifier.
+-- And this should not be lexed as 2 separate identifiers either.
+xᵦx :: Int
+xᵦx = 1
diff --git a/testsuite/tests/parser/should_fail/T10196Fail3.stderr b/testsuite/tests/parser/should_fail/T10196Fail3.stderr
new file mode 100644
index 000000000000..64037440e23a
--- /dev/null
+++ b/testsuite/tests/parser/should_fail/T10196Fail3.stderr
@@ -0,0 +1,2 @@
+
+T10196Fail3.hs:5:2: error: lexical error at character '/7526'
diff --git a/testsuite/tests/parser/should_fail/all.T b/testsuite/tests/parser/should_fail/all.T
index 7e286cf3f23b..26d576757a5a 100644
--- a/testsuite/tests/parser/should_fail/all.T
+++ b/testsuite/tests/parser/should_fail/all.T
@@ -86,3 +86,6 @@ test('ExportCommaComma', normal, compile_fail, [''])
 test('T8430', literate, compile_fail, [''])
 test('T8431', [timeout_multiplier(0.05)], compile_fail, ['-XAlternativeLayoutRule'])
 test('T8506', normal, compile_fail, [''])
+test('T10196Fail1', normal, compile_fail, [''])
+test('T10196Fail2', normal, compile_fail, [''])
+test('T10196Fail3', expect_broken(10196), compile_fail, [''])
-- 
GitLab