From 7bd407a67cd7810d3ff1e6d18885555175383a35 Mon Sep 17 00:00:00 2001
From: Brandon Chinn <brandonchinn178@gmail.com>
Date: Tue, 15 Oct 2024 20:32:33 -0700
Subject: [PATCH] Fix CRLF in multiline strings (#25375)

---
 .gitattributes                                |  1 +
 compiler/GHC/Parser/String.hs                 | 14 +++++++
 docs/users_guide/exts/multiline_strings.rst   |  8 +++-
 testsuite/tests/parser/should_run/T25375.hs   | 38 +++++++++++++++++++
 .../tests/parser/should_run/T25375.stdout     |  5 +++
 testsuite/tests/parser/should_run/all.T       |  1 +
 6 files changed, 65 insertions(+), 2 deletions(-)
 create mode 100644 testsuite/tests/parser/should_run/T25375.hs
 create mode 100644 testsuite/tests/parser/should_run/T25375.stdout

diff --git a/.gitattributes b/.gitattributes
index aa7ab64d11d..454cd0e7dfa 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -2,3 +2,4 @@
 # don't convert anything on checkout
 * text=auto eol=lf
 mk/win32-tarballs.md5sum text=auto eol=LF
+testsuite/tests/parser/should_run/T25375.hs text=auto eol=crlf
diff --git a/compiler/GHC/Parser/String.hs b/compiler/GHC/Parser/String.hs
index 00687e3d84f..5c226e8b62e 100644
--- a/compiler/GHC/Parser/String.hs
+++ b/compiler/GHC/Parser/String.hs
@@ -261,6 +261,7 @@ lexMultilineString = lexStringWith processChars processChars
     processChars :: HasChar c => [c] -> Either (c, LexErr) [c]
     processChars =
           collapseGaps             -- Step 1
+      >>> normalizeEOL
       >>> expandLeadingTabs        -- Step 3
       >>> rmCommonWhitespacePrefix -- Step 4
       >>> collapseOnlyWsLines      -- Step 5
@@ -268,6 +269,19 @@ lexMultilineString = lexStringWith processChars processChars
       >>> rmLastNewline            -- Step 7b
       >>> resolveEscapes           -- Step 8
 
+    -- Normalize line endings to LF. The spec dictates that lines should be
+    -- split on newline characters and rejoined with ``\n``. But because we
+    -- aren't actually splitting/rejoining, we'll manually normalize here
+    normalizeEOL :: HasChar c => [c] -> [c]
+    normalizeEOL =
+      let go = \case
+            Char '\r' : c@(Char '\n') : cs -> c : go cs
+            c@(Char '\r') : cs -> setChar '\n' c : go cs
+            c@(Char '\f') : cs -> setChar '\n' c : go cs
+            c : cs -> c : go cs
+            [] -> []
+       in go
+
     -- expands all tabs, since the lexer will verify that tabs can only appear
     -- as leading indentation
     expandLeadingTabs :: HasChar c => [c] -> [c]
diff --git a/docs/users_guide/exts/multiline_strings.rst b/docs/users_guide/exts/multiline_strings.rst
index a3c71dd000c..e42853224ed 100644
--- a/docs/users_guide/exts/multiline_strings.rst
+++ b/docs/users_guide/exts/multiline_strings.rst
@@ -14,7 +14,9 @@ With this extension, GHC now recognizes multiline string literals with ``"""`` d
 
 Normal string literals are lexed, then string gaps are collapsed, then escape characters are resolved. Multiline string literals add the following post-processing steps between collapsing string gaps and resolving escape characters:
 
-#. Split the string by newlines
+#. Split the string by newline characters
+
+      * Includes ``\r\n``, ``\r``, ``\n``, ``\f``
 
 #. Replace leading tabs with spaces up to the next tab stop
 
@@ -24,7 +26,9 @@ Normal string literals are lexed, then string gaps are collapsed, then escape ch
 
 #. Join the string back with ``\n`` delimiters
 
-#. If the first character of the string is a newline, remove it
+#. If the first character of the string is ``\n``, remove it
+
+#. If the last character of the string is ``\n``, remove it
 
 Examples
 ~~~~~~~~
diff --git a/testsuite/tests/parser/should_run/T25375.hs b/testsuite/tests/parser/should_run/T25375.hs
new file mode 100644
index 00000000000..1b516edabdc
--- /dev/null
+++ b/testsuite/tests/parser/should_run/T25375.hs
@@ -0,0 +1,38 @@
+{-# LANGUAGE MultilineStrings #-}
+
+str1 = unlines
+  [ "aaa"
+  , "bbb"
+  , "ccc"
+  ]
+
+str2 = "aaa\n\
+       \bbb\n\
+       \ccc\n"
+
+str3 = """
+       aaa
+       bbb
+       ccc
+       """
+
+str4 = """
+
+       aaa
+       bbb
+       ccc
+
+       """
+
+str5 = """
+       aaa
+       bbb
+       ccc\n
+       """
+
+main = do
+  print str1
+  print str2
+  print str3
+  print str4
+  print str5
diff --git a/testsuite/tests/parser/should_run/T25375.stdout b/testsuite/tests/parser/should_run/T25375.stdout
new file mode 100644
index 00000000000..0cdf50b16d7
--- /dev/null
+++ b/testsuite/tests/parser/should_run/T25375.stdout
@@ -0,0 +1,5 @@
+"aaa\nbbb\nccc\n"
+"aaa\nbbb\nccc\n"
+"aaa\nbbb\nccc"
+"\naaa\nbbb\nccc\n"
+"aaa\nbbb\nccc\n"
diff --git a/testsuite/tests/parser/should_run/all.T b/testsuite/tests/parser/should_run/all.T
index 03f355d88e2..df886c4a24f 100644
--- a/testsuite/tests/parser/should_run/all.T
+++ b/testsuite/tests/parser/should_run/all.T
@@ -23,3 +23,4 @@ test('RecordDotSyntax5', normal, compile_and_run, [''])
 test('ListTuplePunsConstraints', extra_files(['ListTuplePunsConstraints.hs']), ghci_script, ['ListTuplePunsConstraints.script'])
 test('MultilineStrings', normal, compile_and_run, [''])
 test('MultilineStringsOverloaded', normal, compile_and_run, [''])
+test('T25375', normal, compile_and_run, [''])
-- 
GitLab