From 7bd407a67cd7810d3ff1e6d18885555175383a35 Mon Sep 17 00:00:00 2001 From: Brandon Chinn <brandonchinn178@gmail.com> Date: Tue, 15 Oct 2024 20:32:33 -0700 Subject: [PATCH] Fix CRLF in multiline strings (#25375) --- .gitattributes | 1 + compiler/GHC/Parser/String.hs | 14 +++++++ docs/users_guide/exts/multiline_strings.rst | 8 +++- testsuite/tests/parser/should_run/T25375.hs | 38 +++++++++++++++++++ .../tests/parser/should_run/T25375.stdout | 5 +++ testsuite/tests/parser/should_run/all.T | 1 + 6 files changed, 65 insertions(+), 2 deletions(-) create mode 100644 testsuite/tests/parser/should_run/T25375.hs create mode 100644 testsuite/tests/parser/should_run/T25375.stdout diff --git a/.gitattributes b/.gitattributes index aa7ab64d11d..454cd0e7dfa 100644 --- a/.gitattributes +++ b/.gitattributes @@ -2,3 +2,4 @@ # don't convert anything on checkout * text=auto eol=lf mk/win32-tarballs.md5sum text=auto eol=LF +testsuite/tests/parser/should_run/T25375.hs text=auto eol=crlf diff --git a/compiler/GHC/Parser/String.hs b/compiler/GHC/Parser/String.hs index 00687e3d84f..5c226e8b62e 100644 --- a/compiler/GHC/Parser/String.hs +++ b/compiler/GHC/Parser/String.hs @@ -261,6 +261,7 @@ lexMultilineString = lexStringWith processChars processChars processChars :: HasChar c => [c] -> Either (c, LexErr) [c] processChars = collapseGaps -- Step 1 + >>> normalizeEOL >>> expandLeadingTabs -- Step 3 >>> rmCommonWhitespacePrefix -- Step 4 >>> collapseOnlyWsLines -- Step 5 @@ -268,6 +269,19 @@ lexMultilineString = lexStringWith processChars processChars >>> rmLastNewline -- Step 7b >>> resolveEscapes -- Step 8 + -- Normalize line endings to LF. The spec dictates that lines should be + -- split on newline characters and rejoined with ``\n``. But because we + -- aren't actually splitting/rejoining, we'll manually normalize here + normalizeEOL :: HasChar c => [c] -> [c] + normalizeEOL = + let go = \case + Char '\r' : c@(Char '\n') : cs -> c : go cs + c@(Char '\r') : cs -> setChar '\n' c : go cs + c@(Char '\f') : cs -> setChar '\n' c : go cs + c : cs -> c : go cs + [] -> [] + in go + -- expands all tabs, since the lexer will verify that tabs can only appear -- as leading indentation expandLeadingTabs :: HasChar c => [c] -> [c] diff --git a/docs/users_guide/exts/multiline_strings.rst b/docs/users_guide/exts/multiline_strings.rst index a3c71dd000c..e42853224ed 100644 --- a/docs/users_guide/exts/multiline_strings.rst +++ b/docs/users_guide/exts/multiline_strings.rst @@ -14,7 +14,9 @@ With this extension, GHC now recognizes multiline string literals with ``"""`` d Normal string literals are lexed, then string gaps are collapsed, then escape characters are resolved. Multiline string literals add the following post-processing steps between collapsing string gaps and resolving escape characters: -#. Split the string by newlines +#. Split the string by newline characters + + * Includes ``\r\n``, ``\r``, ``\n``, ``\f`` #. Replace leading tabs with spaces up to the next tab stop @@ -24,7 +26,9 @@ Normal string literals are lexed, then string gaps are collapsed, then escape ch #. Join the string back with ``\n`` delimiters -#. If the first character of the string is a newline, remove it +#. If the first character of the string is ``\n``, remove it + +#. If the last character of the string is ``\n``, remove it Examples ~~~~~~~~ diff --git a/testsuite/tests/parser/should_run/T25375.hs b/testsuite/tests/parser/should_run/T25375.hs new file mode 100644 index 00000000000..1b516edabdc --- /dev/null +++ b/testsuite/tests/parser/should_run/T25375.hs @@ -0,0 +1,38 @@ +{-# LANGUAGE MultilineStrings #-} + +str1 = unlines + [ "aaa" + , "bbb" + , "ccc" + ] + +str2 = "aaa\n\ + \bbb\n\ + \ccc\n" + +str3 = """ + aaa + bbb + ccc + """ + +str4 = """ + + aaa + bbb + ccc + + """ + +str5 = """ + aaa + bbb + ccc\n + """ + +main = do + print str1 + print str2 + print str3 + print str4 + print str5 diff --git a/testsuite/tests/parser/should_run/T25375.stdout b/testsuite/tests/parser/should_run/T25375.stdout new file mode 100644 index 00000000000..0cdf50b16d7 --- /dev/null +++ b/testsuite/tests/parser/should_run/T25375.stdout @@ -0,0 +1,5 @@ +"aaa\nbbb\nccc\n" +"aaa\nbbb\nccc\n" +"aaa\nbbb\nccc" +"\naaa\nbbb\nccc\n" +"aaa\nbbb\nccc\n" diff --git a/testsuite/tests/parser/should_run/all.T b/testsuite/tests/parser/should_run/all.T index 03f355d88e2..df886c4a24f 100644 --- a/testsuite/tests/parser/should_run/all.T +++ b/testsuite/tests/parser/should_run/all.T @@ -23,3 +23,4 @@ test('RecordDotSyntax5', normal, compile_and_run, ['']) test('ListTuplePunsConstraints', extra_files(['ListTuplePunsConstraints.hs']), ghci_script, ['ListTuplePunsConstraints.script']) test('MultilineStrings', normal, compile_and_run, ['']) test('MultilineStringsOverloaded', normal, compile_and_run, ['']) +test('T25375', normal, compile_and_run, ['']) -- GitLab