From 43fb21b6ae30133d5ed3fab3453305721679e2b7 Mon Sep 17 00:00:00 2001
From: Cheng Shao <terrorjack@type.dance>
Date: Wed, 22 May 2024 14:30:03 +0000
Subject: [PATCH] compiler: avoid saving foreign call target to local when
 there are no caller-save GlobalRegs

This patch makes the STG->Cmm backend avoid saving foreign call target
to local when there are no caller-save GlobalRegs.

Since 321941a8ebe25192cdeece723e1058f2f47809ea, when we lower a
foreign call, we unconditionally save the foreign call target to a
temporary local first, then rely on cmmSink to clean it up later,
which only happens with -fcmm-sink (implied by -O) and not in
unoptimized code.

And this is troublesome for the wasm backend NCG, which needs to infer
a foreign call target symbol's type signature from the Cmm call site.
Previously, the NCG has been emitting incorrect type signatures for
unoptimized code, which happens to work with `wasm-ld` most of the
time, but this is never future-proof against upstream toolchain
updates, and it causes horrible breakages when LTO objects are
included in linker input. Hence this patch.

(cherry picked from commit 8dd8a076058baca45ac52ace25b9c2797d61ef84)
(cherry picked from commit 2b2a3db8a92f4a39052b67b38af79c26117f3d53)
---
 compiler/GHC/Driver/Config/StgToCmm.hs |  3 +
 compiler/GHC/StgToCmm/Config.hs        |  2 +
 compiler/GHC/StgToCmm/Foreign.hs       | 76 +++++++++++++++++++++++---
 3 files changed, 73 insertions(+), 8 deletions(-)

diff --git a/compiler/GHC/Driver/Config/StgToCmm.hs b/compiler/GHC/Driver/Config/StgToCmm.hs
index 87c72753349..fc17ae7404a 100644
--- a/compiler/GHC/Driver/Config/StgToCmm.hs
+++ b/compiler/GHC/Driver/Config/StgToCmm.hs
@@ -14,6 +14,7 @@ import GHC.Driver.Backend
 import GHC.Driver.Session
 import GHC.Platform
 import GHC.Platform.Profile
+import GHC.Platform.Regs
 import GHC.Utils.Error
 import GHC.Unit.Module
 import GHC.Utils.Outputable
@@ -81,6 +82,8 @@ initStgToCmmConfig dflags mod = StgToCmmConfig
   , stgToCmmAvx2          = isAvx2Enabled                  dflags
   , stgToCmmAvx512f       = isAvx512fEnabled               dflags
   , stgToCmmTickyAP       = gopt Opt_Ticky_AP dflags
+  -- See Note [Saving foreign call target to local]
+  , stgToCmmSaveFCallTargetToLocal = any (callerSaves platform) $ activeStgRegs platform
   } where profile  = targetProfile dflags
           platform = profilePlatform profile
           bk_end  = backend dflags
diff --git a/compiler/GHC/StgToCmm/Config.hs b/compiler/GHC/StgToCmm/Config.hs
index 1274782725d..6817256d4e5 100644
--- a/compiler/GHC/StgToCmm/Config.hs
+++ b/compiler/GHC/StgToCmm/Config.hs
@@ -70,6 +70,8 @@ data StgToCmmConfig = StgToCmmConfig
   , stgToCmmAllowIntMul2Instr         :: !Bool   -- ^ Allowed to generate IntMul2 instruction
   , stgToCmmAllowFMAInstr             :: FMASign -> Bool -- ^ Allowed to generate FMA instruction
   , stgToCmmTickyAP                   :: !Bool   -- ^ Disable use of precomputed standard thunks.
+  , stgToCmmSaveFCallTargetToLocal    :: !Bool   -- ^ Save a foreign call target to a Cmm local, see
+                                                 -- Note [Saving foreign call target to local] for details
   ------------------------------ SIMD flags ------------------------------------
   -- Each of these flags checks vector compatibility with the backend requested
   -- during compilation. In essence, this means checking for @-fllvm@ which is
diff --git a/compiler/GHC/StgToCmm/Foreign.hs b/compiler/GHC/StgToCmm/Foreign.hs
index 7d9173d123a..7464eda718c 100644
--- a/compiler/GHC/StgToCmm/Foreign.hs
+++ b/compiler/GHC/StgToCmm/Foreign.hs
@@ -277,23 +277,83 @@ load_target_into_temp (ForeignTarget expr conv) = do
 load_target_into_temp other_target@(PrimTarget _) =
   return other_target
 
+-- Note [Saving foreign call target to local]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+--
 -- What we want to do here is create a new temporary for the foreign
 -- call argument if it is not safe to use the expression directly,
 -- because the expression mentions caller-saves GlobalRegs (see
 -- Note [Register parameter passing]).
 --
 -- However, we can't pattern-match on the expression here, because
--- this is used in a loop by GHC.Cmm.Parser, and testing the expression
--- results in a black hole.  So we always create a temporary, and rely
--- on GHC.Cmm.Sink to clean it up later.  (Yuck, ToDo).  The generated code
--- ends up being the same, at least for the RTS .cmm code.
+-- this is used in a loop by GHC.Cmm.Parser, and testing the
+-- expression results in a black hole.  So when there exist
+-- caller-saves GlobalRegs, we create a temporary, and rely on
+-- GHC.Cmm.Sink to clean it up later. The generated code ends up being
+-- the same if -fcmm-sink is enabled (implied by -O).
+--
+-- When there doesn't exist caller-save GlobalRegs, keep the original
+-- target in place. This matters for the wasm backend, otherwise it
+-- cannot infer the target symbol's correct foreign function type in
+-- unoptimized Cmm. For instance:
+--
+-- foreign import ccall unsafe "foo" c_foo :: IO ()
+--
+-- Without optimization, previously this would lower to something like:
+--
+-- [Test.c_foo_entry() { //  []
+--          { []
+--          }
+--      {offset
+--        cDk:
+--            goto cDm;
+--        cDm:
+--            _cDj::I32 = foo;
+--            call "ccall" arg hints:  []  result hints:  [] (_cDj::I32)();
+--            R1 = GHC.Tuple.()_closure+1;
+--            call (I32[P32[Sp]])(R1) args: 4, res: 0, upd: 4;
+--      }
+--  },
 --
+-- The wasm backend only sees "foo" being assigned to a local, but
+-- there's no type signature associated with a CLabel! So it has to
+-- emit a dummy .functype directive and fingers crossed that wasm-ld
+-- tolerates function type mismatch. THis is horrible, not future
+-- proof against upstream toolchain upgrades, and already known to
+-- break in certain cases (e.g. when LTO objects are involved).
+--
+-- Therefore, on wasm as well as other targets that don't risk
+-- mentioning caller-saved GlobalRegs in a foreign call target, just
+-- keep the original call target in place and don't assign it to a
+-- local. So this would now lower to something like:
+--
+-- [Test.c_foo_entry() { //  []
+--          { []
+--          }
+--      {offset
+--        cDo:
+--            goto cDq;
+--        cDq:
+--            call "ccall" arg hints:  []  result hints:  [] foo();
+--            R1 = GHC.Tuple.()_closure+1;
+--            call (I32[P32[Sp]])(R1) args: 4, res: 0, upd: 4;
+--      }
+--  },
+--
+-- Since "foo" appears at call site directly, the wasm backend would
+-- now be able to infer its type signature correctly.
+
 maybe_assign_temp :: CmmExpr -> FCode CmmExpr
 maybe_assign_temp e = do
-  platform <- getPlatform
-  reg <- newTemp (cmmExprType platform e)
-  emitAssign (CmmLocal reg) e
-  return (CmmReg (CmmLocal reg))
+  do_save <- stgToCmmSaveFCallTargetToLocal <$> getStgToCmmConfig
+  if do_save
+    then do
+      platform <- getPlatform
+      reg <- newTemp (cmmExprType platform e)
+      emitAssign (CmmLocal reg) e
+      return (CmmReg (CmmLocal reg))
+    else
+      pure e
 
 -- -----------------------------------------------------------------------------
 -- Save/restore the thread state in the TSO
-- 
GitLab