From 0b34b4eaac65fb5a5ece8f7846077c4a3d627520 Mon Sep 17 00:00:00 2001
From: Tom Smeding <tomsmeding@users.noreply.github.com>
Date: Thu, 25 Jan 2024 04:25:36 +0100
Subject: [PATCH] Ignore invalid Unicode in pkg-config descriptions (#9609)

* Ignore invalid Unicode in pkg-config descriptions

Previously, if any of the pkg-config packages on the system had invalid
Unicode in their description fields (like the Intel vpl package has at
the time of writing, 2024-01-11, see #9608), cabal would crash because
it tried to interpret the entire `pkg-config --list-all` output as
Unicode.

This change, as suggested by gbaz in
  https://github.com/haskell/cabal/issues/9608#issuecomment-1886120831
switches to using a lazy ByteString for reading in the output, splitting
on the first space in byte land, and then parsing only the package
_name_ to a String.

For further future-proofing, package names that don't parse as valid
Unicode don't crash Cabal, but are instead ignored.

* Add changelog entry

* cabal-install-solver: Add bounds on 'text'

* No literal ASCII values, use 'ord'

* Address review comments re invalid unicode from pkg-config

* Add test for invalid unicode from pkg-config

* Compatibility with text-1.2.5.0

* Align imports

* Handle different exception type

* Use only POSIX shell syntax

* Add invalid-input handler in pkg-config shim

This is to appease shellcheck

* Actually implement all required stuff in the pkg-config shim

* Less exception dance

* Fix shebang lines

MacOS doesn't have /usr/bin/sh, and /bin/sh is the standard (for a POSIX
shell) anyway

* Don't expect a particular representation of invalid characters

---------

Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
---
 Cabal/src/Distribution/Simple/Program/Run.hs  |  8 +++
 .../cabal-install-solver.cabal                |  1 +
 .../Distribution/Solver/Types/PkgConfigDb.hs  | 61 ++++++++++++++++---
 .../PackageTests/ExtraProgPath/pkg-config     |  2 +-
 .../PackageTests/PkgConfigParse/MyLibrary.hs  |  1 +
 .../PackageTests/PkgConfigParse/cabal.project |  1 +
 .../PackageTests/PkgConfigParse/my.cabal      | 19 ++++++
 .../PackageTests/PkgConfigParse/pkg-config    | 49 +++++++++++++++
 .../PackageTests/PkgConfigParse/setup.out     |  1 +
 .../PackageTests/PkgConfigParse/setup.test.hs |  9 +++
 changelog.d/pr-9609                           | 12 ++++
 11 files changed, 154 insertions(+), 10 deletions(-)
 create mode 100644 cabal-testsuite/PackageTests/PkgConfigParse/MyLibrary.hs
 create mode 100644 cabal-testsuite/PackageTests/PkgConfigParse/cabal.project
 create mode 100644 cabal-testsuite/PackageTests/PkgConfigParse/my.cabal
 create mode 100755 cabal-testsuite/PackageTests/PkgConfigParse/pkg-config
 create mode 100644 cabal-testsuite/PackageTests/PkgConfigParse/setup.out
 create mode 100644 cabal-testsuite/PackageTests/PkgConfigParse/setup.test.hs
 create mode 100644 changelog.d/pr-9609

diff --git a/Cabal/src/Distribution/Simple/Program/Run.hs b/Cabal/src/Distribution/Simple/Program/Run.hs
index 268cb79450..e9a1298559 100644
--- a/Cabal/src/Distribution/Simple/Program/Run.hs
+++ b/Cabal/src/Distribution/Simple/Program/Run.hs
@@ -24,6 +24,7 @@ module Distribution.Simple.Program.Run
   , getProgramInvocationOutput
   , getProgramInvocationLBS
   , getProgramInvocationOutputAndErrors
+  , getProgramInvocationLBSAndErrors
   , getEffectiveEnvironment
   ) where
 
@@ -181,6 +182,13 @@ getProgramInvocationOutputAndErrors verbosity inv = case progInvokeOutputEncodin
     (output', errors, exitCode) <- getProgramInvocationIODataAndErrors verbosity inv IODataModeBinary
     return (normaliseLineEndings (fromUTF8LBS output'), errors, exitCode)
 
+getProgramInvocationLBSAndErrors
+  :: Verbosity
+  -> ProgramInvocation
+  -> IO (LBS.ByteString, String, ExitCode)
+getProgramInvocationLBSAndErrors verbosity inv =
+  getProgramInvocationIODataAndErrors verbosity inv IODataModeBinary
+
 getProgramInvocationIODataAndErrors
   :: KnownIODataMode mode
   => Verbosity
diff --git a/cabal-install-solver/cabal-install-solver.cabal b/cabal-install-solver/cabal-install-solver.cabal
index fe440a7896..2f1e8d6bdf 100644
--- a/cabal-install-solver/cabal-install-solver.cabal
+++ b/cabal-install-solver/cabal-install-solver.cabal
@@ -110,6 +110,7 @@ library
     , mtl           >=2.0      && <2.4
     , pretty        ^>=1.1
     , transformers  >=0.4.2.0  && <0.7
+    , text          (>= 1.2.3.0  && < 1.3) || (>= 2.0 && < 2.2)
 
   if flag(debug-expensive-assertions)
     cpp-options: -DDEBUG_EXPENSIVE_ASSERTIONS
diff --git a/cabal-install-solver/src/Distribution/Solver/Types/PkgConfigDb.hs b/cabal-install-solver/src/Distribution/Solver/Types/PkgConfigDb.hs
index ee2f22032c..21845eafde 100644
--- a/cabal-install-solver/src/Distribution/Solver/Types/PkgConfigDb.hs
+++ b/cabal-install-solver/src/Distribution/Solver/Types/PkgConfigDb.hs
@@ -1,5 +1,6 @@
 {-# LANGUAGE DeriveDataTypeable #-}
 {-# LANGUAGE DeriveGeneric      #-}
+{-# LANGUAGE LambdaCase         #-}
 -----------------------------------------------------------------------------
 -- |
 -- Module      :  Distribution.Solver.Types.PkgConfigDb
@@ -23,17 +24,23 @@ module Distribution.Solver.Types.PkgConfigDb
 import Distribution.Solver.Compat.Prelude
 import Prelude ()
 
-import           Control.Exception (handle)
-import           Control.Monad     (mapM)
-import qualified Data.Map          as M
-import           System.FilePath   (splitSearchPath)
+import           Control.Exception        (handle)
+import           Control.Monad            (mapM)
+import           Data.ByteString          (ByteString)
+import qualified Data.ByteString.Lazy     as LBS
+import qualified Data.Map                 as M
+import qualified Data.Text                as T
+import qualified Data.Text.Encoding       as T
+import qualified Data.Text.Encoding.Error as T
+import           System.FilePath          (splitSearchPath)
 
 import Distribution.Compat.Environment          (lookupEnv)
 import Distribution.Package                     (PkgconfigName, mkPkgconfigName)
 import Distribution.Parsec
 import Distribution.Simple.Program
        (ProgramDb, getProgramOutput, pkgConfigProgram, needProgram, ConfiguredProgram)
-import Distribution.Simple.Program.Run          (getProgramInvocationOutputAndErrors, programInvocation)
+import Distribution.Simple.Program.Run
+       (getProgramInvocationOutputAndErrors, programInvocation, getProgramInvocationLBSAndErrors)
 import Distribution.Simple.Utils                (info)
 import Distribution.Types.PkgconfigVersion
 import Distribution.Types.PkgconfigVersionRange
@@ -63,10 +70,37 @@ readPkgConfigDb verbosity progdb = handle ioErrorHandler $ do
     case mpkgConfig of
       Nothing             -> noPkgConfig "Cannot find pkg-config program"
       Just (pkgConfig, _) -> do
-        pkgList <- lines <$> getProgramOutput verbosity pkgConfig ["--list-all"]
-        -- The output of @pkg-config --list-all@ also includes a description
-        -- for each package, which we do not need.
-        let pkgNames = map (takeWhile (not . isSpace)) pkgList
+        -- To prevent malformed Unicode in the descriptions from crashing cabal,
+        -- read without interpreting any encoding first. (#9608)
+        (listAllOutput, listAllErrs, listAllExitcode) <-
+          getProgramInvocationLBSAndErrors verbosity (programInvocation pkgConfig ["--list-all"])
+        when (listAllExitcode /= ExitSuccess) $
+          ioError (userError ("pkg-config --list-all failed: " ++ listAllErrs))
+        let pkgList = LBS.split (fromIntegral (ord '\n')) listAllOutput
+        -- Now decode the package *names* to a String. The ones where decoding
+        -- failed end up in 'failedPkgNames'.
+        let (failedPkgNames, pkgNames) =
+              partitionEithers
+              -- Drop empty package names. This will handle empty lines
+              -- in pkg-config's output, including the spurious one
+              -- after the last newline (because of LBS.split).
+              . filter (either (const True) (not . null))
+              -- Try decoding strictly; if it fails, put the lenient
+              -- decoding in a Left for later reporting.
+              . map (\bsname ->
+                       let sbsname = LBS.toStrict bsname
+                       in case T.decodeUtf8' sbsname of
+                            Left _ -> Left (T.unpack (decodeUtf8LenientCompat sbsname))
+                            Right name -> Right (T.unpack name))
+              -- The output of @pkg-config --list-all@ also includes a
+              -- description for each package, which we do not need.
+              -- We don't use Data.Char.isSpace because that would also
+              -- include 0xA0, the non-breaking space, which can occur
+              -- in multi-byte UTF-8 sequences.
+              . map (LBS.takeWhile (not . isAsciiSpace))
+              $ pkgList
+        when (not (null failedPkgNames)) $
+          info verbosity ("Some pkg-config packages have names containing invalid unicode: " ++ intercalate ", " failedPkgNames)
         (outs, _errs, exitCode) <-
                      getProgramInvocationOutputAndErrors verbosity
                        (programInvocation pkgConfig ("--modversion" : pkgNames))
@@ -104,6 +138,15 @@ readPkgConfigDb verbosity progdb = handle ioErrorHandler $ do
          ExitSuccess -> Just (pkg, pkgVersion)
          _ -> Nothing
 
+    isAsciiSpace :: Word8 -> Bool
+    isAsciiSpace c = c `elem` map (fromIntegral . ord) " \t"
+
+    -- The decodeUtf8Lenient function is defined starting with text-2.0.1; this
+    -- function simply reimplements it. When the minimum supported GHC version
+    -- is >= 9.4, switch to decodeUtf8Lenient.
+    decodeUtf8LenientCompat :: ByteString -> T.Text
+    decodeUtf8LenientCompat = T.decodeUtf8With T.lenientDecode
+
 -- | Create a `PkgConfigDb` from a list of @(packageName, version)@ pairs.
 pkgConfigDbFromList :: [(String, String)] -> PkgConfigDb
 pkgConfigDbFromList pairs = (PkgConfigDb . M.fromList . map convert) pairs
diff --git a/cabal-testsuite/PackageTests/ExtraProgPath/pkg-config b/cabal-testsuite/PackageTests/ExtraProgPath/pkg-config
index 7c5fafbf0c..195df7c2c4 100755
--- a/cabal-testsuite/PackageTests/ExtraProgPath/pkg-config
+++ b/cabal-testsuite/PackageTests/ExtraProgPath/pkg-config
@@ -1,3 +1,3 @@
-#!/usr/bin/sh
+#!/bin/sh
 
 exit 1;
diff --git a/cabal-testsuite/PackageTests/PkgConfigParse/MyLibrary.hs b/cabal-testsuite/PackageTests/PkgConfigParse/MyLibrary.hs
new file mode 100644
index 0000000000..a51c414bcd
--- /dev/null
+++ b/cabal-testsuite/PackageTests/PkgConfigParse/MyLibrary.hs
@@ -0,0 +1 @@
+module MyLibrary () where
diff --git a/cabal-testsuite/PackageTests/PkgConfigParse/cabal.project b/cabal-testsuite/PackageTests/PkgConfigParse/cabal.project
new file mode 100644
index 0000000000..5a93e28e87
--- /dev/null
+++ b/cabal-testsuite/PackageTests/PkgConfigParse/cabal.project
@@ -0,0 +1 @@
+packages: *.cabal
\ No newline at end of file
diff --git a/cabal-testsuite/PackageTests/PkgConfigParse/my.cabal b/cabal-testsuite/PackageTests/PkgConfigParse/my.cabal
new file mode 100644
index 0000000000..38b7020b8a
--- /dev/null
+++ b/cabal-testsuite/PackageTests/PkgConfigParse/my.cabal
@@ -0,0 +1,19 @@
+name: PkgConfigParse
+version: 0.1
+license: BSD3
+author: Tom Smeding
+maintainer: Tom Smeding
+synopsis: Pkg Config Parse
+category: PackageTests
+build-type: Simple
+cabal-version: 2.0
+
+description:
+    Check that Cabal does not crash when pkg-config outputs invalid Unicode.
+
+Library
+    pkgconfig-depends: vpl
+    default-language: Haskell2010
+    build-depends: base <5.0
+    exposed-modules:
+        MyLibrary
diff --git a/cabal-testsuite/PackageTests/PkgConfigParse/pkg-config b/cabal-testsuite/PackageTests/PkgConfigParse/pkg-config
new file mode 100755
index 0000000000..183d08e0a0
--- /dev/null
+++ b/cabal-testsuite/PackageTests/PkgConfigParse/pkg-config
@@ -0,0 +1,49 @@
+#!/bin/sh
+
+set -eu
+
+# ugly, but "good enough" for this test
+# This will need to be updated whenever cabal invokes pkg-config
+# in new ways
+case "$*" in
+  '--version')
+    echo 2.1.0  # whatever
+    ;;
+
+  '--variable pc_path pkg-config')
+    echo '.'
+    ;;
+
+  '--list-all')
+    printf 'zlib   zlib - zlib compression library\n'
+    # \256 = \xAE is the iso-8859-1 (latin-1) encoded version of U+00AE,
+    # i.e. the "registered sign": ®
+    # This resulted in problems, see #9608
+    printf 'vpl    Intel\256 Video Processing Library - Accelerated video decode, encode, and frame processing capabilities on Intel\256 GPUs\n'
+    # \360 = \xF0 is latin-1 for ð; this is orð, Icelandic for "word"/"words".
+    printf 'or\360   Icelandic characters\n'
+    ;;
+
+  '--modversion '*)
+    shift  # drop the --modversion
+    for arg; do
+      case "$arg" in
+        zlib) echo 1.3; ;;  # whatever
+        vpl) echo 2.10; ;;  # whatever
+        # No entry for orð here; let's not even try to match on that
+        *)
+          echo >&2 "Package $arg was not found in the pkg-config search path."
+          exit 1
+      esac
+    done
+    ;;
+
+  # Ignore some stuff we're not implementing
+  '--cflags '*) ;;
+  '--libs '*) ;;
+
+  *)
+    echo >&2 "pkg-config: unrecognised arguments $* (this is an incomplete shim)"
+    exit 1
+    ;;
+esac
diff --git a/cabal-testsuite/PackageTests/PkgConfigParse/setup.out b/cabal-testsuite/PackageTests/PkgConfigParse/setup.out
new file mode 100644
index 0000000000..92fd8204a4
--- /dev/null
+++ b/cabal-testsuite/PackageTests/PkgConfigParse/setup.out
@@ -0,0 +1 @@
+# cabal v2-build
diff --git a/cabal-testsuite/PackageTests/PkgConfigParse/setup.test.hs b/cabal-testsuite/PackageTests/PkgConfigParse/setup.test.hs
new file mode 100644
index 0000000000..0f860ab637
--- /dev/null
+++ b/cabal-testsuite/PackageTests/PkgConfigParse/setup.test.hs
@@ -0,0 +1,9 @@
+import Test.Cabal.Prelude
+
+-- Test that invalid unicode in pkg-config output doesn't trip up cabal very much
+main = cabalTest $ do
+  -- skipped on windows because using a script to dummy up an executable doesn't work the same.
+  skipIfWindows
+  cdir <- testCurrentDir `fmap` getTestEnv
+  res <- cabal' "v2-build" ["--extra-prog-path="++cdir, "-v2"]
+  assertOutputContains "Some pkg-config packages have names containing invalid unicode: or" res
diff --git a/changelog.d/pr-9609 b/changelog.d/pr-9609
new file mode 100644
index 0000000000..c156706ef0
--- /dev/null
+++ b/changelog.d/pr-9609
@@ -0,0 +1,12 @@
+synopsis: Ignore invalid Unicode in pkg-config descriptions
+packages: cabal-install-solver
+prs: #9609
+issues: #9608
+
+description: {
+
+Previously, cabal-install would crash when `pkg-config --list-all` contained
+invalid Unicode. With this change, invalid unicode in package descriptions is
+ignored, and unparseable package names are considered nonexistent.
+
+}
-- 
GitLab