Skip to content
Snippets Groups Projects
Commit 7d46115b authored by Matthew Pickering's avatar Matthew Pickering Committed by Mikolaj
Browse files

perf: Group together packages by repo when verifying tarballs

verifyFetchedTarball has the effect of deserialising the index tarball
(see call to Sec.withIndex).

verifyFetchedTarball is called individually for each package in the
build plan (see ProjectPlanning.hs). Not once per repo.

The hackage tarball is now 880mb so it takes a non significant amount of
time to deserialise this (much better after haskell/tar#95).

This code path is important as it can add 1s with these 38 calls on to
the initial load of a project and scales linearly with the size of your
build tree.

Reproducer: Simple project with "lens" dependency deserialises the index tarball 38 times.

Solution: Refactor verifyFetchedTarball to run once per repo rather than once per package.

In future it would be much better to refactor this function so that the
items are not immediately grouped and ungrouped but I didn't want to
take that on immediately.

Fixes #10110
parent 3169b879
No related branches found
No related tags found
No related merge requests found
...@@ -92,6 +92,7 @@ ...@@ -92,6 +92,7 @@
- ignore: {name: "Use unwords"} # 8 hints - ignore: {name: "Use unwords"} # 8 hints
- ignore: {name: "Use void"} # 22 hints - ignore: {name: "Use void"} # 22 hints
- ignore: {name: "Use when"} # 1 hint - ignore: {name: "Use when"} # 1 hint
- ignore: {name: "Use uncurry"} # 1 hint
- arguments: - arguments:
- --ignore-glob=Cabal-syntax/src/Distribution/Fields/Lexer.hs - --ignore-glob=Cabal-syntax/src/Distribution/Fields/Lexer.hs
......
...@@ -25,7 +25,7 @@ module Distribution.Client.FetchUtils ...@@ -25,7 +25,7 @@ module Distribution.Client.FetchUtils
-- ** specifically for repo packages -- ** specifically for repo packages
, checkRepoTarballFetched , checkRepoTarballFetched
, fetchRepoTarball , fetchRepoTarball
, verifyFetchedTarball , verifyFetchedTarballs
-- ** fetching packages asynchronously -- ** fetching packages asynchronously
, asyncFetchPackages , asyncFetchPackages
...@@ -98,6 +98,7 @@ import System.IO ...@@ -98,6 +98,7 @@ import System.IO
, openTempFile , openTempFile
) )
import Control.Monad (forM)
import Distribution.Client.Errors import Distribution.Client.Errors
import qualified Hackage.Security.Client as Sec import qualified Hackage.Security.Client as Sec
import qualified Hackage.Security.Util.Checked as Sec import qualified Hackage.Security.Util.Checked as Sec
...@@ -152,40 +153,66 @@ checkRepoTarballFetched repo pkgid = do ...@@ -152,40 +153,66 @@ checkRepoTarballFetched repo pkgid = do
then return (Just file) then return (Just file)
else return Nothing else return Nothing
verifyFetchedTarball :: Verbosity -> RepoContext -> Repo -> PackageId -> IO Bool verifyFetchedTarballs
verifyFetchedTarball verbosity repoCtxt repo pkgid = :: Verbosity
let file = packageFile repo pkgid -> RepoContext
handleError :: IO Bool -> IO Bool -> Repo
handleError act = do -> [PackageId]
res <- Safe.try act -> IO
case res of ( [ Either
Left e -> warn verbosity ("Error verifying fetched tarball " ++ file ++ ", will redownload: " ++ show (e :: SomeException)) >> pure False (Repo, PackageId) -- Verified
Right b -> pure b (Repo, PackageId) -- unverified)
in handleError $ do ]
exists <- doesFileExist file )
if not exists verifyFetchedTarballs verbosity repoCtxt repo pkgids =
then return True -- if the file does not exist, it vacuously passes validation, since it will be downloaded as necessary with what we will then check is a valid hash. -- Establish the context once per repo (see #10110), this codepath is important
else case repo of -- to be fast as it can happen when no other building happens.
-- a secure repo has hashes we can compare against to confirm this is the correct file. let establishContext k =
RepoSecure{} -> case repo of
repoContextWithSecureRepo repoCtxt repo $ \repoSecure -> RepoSecure{} ->
Sec.withIndex repoSecure $ \callbacks -> repoContextWithSecureRepo repoCtxt repo $ \repoSecure ->
let warnAndFail s = warn verbosity ("Fetched tarball " ++ file ++ " does not match server, will redownload: " ++ s) >> return False Sec.withIndex repoSecure $ \callbacks -> k (Just callbacks)
in -- the do block in parens is due to dealing with the checked exceptions mechanism. _ -> k Nothing
( do in do
fileInfo <- Sec.indexLookupFileInfo callbacks pkgid establishContext $ \mCallbacks ->
sz <- Sec.FileLength . fromInteger <$> getFileSize file forM pkgids $ \pkgid -> do
if sz /= Sec.fileInfoLength (Sec.trusted fileInfo) let file = packageFile repo pkgid
then warnAndFail "file length mismatch" res <- verifyFetchedTarball verbosity file mCallbacks pkgid
else do return $ if res then Left (repo, pkgid) else Right (repo, pkgid)
res <- Sec.compareTrustedFileInfo (Sec.trusted fileInfo) <$> Sec.computeFileInfo (Sec.Path file :: Sec.Path Sec.Absolute)
if res verifyFetchedTarball :: Verbosity -> FilePath -> Maybe Sec.IndexCallbacks -> PackageId -> IO Bool
then pure True verifyFetchedTarball verbosity file mCallbacks pkgid =
else warnAndFail "file hash mismatch" let
) handleError :: IO Bool -> IO Bool
`Sec.catchChecked` (\(e :: Sec.InvalidPackageException) -> warnAndFail (show e)) handleError act = do
`Sec.catchChecked` (\(e :: Sec.VerificationError) -> warnAndFail (show e)) res <- Safe.try act
_ -> pure True case res of
Left e -> warn verbosity ("Error verifying fetched tarball " ++ file ++ ", will redownload: " ++ show (e :: SomeException)) >> pure False
Right b -> pure b
in
handleError $ do
exists <- doesFileExist file
if not exists
then return True -- if the file does not exist, it vacuously passes validation, since it will be downloaded as necessary with what we will then check is a valid hash.
else case mCallbacks of
-- a secure repo has hashes we can compare against to confirm this is the correct file.
Just callbacks ->
let warnAndFail s = warn verbosity ("Fetched tarball " ++ file ++ " does not match server, will redownload: " ++ s) >> return False
in -- the do block in parens is due to dealing with the checked exceptions mechanism.
( do
fileInfo <- Sec.indexLookupFileInfo callbacks pkgid
sz <- Sec.FileLength . fromInteger <$> getFileSize file
if sz /= Sec.fileInfoLength (Sec.trusted fileInfo)
then warnAndFail "file length mismatch"
else do
res <- Sec.compareTrustedFileInfo (Sec.trusted fileInfo) <$> Sec.computeFileInfo (Sec.Path file :: Sec.Path Sec.Absolute)
if res
then pure True
else warnAndFail "file hash mismatch"
)
`Sec.catchChecked` (\(e :: Sec.InvalidPackageException) -> warnAndFail (show e))
`Sec.catchChecked` (\(e :: Sec.VerificationError) -> warnAndFail (show e))
_ -> pure True
-- | Fetch a package if we don't have it already. -- | Fetch a package if we don't have it already.
fetchPackage fetchPackage
......
...@@ -127,7 +127,7 @@ import Distribution.Client.SetupWrapper ...@@ -127,7 +127,7 @@ import Distribution.Client.SetupWrapper
import Distribution.Client.Store import Distribution.Client.Store
import Distribution.Client.Targets (userToPackageConstraint) import Distribution.Client.Targets (userToPackageConstraint)
import Distribution.Client.Types import Distribution.Client.Types
import Distribution.Client.Utils (incVersion) import Distribution.Client.Utils (concatMapM, incVersion)
import qualified Distribution.Client.BuildReports.Storage as BuildReports import qualified Distribution.Client.BuildReports.Storage as BuildReports
import qualified Distribution.Client.IndexUtils as IndexUtils import qualified Distribution.Client.IndexUtils as IndexUtils
...@@ -206,7 +206,7 @@ import qualified Distribution.Solver.Types.ComponentDeps as CD ...@@ -206,7 +206,7 @@ import qualified Distribution.Solver.Types.ComponentDeps as CD
import qualified Distribution.Compat.Graph as Graph import qualified Distribution.Compat.Graph as Graph
import Control.Exception (assert) import Control.Exception (assert)
import Control.Monad (forM, sequence) import Control.Monad (sequence)
import Control.Monad.IO.Class (liftIO) import Control.Monad.IO.Class (liftIO)
import Control.Monad.State as State (State, execState, runState, state) import Control.Monad.State as State (State, execState, runState, state)
import Data.Foldable (fold) import Data.Foldable (fold)
...@@ -1069,25 +1069,29 @@ getPackageSourceHashes verbosity withRepoCtx solverPlan = do ...@@ -1069,25 +1069,29 @@ getPackageSourceHashes verbosity withRepoCtx solverPlan = do
-- Tarballs from repositories, either where the repository provides -- Tarballs from repositories, either where the repository provides
-- hashes as part of the repo metadata, or where we will have to -- hashes as part of the repo metadata, or where we will have to
-- download and hash the tarball. -- download and hash the tarball.
repoTarballPkgsWithMetadataUnvalidated :: [(PackageId, Repo)] repoTarballPkgsWithMetadataUnvalidated :: [(Repo, [PackageId])]
repoTarballPkgsWithoutMetadata :: [(PackageId, Repo)] repoTarballPkgsWithoutMetadata :: [(Repo, PackageId)]
( repoTarballPkgsWithMetadataUnvalidated ( repoTarballPkgsWithMetadataUnvalidated
, repoTarballPkgsWithoutMetadata , repoTarballPkgsWithoutMetadata
) = ) =
partitionEithers partitionEithers
[ case repo of [ case repo of
RepoSecure{} -> Left (pkgid, repo) RepoSecure{} -> Left (repo, [pkgid])
_ -> Right (pkgid, repo) _ -> Right (repo, pkgid)
| (pkgid, RepoTarballPackage repo _ _) <- allPkgLocations | (pkgid, RepoTarballPackage repo _ _) <- allPkgLocations
] ]
-- Group up the unvalidated packages by repo so we only read the remote
-- index once per repo (see #10110). The packages are ungrouped here and then regrouped
-- below, it would be better in future to refactor this whole code path so that we don't
-- repeatedly group and ungroup.
repoTarballPkgsWithMetadataUnvalidatedMap = Map.fromListWith (++) repoTarballPkgsWithMetadataUnvalidated
(repoTarballPkgsWithMetadata, repoTarballPkgsToDownloadWithMeta) <- fmap partitionEithers $ (repoTarballPkgsWithMetadata, repoTarballPkgsToDownloadWithMeta) <- fmap partitionEithers $
liftIO $ liftIO $
withRepoCtx $ \repoctx -> forM repoTarballPkgsWithMetadataUnvalidated $ withRepoCtx $ \repoctx -> flip concatMapM (Map.toList repoTarballPkgsWithMetadataUnvalidatedMap) $
\x@(pkg, repo) -> \(repo, pkgids) ->
verifyFetchedTarball verbosity repoctx repo pkg >>= \b -> case b of verifyFetchedTarballs verbosity repoctx repo pkgids
True -> return $ Left x
False -> return $ Right x
-- For tarballs from repos that do not have hashes available we now have -- For tarballs from repos that do not have hashes available we now have
-- to check if the packages were downloaded already. -- to check if the packages were downloaded already.
...@@ -1101,9 +1105,9 @@ getPackageSourceHashes verbosity withRepoCtx solverPlan = do ...@@ -1101,9 +1105,9 @@ getPackageSourceHashes verbosity withRepoCtx solverPlan = do
[ do [ do
mtarball <- checkRepoTarballFetched repo pkgid mtarball <- checkRepoTarballFetched repo pkgid
case mtarball of case mtarball of
Nothing -> return (Left (pkgid, repo)) Nothing -> return (Left (repo, pkgid))
Just tarball -> return (Right (pkgid, tarball)) Just tarball -> return (Right (pkgid, tarball))
| (pkgid, repo) <- repoTarballPkgsWithoutMetadata | (repo, pkgid) <- repoTarballPkgsWithoutMetadata
] ]
let repoTarballPkgsToDownload = repoTarballPkgsToDownloadWithMeta ++ repoTarballPkgsToDownloadWithNoMeta let repoTarballPkgsToDownload = repoTarballPkgsToDownloadWithMeta ++ repoTarballPkgsToDownloadWithNoMeta
...@@ -1139,9 +1143,9 @@ getPackageSourceHashes verbosity withRepoCtx solverPlan = do ...@@ -1139,9 +1143,9 @@ getPackageSourceHashes verbosity withRepoCtx solverPlan = do
| pkgid <- pkgids | pkgid <- pkgids
] ]
| (repo, pkgids) <- | (repo, pkgids) <-
map (\grp@((_, repo) :| _) -> (repo, map fst (NE.toList grp))) map (\grp@((repo, _) :| _) -> (repo, map snd (NE.toList grp)))
. NE.groupBy ((==) `on` (remoteRepoName . repoRemote . snd)) . NE.groupBy ((==) `on` (remoteRepoName . repoRemote . fst))
. sortBy (compare `on` (remoteRepoName . repoRemote . snd)) . sortBy (compare `on` (remoteRepoName . repoRemote . fst))
$ repoTarballPkgsWithMetadata $ repoTarballPkgsWithMetadata
] ]
...@@ -1153,7 +1157,7 @@ getPackageSourceHashes verbosity withRepoCtx solverPlan = do ...@@ -1153,7 +1157,7 @@ getPackageSourceHashes verbosity withRepoCtx solverPlan = do
[ do [ do
tarball <- fetchRepoTarball verbosity repoctx repo pkgid tarball <- fetchRepoTarball verbosity repoctx repo pkgid
return (pkgid, tarball) return (pkgid, tarball)
| (pkgid, repo) <- repoTarballPkgsToDownload | (repo, pkgid) <- repoTarballPkgsToDownload
] ]
return return
......
...@@ -38,6 +38,7 @@ module Distribution.Client.Utils ...@@ -38,6 +38,7 @@ module Distribution.Client.Utils
, listFilesInside , listFilesInside
, safeRead , safeRead
, hasElem , hasElem
, concatMapM
, occursOnlyOrBefore , occursOnlyOrBefore
, giveRTSWarning , giveRTSWarning
) where ) where
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment