float-out causes extra allocation

The Haskell fannkuchredux (included in nofib as fannkuch-redux) contribution of Louis Wasserman to "The Computer Language Benchmarks Game" at shootout.alioth.debian.org times out on the amd64 machines, but not on the i386. I can reproduce it on my Debian amd64 machine.

It turns out that compiling without optimisation or with a simple -O produces a fast program, but with enormously large heap space allocated (10G compared with 67k on a virtual i386 machine) and also more garbage collector activity.

The source is below (because I don't find a way to attach the file). At the end of the source I copied my make command line, run command line and output produced with option -sstderr.

{-  The Computer Language Benchmarks Game
    http://shootout.alioth.debian.org/
    contributed by Louis Wasserman
    
    This should be compiled with:
    	-threaded -O2 -fexcess-precision -fasm
    and run with:
    	+RTS -N<number of cores> -RTS <input>
-}

import Control.Concurrent
import Control.Monad
import System.Environment
import Foreign hiding (rotate)
import Data.Monoid

type Perm = Ptr Word8

data F = F {-# UNPACK #-} !Int {-# UNPACK #-} !Int

instance Monoid F where
	mempty = F 0 0
	F s1 m1 `mappend` F s2 m2 = F (s1 + s2) (max m1 m2)

incPtr = (`advancePtr` 1)
decPtr = (`advancePtr` (-1))

flop :: Int -> Perm -> IO ()
flop k xs = flopp xs (xs `advancePtr` k)
 where flopp i j = when (i < j) $ swap i j >> flopp (incPtr i) (decPtr j)
       swap i j = do
	a <- peek i
	b <- peek j
	poke j a
	poke i b

flopS :: Perm -> (Int -> IO a) -> IO a
flopS !xs f = do
	let go !acc = do
		k <- peekElemOff xs 0
		if k == 0 then f acc else flop (fromIntegral k) xs >> go (acc+1)
	go 0

increment :: Ptr Word8 -> Ptr Word8 -> IO ()
increment !p !ct = do
	first <- peekElemOff p 1
	pokeElemOff p 1 =<< peekElemOff p 0
	pokeElemOff p 0 first
	
	let go !i !first = do
		ci <- peekElemOff ct i
		if fromIntegral ci < i then pokeElemOff ct i (ci+1) else do
			pokeElemOff ct i 0
			let !i' = i + 1
			moveArray p (incPtr p) i'
			pokeElemOff p i' first
			go i' =<< peekElemOff p 0
	go 1 first  

genPermutations :: Int -> Int -> Int -> Ptr Word8 -> Ptr Word8 -> IO F
genPermutations !n !l !r !perm !count = allocaArray n $ \ destF -> do
	let upd j !f run = do
		p0 <- peekElemOff perm 0
		if p0 == 0 then increment perm count >> run f else do
			copyArray destF perm n
			increment perm count
			flopS destF $ \ flops -> 
				run (f `mappend` F (checksum j flops) flops)
	let go j !f = if j >= r then return f else upd j f (go (j+1))
	go l mempty
 where checksum i f = if i .&. 1 == 0 then f else -f

facts :: [Int]
facts = scanl (*) 1 [1..12]

unrank :: Int -> Int -> (Ptr Word8 -> Ptr Word8 -> IO a) -> IO a
unrank !idx !n f = allocaArray n $ \ p -> allocaArray n $ \ count ->
  allocaArray n $ \ pp -> do
	mapM_ (\ i -> pokeElemOff p i (fromIntegral i)) [0..n-1]
	let go i !idx = when (i >= 0) $ do
		let fi = facts !! i
		let (q, r) = idx `quotRem` fi
		pokeElemOff count i (fromIntegral q)
		copyArray pp p (i+1)
		let go' j = when (j <= i) $ do
			let jq = j + q
			pokeElemOff p j =<< peekElemOff pp (if jq <= i then jq else jq - i - 1)
			go' (j+1)
		go' 0
		go (i-1) r
	go (n-1) idx
	f p count

main = do
   n <- fmap (read.head) getArgs
   let fact = product [1..n]
   let bk = fact `quot` 4
   vars <- forM [0,bk..fact-1] $ \ ix -> do
   	var <- newEmptyMVar
   	forkIO (unrank ix n $ \ p -> genPermutations n ix (min fact (ix + bk)) p >=> putMVar var)
   	return var
   F chksm mflops <- liftM mconcat (mapM takeMVar vars)
   putStrLn $ (show chksm) ++ "\nPfannkuchen(" ++ (show n) ++ ") = " ++ (show $ mflops)


{-

wurmli@noah-nofen:~/hpw/haskell/work/fannkuch$ ghc  --make
 -XBangPatterns -O -threaded -fllvm -rtsopts fannkuchredux.ghc-3.hs 
[1 of 1] Compiling Main             ( fannkuchredux.ghc-3.hs, fannkuchredux.ghc-3.o )
Linking fannkuchredux.ghc-3 ...
wurmli@noah-nofen:~/hpw/haskell/work/fannkuch$ ./fannkuchredux.ghc-3 +RTS -N4 -sstderr   -RTS 12
3968050
Pfannkuchen(12) = 65
  10,538,122,952 bytes allocated in the heap
         359,512 bytes copied during GC
          47,184 bytes maximum residency (2 sample(s))
          51,120 bytes maximum slop
               4 MB total memory in use (1 MB lost due to fragmentation)

                                    Tot time (elapsed)  Avg pause  Max pause
  Gen  0      6053 colls,  6053 par    0.16s    0.04s     0.0000s    0.0001s
  Gen  1         2 colls,     1 par    0.00s    0.00s     0.0001s    0.0001s

  Parallel GC work balance: 40.82% (serial 0%, perfect 100%)

  TASKS: 6 (1 bound, 5 peak workers (5 total), using -N4)

  SPARKS: 0 (0 converted, 0 overflowed, 0 dud, 0 GC'd, 0 fizzled)

  INIT    time    0.00s  (  0.00s elapsed)
  MUT     time   44.73s  ( 11.51s elapsed)
  GC      time    0.16s  (  0.04s elapsed)
  EXIT    time    0.00s  (  0.00s elapsed)
  Total   time   44.89s  ( 11.55s elapsed)

  Alloc rate    235,589,887 bytes per MUT second

  Productivity  99.6% of total user, 387.3% of total elapsed

gc_alloc_block_sync: 31024
whitehole_spin: 0
gen[0].sync: 0
gen[1].sync: 0

-}

Edited Mar 10, 2019 by Ben Gamari

To upload designs, you'll need to enable LFS and have an admin enable hashed storage. More information

Admin message

float-out causes extra allocation