Skip to content

Branchless arithmetic operations

While working on #9281 (closed) I noticed sub-optimal code generation for common arithmetic operations on Int. Below are some examples of code generation, where the branchless versions may be desirable on modern CPUs:

abs

-- base version
absI# :: Int# -> Int#
absI# i# = i'#
  where
    !(I# i'#) = abs (I# i#)

-- optimized version
optAbsI# :: Int# -> Int#
optAbsI# i# = (i# `xorI#` nsign) -# nsign
  where
    -- nsign = i# <# 0#
    nsign = uncheckedIShiftRA# i# (WORD_SIZE_IN_BITS# -# 1#)

results in

absI#_info:
_c1To:
	testq %r14,%r14
	jge _c1Tx
_c1Ty:
	movq %r14,%rbx
	negq %rbx
	jmp *(%rbp)
_c1Tx:
	movq %r14,%rbx
	jmp *(%rbp)


optAbsI#_info:
_c1SX:
	movq %r14,%rax
	sarq $63,%rax
	movq %r14,%rbx
	xorq %rax,%rbx
	subq %rax,%rbx
	jmp *(%rbp)

signum

sgnI# :: Int# -> Int#
sgnI# i# = i'#
  where
    !(I# i'#) = signum (I# i#)

optSgnI# :: Int# -> Int#
optSgnI# x# = (x# ># 0#) -# (x# <# 0#)
sgnI#_info:
_c27W:
        testq %r14,%r14
        jl _c283
_c284:
        testq %r14,%r14
        jne _c28a
_c28b:
        xorl %ebx,%ebx
        jmp *(%rbp)
_c283:
        movq $-1,%rbx
        jmp *(%rbp)
_c28a:
        movl $1,%ebx
        jmp *(%rbp)


optSgnI#_info:
_c26P:
        testq %r14,%r14
        setl %al
        movzbl %al,%eax
        testq %r14,%r14
        setg %bl
        movzbl %bl,%ebx
        subq %rax,%rbx
        jmp *(%rbp)

compare

cmpI# :: Int# -> Int# -> Int#
cmpI# x# y# = dataToTag# (compare (I# x#) (I# y#))

-- returns 0, 1 or 2, can be fed to tagToEnum# :: Ordering
optCmpI# :: Int# -> Int# -> Int#
optCmpI# x# y# = 1# +# (x# ># y#) -# (x# <# y#)
cmpI#_info:
_c25s:
	cmpq %rsi,%r14
	jl _c25z
_c25A:
	cmpq %rsi,%r14
	je _c25M
_c25N:
	movl $2,%ebx
	jmp *(%rbp)
_c25z:
	xorl %ebx,%ebx
	jmp *(%rbp)
_c25M:
	movl $1,%ebx
	jmp *(%rbp)


optCmpI#_info:
_c24N:
	cmpq %rsi,%r14
	setl %al
	movzbl %al,%eax
	movl $1,%ebx
	subq %rax,%rbx
	cmpq %rsi,%r14
	setg %al
	movq %rbx,%rcx
	movzbl %al,%ebx
	addq %rcx,%rbx
	jmp *(%rbp)
Trac metadata
Trac field Value
Version 7.8.3
Type FeatureRequest
TypeOfFailure OtherFailure
Priority normal
Resolution Unresolved
Component Compiler (CodeGen)
Test case
Differential revisions
BlockedBy
Related
Blocking
CC simonmar
Operating system
Architecture
To upload designs, you'll need to enable LFS and have an admin enable hashed storage. More information