
optimize vector fabs of bitcasted constant integer values
ClosedPublic

Authored by spatel on Aug 4 2014, 12:51 PM.

Details

Summary

This patch allows vector fabs operations on bitcasted constant integer values to be optimized in the same way that we already optimize scalar fabs.

So for code like this:
%bitcast = bitcast i64 18446744069414584320 to <2 x float> ; 0xFFFF_FFFF_0000_0000
%fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %bitcast)
%ret = bitcast <2 x float> %fabs to i64
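
With the fold, the fabs is evaluated at compile time: bit 31 of each 32-bit lane is cleared, so the all-ones lane 0xFFFF_FFFF becomes 0x7FFF_FFFF, the zero lane is unchanged, and %ret is simply the constant 0x7FFF_FFFF_0000_0000.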

Instead of generating something like this:

movabsq (put the bitcasted integer constant in an integer register)
vmovq   (move from integer register to vector/fp register)
vandps  (mask off sign bits using a constant-pool mask)
vmovq   (move vector/fp register back to integer return register)

We should generate:

mov     (put constant value in return register)
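
For the example above, that constant is 0x7FFF_FFFF_0000_0000, so the whole function should reduce to a single immediate move plus the return (on x86-64, something like movabsq $0x7FFFFFFF00000000, %rax; the exact instruction form here is illustrative, not copied from actual output).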

I have also removed a redundant clause in the first 'if' statement:
N0.getOperand(0).getValueType().isInteger()

is the same check as:
IntVT.isInteger()

because IntVT is defined from N0.getOperand(0) in that code.

For more background, please see:
http://reviews.llvm.org/D4770

And:
http://llvm.org/bugs/show_bug.cgi?id=20354

Diff Detail

Repository
rL LLVM

Event Timeline

spatel updated this revision to Diff 12174. Aug 4 2014, 12:51 PM
spatel retitled this revision from to optimize vector fabs of bitcasted constant integer values.
spatel updated this object.
spatel edited the test plan for this revision. (Show Details)
spatel added reviewers: rengolin, pete, chandlerc.
spatel added a subscriber: Unknown Object (MLST).
spatel added a comment. Aug 5 2014, 9:29 AM

I know Renato is interested in the change to ARM codegen for the FNEG version of this optimization (not included in this patch) because it will change an existing test case. Here's ARM codegen for the 2 new FABS cases in this proposed patch.

I'm not sure what the LLVM policy is for duplicating codegen test cases across architectures, but if that's considered a good thing, I can add these to test/CodeGen/ARM:

Using:
$ ./llc -o - vec_fabs.ll -march=arm -mcpu=cortex-a8 -mattr=+neon

We get:

_fabs_v2f32_1:                          @ @fabs_v2f32_1
@ BB#0:
	mvn	r0, #0
	mov	r1, #0
	vmov	d16, r1, r0
	vabs.f32	d16, d16
	vmov	r0, r1, d16
	bx	lr

_fabs_v2f32_2:                          @ @fabs_v2f32_2
@ BB#0:
	mov	r0, #0
	mvn	r1, #0
	vmov	d16, r1, r0
	vabs.f32	d16, d16
	vmov	r0, r1, d16
	bx	lr
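
In this unoptimized form, the bitcasted constant is first assembled in r0 and r1 with mov/mvn, transferred to d16, the vabs.f32 runs at execution time, and the result is moved back to the integer return registers.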

After the optimization, this would become:

_fabs_v2f32_1:                          @ @fabs_v2f32_1
@ BB#0:
	mov	r0, #0
	mvn	r1, #-2147483648
	mov	pc, lr

_fabs_v2f32_2:                          @ @fabs_v2f32_2
@ BB#0:
	mvn	r0, #-2147483648
	mov	r1, #0
	mov	pc, lr
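
Here the fabs has been folded away at compile time: mvn with #-2147483648 (0x80000000) produces 0x7FFFFFFF, i.e. the all-ones lane with its sign bit cleared, while the zero lane stays zero, so each function now needs only two integer moves and no NEON instructions.
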
rengolin accepted this revision. Aug 5 2014, 9:43 AM
rengolin edited edge metadata.

Hi Sanjay,

That looks good to me, thanks for taking the time.

Duplicating the fabs cases in the ARM tree, adding a CHECK: mvn + CHECK-NOT: vabs, would be great, with a little explanation just like you did for x86 (see the sketch below).

Thanks!
--renato
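
A hypothetical sketch of the kind of ARM test being requested (the RUN line, check prefixes, and function body are assumed here, mirroring the existing x86 test and the llc invocation above):

; RUN: llc < %s -march=arm -mcpu=cortex-a8 -mattr=+neon | FileCheck %s

define i64 @fabs_v2f32_1() {
; CHECK-LABEL: fabs_v2f32_1:
; CHECK: mvn
; CHECK-NOT: vabs
  %bitcast = bitcast i64 18446744069414584320 to <2 x float> ; 0xFFFF_FFFF_0000_0000
  %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %bitcast)
  %ret = bitcast <2 x float> %fabs to i64
  ret i64 %ret
}

declare <2 x float> @llvm.fabs.v2f32(<2 x float>)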

This revision is now accepted and ready to land. Aug 5 2014, 9:43 AM
spatel closed this revision. Aug 5 2014, 10:44 AM
spatel updated this revision to Diff 12202.

Closed by commit rL214892 (authored by @spatel).