This is an archive of the discontinued LLVM Phabricator instance.

optimize vector fneg of bitcasted integer value
ClosedPublic

Authored by spatel on Aug 11 2014, 1:24 PM.

Download Raw Diff

Details

Reviewers

rengolin
chandlerc
t.p.northover
asl

Commits

rG35d3133650e7: optimize vector fneg of bitcasted integer value
rL215646: optimize vector fneg of bitcasted integer value

Summary

This patch allows a vector fneg of a bitcasted integer value to be optimized in the same way that we already optimize a scalar fneg. If the integer variable is a constant, we can precompute the result and not require any logic ops.

So for x86, instead of something like this:

movd       %rdi, %xmm0
xorps      .LCPI2_0(%rip), %xmm0  ; constant pool load of sign mask

We should generate:

movabsq     (put sign bit mask in integer register via immediate)
xorq        (flip sign bits)

For ARM, this patch replaces the test case in test/CodeGen/ARM/2009-10-21-InvalidFNeg.ll with a new test case in test/CodeGen/ARM/fnegs.ll. That test file covers several ARM hardware variants. In each run of the new testcase, we should now just be using the most basic integer op (eor) rather than VFP/NEON.

For reference, the replaced test case used to generate:

add	r1, sp, #36
add	r0, r0, #48
vld1.32	{d16[0]}, [r1:32]
add	r1, r1, #4
vld1.32	{d16[1]}, [r1:32]
add	r1, sp, #44
vld1.32	{d17[0]}, [r1:32]
add	r1, r1, #4
vld1.32	{d17[1]}, [r1:32]
vneg.f32	q8, q8
vst1.64	{d16, d17}, [r0:128]
bx	lr

And should now generate:

push	{r4, lr}
ldr	r1, [sp, #48]
ldr	r12, [sp, #52]
ldr	r2, [sp, #56]
eor	r1, r1, #-2147483648
ldr	lr, [sp, #44]
eor	r3, r12, #-2147483648
eor	r4, r2, #-2147483648
add	r12, r0, #52
eor	r2, lr, #-2147483648
str	r2, [r0, #48]
stm	r12, {r1, r3, r4}
pop	{r4, pc}

This is a sibling patch to an fabs optimization that was checked in at r214892:
http://reviews.llvm.org/D4785

Ideally, we can refactor the visitFNEG and visitFABS functions in DAGCombiner since they are very similar, but I'll leave that for another patch.

Both patches originated from PR20354:
http://llvm.org/bugs/show_bug.cgi?id=20354

Diff Detail

Repository: rL LLVM

Event Timeline

spatel updated this revision to Diff 12365.Aug 11 2014, 1:24 PM

spatel retitled this revision from to optimize vector fneg of bitcasted integer value.

spatel updated this object.

spatel edited the test plan for this revision. (Show Details)

spatel added reviewers: rengolin, asl, t.p.northover, chandlerc.

spatel added a subscriber: Unknown Object (MLST).

Herald added a subscriber: aemerson. · View Herald TranscriptAug 11 2014, 1:24 PM

Fixed FileCheck prefixes for ARM testcase.

Hi Sanjay,

The new code's "estimated" cost is so close to the old one that I won't object. But I fear this might create some regressions, since the cost of using the vector pipeline in modern ARM cores is next to zero, and using both pipelines at the same time might actually be faster.

However, I tried to run them on an ARM OOO chip and it made no noticeable difference, so go for it! Any relevant side effect should be picked up later and could be fixed on the ARM DAG legalizer, if needed.

cheers,
--renato

This revision is now accepted and ready to land.Aug 13 2014, 6:26 AM

Closed by commit rL215646 (authored by @spatel).

Revision Contents

Path

Size

llvm/

trunk/

lib/

CodeGen/

SelectionDAG/

DAGCombiner.cpp

23 lines

test/

CodeGen/

ARM/

2009-10-21-InvalidFNeg.ll

48 lines

fnegs.ll

46 lines

X86/

vec_fneg.ll

22 lines

Diff 12506

llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 7,315 Lines • ▼ Show 20 Lines	if (VT.isVector()) {
SDValue FoldedVOp = SimplifyVUnaryOp(N);		SDValue FoldedVOp = SimplifyVUnaryOp(N);
if (FoldedVOp.getNode()) return FoldedVOp;		if (FoldedVOp.getNode()) return FoldedVOp;
}		}

if (isNegatibleForFree(N0, LegalOperations, DAG.getTargetLoweringInfo(),		if (isNegatibleForFree(N0, LegalOperations, DAG.getTargetLoweringInfo(),
&DAG.getTarget().Options))		&DAG.getTarget().Options))
return GetNegatedExpression(N0, DAG, LegalOperations);		return GetNegatedExpression(N0, DAG, LegalOperations);

// Transform fneg(bitconvert(x)) -> bitconvert(x^sign) to avoid loading		// Transform fneg(bitconvert(x)) -> bitconvert(x ^ sign) to avoid loading
// constant pool values.		// constant pool values.
// TODO: We can also optimize for vectors here, but we need to make sure
// that the sign mask is created properly for each vector element.
if (!TLI.isFNegFree(VT) && N0.getOpcode() == ISD::BITCAST &&		if (!TLI.isFNegFree(VT) && N0.getOpcode() == ISD::BITCAST &&
!VT.isVector() &&		N0.getNode()->hasOneUse()) {
N0.getNode()->hasOneUse() &&
N0.getOperand(0).getValueType().isInteger()) {
SDValue Int = N0.getOperand(0);		SDValue Int = N0.getOperand(0);
EVT IntVT = Int.getValueType();		EVT IntVT = Int.getValueType();
if (IntVT.isInteger() && !IntVT.isVector()) {		if (IntVT.isInteger() && !IntVT.isVector()) {
		APInt SignMask;
		if (N0.getValueType().isVector()) {
		// For a vector, get a mask such as 0x80... per scalar element
		// and splat it.
		SignMask = APInt::getSignBit(N0.getValueType().getScalarSizeInBits());
		SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask);
		} else {
		// For a scalar, just generate 0x80...
		SignMask = APInt::getSignBit(IntVT.getSizeInBits());
		}
Int = DAG.getNode(ISD::XOR, SDLoc(N0), IntVT, Int,		Int = DAG.getNode(ISD::XOR, SDLoc(N0), IntVT, Int,
DAG.getConstant(APInt::getSignBit(IntVT.getSizeInBits()), IntVT));		DAG.getConstant(SignMask, IntVT));
AddToWorklist(Int.getNode());		AddToWorklist(Int.getNode());
return DAG.getNode(ISD::BITCAST, SDLoc(N),		return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Int);
VT, Int);
}		}
}		}

// (fneg (fmul c, x)) -> (fmul -c, x)		// (fneg (fmul c, x)) -> (fmul -c, x)
if (N0.getOpcode() == ISD::FMUL) {		if (N0.getOpcode() == ISD::FMUL) {
ConstantFPSDNode *CFP1 = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));		ConstantFPSDNode *CFP1 = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
if (CFP1) {		if (CFP1) {
APFloat CVal = CFP1->getValueAPF();		APFloat CVal = CFP1->getValueAPF();
▲ Show 20 Lines • Show All 4,598 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/ARM/2009-10-21-InvalidFNeg.ll

	; RUN: llc -mcpu=cortex-a8 -mattr=+neon < %s \| grep vneg
	target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"
	target triple = "armv7-eabi"

	%aaa = type { %fff, %fff }
	%bbb = type { [6 x %ddd] }
	%ccc = type { %eee, %fff }
	%ddd = type { %fff }
	%eee = type { %fff, %fff, %fff, %fff }
	%fff = type { %struct.vec_float4 }
	%struct.vec_float4 = type { <4 x float> }

	define linkonce_odr arm_aapcs_vfpcc void @foo(%eee* noalias sret %agg.result, i64 %tfrm.0.0, i64 %tfrm.0.1, i64 %tfrm.0.2, i64 %tfrm.0.3, i64 %tfrm.0.4, i64 %tfrm.0.5, i64 %tfrm.0.6, i64 %tfrm.0.7) nounwind noinline {
	entry:
	%tmp104 = zext i64 %tfrm.0.2 to i512 ; <i512> [#uses=1]
	%tmp105 = shl i512 %tmp104, 128 ; <i512> [#uses=1]
	%tmp118 = zext i64 %tfrm.0.3 to i512 ; <i512> [#uses=1]
	%tmp119 = shl i512 %tmp118, 192 ; <i512> [#uses=1]
	%ins121 = or i512 %tmp119, %tmp105 ; <i512> [#uses=1]
	%tmp99 = zext i64 %tfrm.0.4 to i512 ; <i512> [#uses=1]
	%tmp100 = shl i512 %tmp99, 256 ; <i512> [#uses=1]
	%tmp123 = zext i64 %tfrm.0.5 to i512 ; <i512> [#uses=1]
	%tmp124 = shl i512 %tmp123, 320 ; <i512> [#uses=1]
	%tmp96 = zext i64 %tfrm.0.6 to i512 ; <i512> [#uses=1]
	%tmp97 = shl i512 %tmp96, 384 ; <i512> [#uses=1]
	%tmp128 = zext i64 %tfrm.0.7 to i512 ; <i512> [#uses=1]
	%tmp129 = shl i512 %tmp128, 448 ; <i512> [#uses=1]
	%mask.masked = or i512 %tmp124, %tmp100 ; <i512> [#uses=1]
	%ins131 = or i512 %tmp129, %tmp97 ; <i512> [#uses=1]
	%tmp109132 = zext i64 %tfrm.0.0 to i128 ; <i128> [#uses=1]
	%tmp113134 = zext i64 %tfrm.0.1 to i128 ; <i128> [#uses=1]
	%tmp114133 = shl i128 %tmp113134, 64 ; <i128> [#uses=1]
	%tmp94 = or i128 %tmp114133, %tmp109132 ; <i128> [#uses=1]
	%tmp95 = bitcast i128 %tmp94 to <4 x float> ; <<4 x float>> [#uses=0]
	%tmp82 = lshr i512 %ins121, 128 ; <i512> [#uses=1]
	%tmp83 = trunc i512 %tmp82 to i128 ; <i128> [#uses=1]
	%tmp84 = bitcast i128 %tmp83 to <4 x float> ; <<4 x float>> [#uses=0]
	%tmp86 = lshr i512 %mask.masked, 256 ; <i512> [#uses=1]
	%tmp87 = trunc i512 %tmp86 to i128 ; <i128> [#uses=1]
	%tmp88 = bitcast i128 %tmp87 to <4 x float> ; <<4 x float>> [#uses=0]
	%tmp90 = lshr i512 %ins131, 384 ; <i512> [#uses=1]
	%tmp91 = trunc i512 %tmp90 to i128 ; <i128> [#uses=1]
	%tmp92 = bitcast i128 %tmp91 to <4 x float> ; <<4 x float>> [#uses=1]
	%tmp = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %tmp92 ; <<4 x float>> [#uses=1]
	%tmp28 = getelementptr inbounds %eee* %agg.result, i32 0, i32 3, i32 0, i32 0 ; <<4 x float>*> [#uses=1]
	store <4 x float> %tmp, <4 x float>* %tmp28, align 16
	ret void
	}

llvm/trunk/test/CodeGen/ARM/fnegs.ll

	Show First 20 Lines • Show All 67 Lines • ▼ Show 20 Lines
	; CORTEXA8: vneg.f32 s{{.}}, s{{.}}			; CORTEXA8: vneg.f32 s{{.}}, s{{.}}

	; CORTEXA8U-LABEL: test2:			; CORTEXA8U-LABEL: test2:
	; CORTEXA8U: vneg.f32 d{{.}}, d{{.}}			; CORTEXA8U: vneg.f32 d{{.}}, d{{.}}

	; CORTEXA9-LABEL: test2:			; CORTEXA9-LABEL: test2:
	; CORTEXA9: vneg.f32 s{{.}}, s{{.}}			; CORTEXA9: vneg.f32 s{{.}}, s{{.}}

				; If we're bitcasting an integer to an FP vector, we should avoid the FP/vector unit entirely.
				; Make sure that we're flipping the sign bit and only the sign bit of each float (PR20354).
				; So instead of something like this:
				; vmov d16, r0, r1
				; vneg.f32 d16, d16
				; vmov r0, r1, d16
				;
				; We should generate:
				; eor r0, r0, #-2147483648
				; eor r1, r1, #-2147483648

				define <2 x float> @fneg_bitcast(i64 %i) {
				%bitcast = bitcast i64 %i to <2 x float>
				%fneg = fsub <2 x float> <float -0.0, float -0.0>, %bitcast
				ret <2 x float> %fneg
				}
				; VFP2-LABEL: fneg_bitcast:
				; VFP2-DAG: eor r0, r0, #-2147483648
				; VFP2-DAG: eor r1, r1, #-2147483648
				; VFP2-NOT: vneg.f32

				; NFP1-LABEL: fneg_bitcast:
				; NFP1-DAG: eor r0, r0, #-2147483648
				; NFP1-DAG: eor r1, r1, #-2147483648
				; NFP1-NOT: vneg.f32

				; NFP0-LABEL: fneg_bitcast:
				; NFP0-DAG: eor r0, r0, #-2147483648
				; NFP0-DAG: eor r1, r1, #-2147483648
				; NFP0-NOT: vneg.f32

				; CORTEXA8-LABEL: fneg_bitcast:
				; CORTEXA8-DAG: eor r0, r0, #-2147483648
				; CORTEXA8-DAG: eor r1, r1, #-2147483648
				; CORTEXA8-NOT: vneg.f32

				; CORTEXA8U-LABEL: fneg_bitcast:
				; CORTEXA8U-DAG: eor r0, r0, #-2147483648
				; CORTEXA8U-DAG: eor r1, r1, #-2147483648
				; CORTEXA8U-NOT: vneg.f32

				; CORTEXA9-LABEL: fneg_bitcast:
				; CORTEXA9-DAG: eor r0, r0, #-2147483648
				; CORTEXA9-DAG: eor r1, r1, #-2147483648
				; CORTEXA9-NOT: vneg.f32

llvm/trunk/test/CodeGen/X86/vec_fneg.ll

	Show All 15 Lines
	; CHECK-LABEL: t2:			; CHECK-LABEL: t2:
	; CHECK: xorps %[[X:xmm[0-9]+]], %[[X]]			; CHECK: xorps %[[X:xmm[0-9]+]], %[[X]]
	; CHECK-NEXT: subps %xmm0, %[[X]]			; CHECK-NEXT: subps %xmm0, %[[X]]
	; CHECK-NEXT: movaps %[[X]], %xmm0			; CHECK-NEXT: movaps %[[X]], %xmm0
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	%tmp = fsub <4 x float> zeroinitializer, %Q			%tmp = fsub <4 x float> zeroinitializer, %Q
	ret <4 x float> %tmp			ret <4 x float> %tmp
	}			}

				; If we're bitcasting an integer to an FP vector, we should avoid the FPU/vector unit entirely.
				; Make sure that we're flipping the sign bit and only the sign bit of each float.
				; So instead of something like this:
				; movd %rdi, %xmm0
				; xorps .LCPI2_0(%rip), %xmm0
				;
				; We should generate:
				; movabsq (put sign bit mask in integer register)
				; xorq (flip sign bits)
				; movd (move to xmm return register)

				define <2 x float> @fneg_bitcast(i64 %i) {
				; CHECK-LABEL: fneg_bitcast:
				; CHECK: movabsq $-9223372034707292160, %rax # imm = 0x8000000080000000
				; CHECK-NEXT: xorq %rdi, %rax
				; CHECK-NEXT: movd %rax, %xmm0
				; CHECK-NEXT: retq
				%bitcast = bitcast i64 %i to <2 x float>
				%fneg = fsub <2 x float> <float -0.0, float -0.0>, %bitcast
				ret <2 x float> %fneg
				}