This is an archive of the discontinued LLVM Phabricator instance.

[x86] use psubus for more vsetcc lowering (PR39859)
ClosedPublic

Authored by spatel on Apr 17 2019, 2:51 PM.

Download Raw Diff

Details

Reviewers

andreadb
craig.topper
RKSimon
nikic

Commits

rG12a561fa1b79: [x86] use psubus for more vsetcc lowering (PR39859)
rL358999: [x86] use psubus for more vsetcc lowering (PR39859)

Summary

Circling back to a leftover bit from PR39859:
https://bugs.llvm.org/show_bug.cgi?id=39859#c1

...we have this counter-intuitive (based on the test diffs) opportunity to use 'psubus'. This appears to be the better perf option for both Haswell and Jaguar based on llvm-mca. We already do this transform for the SETULT predicate, so this would make the code more symmetrical too. If we have pminub/pminuw, we prefer those, so this should not affect anything but pre-SSE4.1 subtargets.

$ cat before.s 
	movdqa	-16(%rip), %xmm2    ## xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
	pxor	%xmm0, %xmm2
	pcmpgtw	-32(%rip), %xmm2 ## xmm2 = [255,255,255,255,255,255,255,255]
	pand	%xmm2, %xmm0
	pandn	%xmm1, %xmm2
	por	%xmm2, %xmm0

$ cat after.s 
	movdqa	-16(%rip), %xmm2    ## xmm2 = [256,256,256,256,256,256,256,256]
	psubusw	%xmm0, %xmm2
	pxor	%xmm3, %xmm3
	pcmpeqw	%xmm2, %xmm3
	pand	%xmm3, %xmm0
	pandn	%xmm1, %xmm3
	por	%xmm3, %xmm0

$ llvm-mca before.s -mcpu=haswell
Iterations:        100
Instructions:      600
Total Cycles:      909
Total uOps:        700

Dispatch Width:    4
uOps Per Cycle:    0.77
IPC:               0.66
Block RThroughput: 1.8


$ llvm-mca after.s -mcpu=haswell
Iterations:        100
Instructions:      700
Total Cycles:      409
Total uOps:        700

Dispatch Width:    4
uOps Per Cycle:    1.71
IPC:               1.71
Block RThroughput: 1.8

Diff Detail

Event Timeline

spatel created this revision. Apr 17 2019, 2:51 PM

Herald added a project: Restricted Project. · View Herald Transcript · Apr 17 2019, 2:51 PM

Herald added subscribers: hiraditya, mcrosier. · View Herald Transcript

craig.topper added inline comments. Apr 17 2019, 6:33 PM

llvm/lib/Target/X86/X86ISelLowering.cpp
19747	Would it make sense to merge this with decrementVectorConstant using a flag or something?

spatel marked an inline comment as done. Apr 18 2019, 6:28 AM

spatel added inline comments.

llvm/lib/Target/X86/X86ISelLowering.cpp
19747	Yes - I just did the slightly quicker copy/paste version first to see if there were any objections to the direction. Will update.

Patch updated:
Add inc or dec param to helper function to reduce code duplication.

LGTM

This revision is now accepted and ready to land. Apr 23 2019, 4:28 AM

Closed by commit rL358999: [x86] use psubus for more vsetcc lowering (PR39859) (authored by spatel). · Explain Why · Apr 23 2019, 8:20 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

lib/

Target/

X86/

X86ISelLowering.cpp

40 lines

test/

CodeGen/

X86/

vec_setcc-2.ll

13 lines

Diff 195629

llvm/lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 19,738 Lines • ▼ Show 20 Lines	for (unsigned i = 0; i < NumElts; ++i) {
if (Elt->getAPIntValue().isNullValue())		if (Elt->getAPIntValue().isNullValue())
return SDValue();		return SDValue();

NewVecC.push_back(DAG.getConstant(Elt->getAPIntValue() - 1, DL, EltVT));		NewVecC.push_back(DAG.getConstant(Elt->getAPIntValue() - 1, DL, EltVT));
}		}

return DAG.getBuildVector(VT, DL, NewVecC);		return DAG.getBuildVector(VT, DL, NewVecC);
}		}

		craig.topper (inline comment, not done): Would it make sense to merge this with decrementVectorConstant using a flag or something?
		spatel (author, inline comment, done): Yes - I just did the slightly quicker copy/paste version first to see if there were any objections to the direction. Will update.
		/// Given a simple buildvector constant, return a new vector constant with each
		/// element incremented. If incrementing would result in overflow or this
		/// is not a simple vector constant, return an empty value.
		static SDValue incrementVectorConstant(SDValue V, SelectionDAG &DAG) {
		auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
		if (!BV)
		return SDValue();

		MVT VT = V.getSimpleValueType();
		MVT EltVT = VT.getVectorElementType();
		unsigned NumElts = VT.getVectorNumElements();
		SmallVector<SDValue, 8> NewVecC;
		SDLoc DL(V);
		for (unsigned i = 0; i < NumElts; ++i) {
		auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
		if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
		return SDValue();

		// Avoid overflow.
		if (Elt->getAPIntValue().isMaxValue())
		return SDValue();

		NewVecC.push_back(DAG.getConstant(Elt->getAPIntValue() + 1, DL, EltVT));
		}

		return DAG.getBuildVector(VT, DL, NewVecC);
		}

/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for		/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
/// Op0 u<= Op1:		/// Op0 u<= Op1:
/// t = psubus Op0, Op1		/// t = psubus Op0, Op1
/// pcmpeq t, <0..0>		/// pcmpeq t, <0..0>
static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,		static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
ISD::CondCode Cond, const SDLoc &dl,		ISD::CondCode Cond, const SDLoc &dl,
const X86Subtarget &Subtarget,		const X86Subtarget &Subtarget,
SelectionDAG &DAG) {		SelectionDAG &DAG) {
Show All 16 Lines	case ISD::SETULT: {
if (Subtarget.hasAVX())		if (Subtarget.hasAVX())
return SDValue();		return SDValue();
SDValue ULEOp1 = decrementVectorConstant(Op1, DAG);		SDValue ULEOp1 = decrementVectorConstant(Op1, DAG);
if (!ULEOp1)		if (!ULEOp1)
return SDValue();		return SDValue();
Op1 = ULEOp1;		Op1 = ULEOp1;
break;		break;
}		}
		case ISD::SETUGT: {
		// If the comparison is against a constant, we can turn this into a setuge.
		// This is beneficial because materializing a constant 0 for the PCMPEQ is
		// probably cheaper than XOR+PCMPGT using 2 different vector constants:
		// cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
		SDValue UGEOp1 = incrementVectorConstant(Op1, DAG);
		if (!UGEOp1)
		return SDValue();
		Op1 = Op0;
		Op0 = UGEOp1;
		break;
		}
// Psubus is better than flip-sign because it requires no inversion.		// Psubus is better than flip-sign because it requires no inversion.
case ISD::SETUGE:		case ISD::SETUGE:
std::swap(Op0, Op1);		std::swap(Op0, Op1);
break;		break;
case ISD::SETULE:		case ISD::SETULE:
break;		break;
}		}

▲ Show 20 Lines • Show All 24,247 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/vec_setcc-2.ll

	Show First 20 Lines • Show All 188 Lines • ▼ Show 20 Lines
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	%cmp = icmp ugt <16 x i8> %x, <i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42>			%cmp = icmp ugt <16 x i8> %x, <i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42>
	ret <16 x i1> %cmp			ret <16 x i1> %cmp
	}			}

	define <8 x i1> @ugt_v8i16_splat(<8 x i16> %x) {			define <8 x i1> @ugt_v8i16_splat(<8 x i16> %x) {
	; SSE2-LABEL: ugt_v8i16_splat:			; SSE2-LABEL: ugt_v8i16_splat:
	; SSE2: ## %bb.0:			; SSE2: ## %bb.0:
	; SSE2-NEXT: pxor {{.*}}(%rip), %xmm0			; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [243,243,243,243,243,243,243,243]
	; SSE2-NEXT: pcmpgtw {{.*}}(%rip), %xmm0			; SSE2-NEXT: psubusw %xmm0, %xmm1
				; SSE2-NEXT: pxor %xmm0, %xmm0
				; SSE2-NEXT: pcmpeqw %xmm1, %xmm0
	; SSE2-NEXT: retq			; SSE2-NEXT: retq
	;			;
	; SSE41-LABEL: ugt_v8i16_splat:			; SSE41-LABEL: ugt_v8i16_splat:
	; SSE41: ## %bb.0:			; SSE41: ## %bb.0:
	; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [243,243,243,243,243,243,243,243]			; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [243,243,243,243,243,243,243,243]
	; SSE41-NEXT: pmaxuw %xmm0, %xmm1			; SSE41-NEXT: pmaxuw %xmm0, %xmm1
	; SSE41-NEXT: pcmpeqw %xmm1, %xmm0			; SSE41-NEXT: pcmpeqw %xmm1, %xmm0
	; SSE41-NEXT: retq			; SSE41-NEXT: retq
	▲ Show 20 Lines • Show All 329 Lines • ▼ Show 20 Lines
	; SSE41-NEXT: retq			; SSE41-NEXT: retq
	%cmp = icmp ugt <4 x i32> <i32 4, i32 4, i32 4, i32 4>, %x			%cmp = icmp ugt <4 x i32> <i32 4, i32 4, i32 4, i32 4>, %x
	ret <4 x i1> %cmp			ret <4 x i1> %cmp
	}			}

	define <8 x i16> @PR39859(<8 x i16> %x, <8 x i16> %y) {			define <8 x i16> @PR39859(<8 x i16> %x, <8 x i16> %y) {
	; SSE2-LABEL: PR39859:			; SSE2-LABEL: PR39859:
	; SSE2: ## %bb.0:			; SSE2: ## %bb.0:
	; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]			; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [43,43,43,43,43,43,43,43]
	; SSE2-NEXT: pxor %xmm0, %xmm2			; SSE2-NEXT: psubusw %xmm0, %xmm3
	; SSE2-NEXT: pcmpgtw {{.*}}(%rip), %xmm2			; SSE2-NEXT: pxor %xmm2, %xmm2
				; SSE2-NEXT: pcmpeqw %xmm3, %xmm2
	; SSE2-NEXT: pand %xmm2, %xmm1			; SSE2-NEXT: pand %xmm2, %xmm1
	; SSE2-NEXT: pandn %xmm0, %xmm2			; SSE2-NEXT: pandn %xmm0, %xmm2
	; SSE2-NEXT: por %xmm1, %xmm2			; SSE2-NEXT: por %xmm1, %xmm2
	; SSE2-NEXT: movdqa %xmm2, %xmm0			; SSE2-NEXT: movdqa %xmm2, %xmm0
	; SSE2-NEXT: retq			; SSE2-NEXT: retq
	;			;
	; SSE41-LABEL: PR39859:			; SSE41-LABEL: PR39859:
	; SSE41: ## %bb.0:			; SSE41: ## %bb.0:
	Show All 12 Lines