This is an archive of the discontinued LLVM Phabricator instance.

[X86][SSE] Convert PTEST to MOVMSK for allsign bits vector results.
ClosedPublic

Authored by RKSimon on May 26 2020, 8:21 AM.

Download Raw Diff

Details

Reviewers

craig.topper
spatel
andreadb

Commits

rG410667f1b74c: [X86][SSE] Convert PTEST to MOVMSK for allsign bits vector results

Summary

If we are using PTEST to check a vector whose elements are all 'allsign bits' values (every bit of each element is a copy of that element's sign bit), we can use MOVMSK to extract the sign bits directly and perform the comparison on the scalar value.

For vXi16 cases, as we don't have a MOVMSK for this type, we must bitcast to v2Xi8, use PMOVMSKB, and mask the sign bit of each i16 element (every other bit) out of the result; that mask folds into the TEST comparison.

If this allows us to remove a vector op (via the SimplifyMultipleUseDemandedBits call), this is consistently faster than a PTEST (https://godbolt.org/z/ziJUst).

I'm investigating whether we would ever get regressions without the SimplifyMultipleUseDemandedBits call (i.e. performing the conversion even when it doesn't remove a vector op), but that has exposed some other poor codegen issues that I'm still investigating, so it will have to wait for a later patch.

Suggested on PR42035 to avoid unnecessary ashr(x,bw-1)/pcmpgt(0,x) sign splat patterns feeding into ptest.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

RKSimon created this revision. · May 26 2020, 8:21 AM

Herald added a project: Restricted Project. · View Herald Transcript · May 26 2020, 8:21 AM

Herald added a subscriber: hiraditya. · View Herald Transcript

Harbormaster completed remote builds in B57895: Diff 266222. · May 26 2020, 10:50 AM

craig.topper added inline comments. · May 26 2020, 10:53 AM

llvm/lib/Target/X86/X86ISelLowering.cpp
40178	Do we need getPMOVMSKB here, or can we just use plain getNode? The only thing getPMOVMSKB does is handle AVX and BWI splitting, right? But if we start from PTEST, we should never need to split?

RKSimon marked an inline comment as done. · May 26 2020, 11:13 AM

RKSimon added inline comments.

llvm/lib/Target/X86/X86ISelLowering.cpp
40178	AVX1 code can technically get here for v16i16/v32i8 cases, depending on how good a job SimplifyMultipleUseDemandedBits has managed. VPTEST is one of the rare 256-bit integer instructions that is available on AVX1!

LGTM

llvm/lib/Target/X86/X86ISelLowering.cpp
40178	OK, thanks for the clarification.

This revision is now accepted and ready to land. · May 27 2020, 12:32 AM

Closed by commit rG410667f1b74c: [X86][SSE] Convert PTEST to MOVMSK for allsign bits vector results (authored by RKSimon). · Explain Why · May 27 2020, 3:44 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

lib/

Target/

X86/

X86ISelLowering.cpp

37 lines

test/

CodeGen/

X86/

combine-ptest.ll

20 lines

Diff 266479

llvm/lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 40,073 Lines • ▼ Show 20 Lines	static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
}		}

return SDValue();		return SDValue();
}		}

/// If we are inverting an PTEST/TESTP operand, attempt to adjust the CC		/// If we are inverting an PTEST/TESTP operand, attempt to adjust the CC
/// to avoid the inversion.		/// to avoid the inversion.
static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,		static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
SelectionDAG &DAG) {		SelectionDAG &DAG,
		const X86Subtarget &Subtarget) {
// TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.		// TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
if (EFLAGS.getOpcode() != X86ISD::PTEST &&		if (EFLAGS.getOpcode() != X86ISD::PTEST &&
EFLAGS.getOpcode() != X86ISD::TESTP)		EFLAGS.getOpcode() != X86ISD::TESTP)
return SDValue();		return SDValue();

// PTEST/TESTP sets EFLAGS as:		// PTEST/TESTP sets EFLAGS as:
// TESTZ: ZF = (Op0 & Op1) == 0		// TESTZ: ZF = (Op0 & Op1) == 0
// TESTC: CF = (~Op0 & Op1) == 0		// TESTC: CF = (~Op0 & Op1) == 0
▲ Show 20 Lines • Show All 45 Lines • ▼ Show 20 Lines	if (CC == X86::COND_E \|\| CC == X86::COND_NE) {
if (SDValue NotOp1 = IsNOT(Op1, DAG)) {		if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);		CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,		return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
DAG.getBitcast(OpVT, NotOp1), Op0);		DAG.getBitcast(OpVT, NotOp1), Op0);
}		}

if (Op0 == Op1) {		if (Op0 == Op1) {
SDValue BC = peekThroughBitcasts(Op0);		SDValue BC = peekThroughBitcasts(Op0);
		EVT BCVT = BC.getValueType();
		assert(BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
		"Unexpected vector type");

// TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)		// TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
if (BC.getOpcode() == ISD::AND \|\| BC.getOpcode() == X86ISD::FAND) {		if (BC.getOpcode() == ISD::AND \|\| BC.getOpcode() == X86ISD::FAND) {
return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,		return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
DAG.getBitcast(OpVT, BC.getOperand(0)),		DAG.getBitcast(OpVT, BC.getOperand(0)),
DAG.getBitcast(OpVT, BC.getOperand(1)));		DAG.getBitcast(OpVT, BC.getOperand(1)));
}		}

// TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)		// TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
if (BC.getOpcode() == X86ISD::ANDNP \|\| BC.getOpcode() == X86ISD::FANDN) {		if (BC.getOpcode() == X86ISD::ANDNP \|\| BC.getOpcode() == X86ISD::FANDN) {
CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);		CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,		return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
DAG.getBitcast(OpVT, BC.getOperand(0)),		DAG.getBitcast(OpVT, BC.getOperand(0)),
DAG.getBitcast(OpVT, BC.getOperand(1)));		DAG.getBitcast(OpVT, BC.getOperand(1)));
}		}

		// If every element is an all-sign value, see if we can use MOVMSK to
		// more efficiently extract the sign bits and compare that.
		// TODO: Handle TESTC with comparison inversion.
		// TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
		// MOVMSK combines to make sure its never worse than PTEST?
		unsigned EltBits = BCVT.getScalarSizeInBits();
		if (DAG.ComputeNumSignBits(BC) == EltBits) {
		assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
		APInt SignMask = APInt::getSignMask(EltBits);
		const TargetLowering &TLI = DAG.getTargetLoweringInfo();
		if (SDValue Res =
		TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
		// For vXi16 cases we need to use pmovmksb and extract every other
		// sign bit.
		SDLoc DL(EFLAGS);
		craig.topper [Unsubmitted, Not Done] (inline comment): Do we need getPMOVMSKB here, or can we just use plain getNode? The only thing getPMOVMSKB does is handle AVX and BWI splitting, right? But if we start from PTEST, we should never need to split?
		RKSimon [Author, Unsubmitted, Done] (inline comment): AVX1 code can technically get here for v16i16/v32i8 cases, depending on how good a job SimplifyMultipleUseDemandedBits has managed. VPTEST is one of the rare 256-bit integer instructions that is available on AVX1!
		craig.topper [Unsubmitted, Not Done] (inline comment): OK, thanks for the clarification.
		if (EltBits == 16) {
		MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
		Res = DAG.getBitcast(MovmskVT, Res);
		Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
		Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
		DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
		} else {
		Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
		}
		return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
		DAG.getConstant(0, DL, MVT::i32));
		}
		}
}		}

// TESTZ(-1,X) == TESTZ(X,X)		// TESTZ(-1,X) == TESTZ(X,X)
if (ISD::isBuildVectorAllOnes(Op0.getNode()))		if (ISD::isBuildVectorAllOnes(Op0.getNode()))
return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);		return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);

// TESTZ(X,-1) == TESTZ(X,X)		// TESTZ(X,-1) == TESTZ(X,X)
if (ISD::isBuildVectorAllOnes(Op1.getNode()))		if (ISD::isBuildVectorAllOnes(Op1.getNode()))
Show All 11 Lines	static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
const X86Subtarget &Subtarget) {		const X86Subtarget &Subtarget) {
if (CC == X86::COND_B)		if (CC == X86::COND_B)
if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))		if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
return Flags;		return Flags;

if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))		if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
return R;		return R;

if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG))		if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
return R;		return R;

return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);		return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
}		}

/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]		/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,		static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,		TargetLowering::DAGCombinerInfo &DCI,
▲ Show 20 Lines • Show All 8,920 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/combine-ptest.ll

Show First 20 Lines • Show All 293 Lines • ▼ Show 20 Lines	start:
%3 = sext <16 x i1> %2 to <16 x i8>		%3 = sext <16 x i1> %2 to <16 x i8>
%4 = bitcast <16 x i8> %3 to <2 x i64>		%4 = bitcast <16 x i8> %3 to <2 x i64>
%5 = tail call i32 @llvm.x86.sse41.ptestc(<2 x i64> %4, <2 x i64> <i64 -1, i64 -1>)		%5 = tail call i32 @llvm.x86.sse41.ptestc(<2 x i64> %4, <2 x i64> <i64 -1, i64 -1>)
%6 = icmp eq i32 %5, 1		%6 = icmp eq i32 %5, 1
ret i1 %6		ret i1 %6
}		}

;		;
; TODO: testz(ashr(X,bw-1),-1) -> movmsk(X)		; testz(ashr(X,bw-1),-1) -> movmsk(X)
;		;

define i32 @ptestz_v2i64_signbits(<2 x i64> %c, i32 %a, i32 %b) {		define i32 @ptestz_v2i64_signbits(<2 x i64> %c, i32 %a, i32 %b) {
; CHECK-LABEL: ptestz_v2i64_signbits:		; CHECK-LABEL: ptestz_v2i64_signbits:
; CHECK: # %bb.0:		; CHECK: # %bb.0:
; CHECK-NEXT: movl %edi, %eax		; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1		; CHECK-NEXT: vmovmskpd %xmm0, %ecx
; CHECK-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0		; CHECK-NEXT: testl %ecx, %ecx
; CHECK-NEXT: vptest %xmm0, %xmm0
; CHECK-NEXT: cmovnel %esi, %eax		; CHECK-NEXT: cmovnel %esi, %eax
; CHECK-NEXT: retq		; CHECK-NEXT: retq
%t1 = ashr <2 x i64> %c, <i64 63, i64 63>		%t1 = ashr <2 x i64> %c, <i64 63, i64 63>
%t2 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t1, <2 x i64> <i64 -1, i64 -1>)		%t2 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t1, <2 x i64> <i64 -1, i64 -1>)
%t3 = icmp ne i32 %t2, 0		%t3 = icmp ne i32 %t2, 0
%t4 = select i1 %t3, i32 %a, i32 %b		%t4 = select i1 %t3, i32 %a, i32 %b
ret i32 %t4		ret i32 %t4
}		}
Show All 9 Lines
; AVX1-NEXT: vptest %ymm0, %ymm0		; AVX1-NEXT: vptest %ymm0, %ymm0
; AVX1-NEXT: cmovnel %esi, %eax		; AVX1-NEXT: cmovnel %esi, %eax
; AVX1-NEXT: vzeroupper		; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq		; AVX1-NEXT: retq
;		;
; AVX2-LABEL: ptestz_v8i32_signbits:		; AVX2-LABEL: ptestz_v8i32_signbits:
; AVX2: # %bb.0:		; AVX2: # %bb.0:
; AVX2-NEXT: movl %edi, %eax		; AVX2-NEXT: movl %edi, %eax
; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0		; AVX2-NEXT: vmovmskps %ymm0, %ecx
; AVX2-NEXT: vptest %ymm0, %ymm0		; AVX2-NEXT: testl %ecx, %ecx
; AVX2-NEXT: cmovnel %esi, %eax		; AVX2-NEXT: cmovnel %esi, %eax
; AVX2-NEXT: vzeroupper		; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq		; AVX2-NEXT: retq
%t1 = ashr <8 x i32> %c, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>		%t1 = ashr <8 x i32> %c, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
%t2 = bitcast <8 x i32> %t1 to <4 x i64>		%t2 = bitcast <8 x i32> %t1 to <4 x i64>
%t3 = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %t2, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>)		%t3 = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %t2, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>)
%t4 = icmp ne i32 %t3, 0		%t4 = icmp ne i32 %t3, 0
%t5 = select i1 %t4, i32 %a, i32 %b		%t5 = select i1 %t4, i32 %a, i32 %b
ret i32 %t5		ret i32 %t5
}		}

define i32 @ptestz_v8i16_signbits(<8 x i16> %c, i32 %a, i32 %b) {		define i32 @ptestz_v8i16_signbits(<8 x i16> %c, i32 %a, i32 %b) {
; CHECK-LABEL: ptestz_v8i16_signbits:		; CHECK-LABEL: ptestz_v8i16_signbits:
; CHECK: # %bb.0:		; CHECK: # %bb.0:
; CHECK-NEXT: movl %edi, %eax		; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: vpsraw $15, %xmm0, %xmm0		; CHECK-NEXT: vpmovmskb %xmm0, %ecx
; CHECK-NEXT: vptest %xmm0, %xmm0		; CHECK-NEXT: testl $43690, %ecx # imm = 0xAAAA
; CHECK-NEXT: cmovnel %esi, %eax		; CHECK-NEXT: cmovnel %esi, %eax
; CHECK-NEXT: retq		; CHECK-NEXT: retq
%t1 = ashr <8 x i16> %c, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>		%t1 = ashr <8 x i16> %c, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
%t2 = bitcast <8 x i16> %t1 to <2 x i64>		%t2 = bitcast <8 x i16> %t1 to <2 x i64>
%t3 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t2, <2 x i64> <i64 -1, i64 -1>)		%t3 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t2, <2 x i64> <i64 -1, i64 -1>)
%t4 = icmp ne i32 %t3, 0		%t4 = icmp ne i32 %t3, 0
%t5 = select i1 %t4, i32 %a, i32 %b		%t5 = select i1 %t4, i32 %a, i32 %b
ret i32 %t5		ret i32 %t5
Show All 11 Lines
; AVX1-NEXT: vptest %ymm0, %ymm0		; AVX1-NEXT: vptest %ymm0, %ymm0
; AVX1-NEXT: cmovnel %esi, %eax		; AVX1-NEXT: cmovnel %esi, %eax
; AVX1-NEXT: vzeroupper		; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq		; AVX1-NEXT: retq
;		;
; AVX2-LABEL: ptestz_v32i8_signbits:		; AVX2-LABEL: ptestz_v32i8_signbits:
; AVX2: # %bb.0:		; AVX2: # %bb.0:
; AVX2-NEXT: movl %edi, %eax		; AVX2-NEXT: movl %edi, %eax
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1		; AVX2-NEXT: vpmovmskb %ymm0, %ecx
; AVX2-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0		; AVX2-NEXT: testl %ecx, %ecx
; AVX2-NEXT: vptest %ymm0, %ymm0
; AVX2-NEXT: cmovnel %esi, %eax		; AVX2-NEXT: cmovnel %esi, %eax
; AVX2-NEXT: vzeroupper		; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq		; AVX2-NEXT: retq
%t1 = ashr <32 x i8> %c, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>		%t1 = ashr <32 x i8> %c, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
%t2 = bitcast <32 x i8> %t1 to <4 x i64>		%t2 = bitcast <32 x i8> %t1 to <4 x i64>
%t3 = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %t2, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>)		%t3 = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %t2, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>)
%t4 = icmp ne i32 %t3, 0		%t4 = icmp ne i32 %t3, 0
%t5 = select i1 %t4, i32 %a, i32 %b		%t5 = select i1 %t4, i32 %a, i32 %b
Show All 10 Lines