This is an archive of the discontinued LLVM Phabricator instance.

[x86] eliminate more sign-bit tests with vector select
AbandonedPublic

Authored by spatel on Jun 11 2018, 11:58 AM.

Download Raw Diff

Details

Reviewers

RKSimon
craig.topper

Summary

vselect (pcmpgt 0, X), Y, Z --> shrunkblend X, Y, Z

This shortcoming was noted in D47330, and the test diffs show we already had other examples where we failed to fold to a SHRUNKBLEND:

/// Dynamic (non-constant condition) vector blend where only the sign bits
/// of the condition elements are used. This is used to enforce that the
/// condition mask is not valid for generic VSELECT optimizations.

Diff Detail

Event Timeline

spatel created this revision. Jun 11 2018, 11:58 AM

Herald added a subscriber: mcrosier. · View Herald Transcript · Jun 11 2018, 11:58 AM

RKSimon added inline comments. Jun 11 2018, 1:39 PM

test/CodeGen/X86/vsel-cmp-load.ll
251–252	This can go
258	How come this folds but the AVX1 case in slt_zero above doesn't?

Patch updated:
Remove a stale FIXME comment from a test.

spatel marked an inline comment as done. Jun 11 2018, 2:04 PM

spatel added inline comments.

test/CodeGen/X86/vsel-cmp-load.ll

258

AVX1 is more complicated due to ISA limitations, so I was planning to catch that one next. There, we've split the PCMPGT into halves, so I'll need to match a pattern with a concat:

    t41: v4i32 = X86ISD::PCMPGT t37, t32
        t31: v8i16 = vector_shuffle<4,5,6,7,u,u,u,u> t28, undef:v8i16
      t33: v4i32 = sign_extend_vector_inreg t31
    t42: v4i32 = X86ISD::PCMPGT t37, t33
  t40: v8i32 = concat_vectors t41, t42
  t4: v8i32,ch = CopyFromReg t0, Register:v8i32 %1
  t6: v8i32,ch = CopyFromReg t0, Register:v8i32 %2
t23: v8i32 = vselect t40, t4, t6

spatel added inline comments. Jun 11 2018, 2:10 PM

test/CodeGen/X86/vsel-cmp-load.ll

258

Or probably easier - we match the pattern after type legalization, but before vector op legalization:

    t21: v8i32 = BUILD_VECTOR Constant:i32<0>, Constant:i32<0>, Constant:i32<0>, Constant:i32<0>, Constant:i32<0>, Constant:i32<0>, Constant:i32<0>, Constant:i32<0>
  t22: v8i32 = setcc t24, t21, setlt:ch
  t4: v8i32,ch = CopyFromReg t0, Register:v8i32 %1
  t6: v8i32,ch = CopyFromReg t0, Register:v8i32 %2
t23: v8i32 = vselect t22, t4, t6

spatel mentioned this in rL334504: [x86] move shrunkblend transform to helper function; NFCI. Jun 12 2018, 7:26 AM

spatel mentioned this in D48078: [x86] eliminate even more sign-bit tests with vector select. Jun 12 2018, 7:50 AM

spatel added inline comments. Jun 12 2018, 7:55 AM

test/CodeGen/X86/vsel-cmp-load.ll
258	See D48078 for an implementation of that suggestion.

spatel mentioned this in rL334592: [x86] eliminate even more sign-bit tests with vector select. Jun 13 2018, 5:33 AM

Abandoning - we do better by matching setcc+vselect earlier.

Revision Contents

Path

Size

lib/

Target/

X86/

X86ISelLowering.cpp

26 lines

test/

CodeGen/

X86/

vsel-cmp-load.ll

6 lines

vselect-pcmp.ll

19 lines

vselect.ll

7 lines

Diff 150837

lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 32,364 Lines • ▼ Show 20 Lines	if (!FalseC->isNullValue())
R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));		R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));

return R;		return R;
}		}

return SDValue();		return SDValue();
}		}

		/// Try to convert a vector select into a SHRUNKBLEND and eliminate a compare
		/// that is only testing the sign-bits of its operand.
		///
		/// Matches a select node \p N whose condition is (X86ISD::PCMPGT 0, X) and
		/// whose compared elements have the same scalar bit width as the result type.
		/// \returns the replacement SHRUNKBLEND node on a match, or an empty SDValue
		/// when the pattern does not apply.
		static SDValue combineVSelectSignBitCmp(SDNode *N, SelectionDAG &DAG) {
		// Operand 0 of the select is its condition; only a PCMPGT condition is handled.
		SDValue Cond = N->getOperand(0);
		if (Cond.getOpcode() != X86ISD::PCMPGT)
		return SDValue();

		assert(N->getOpcode() == ISD::VSELECT && "PCMPGT with scalar select?");
		// The first compare operand must be an all-zeros build vector, i.e. the
		// compare has the form (pcmpgt 0, X).
		SDValue Cond0 = Cond.getOperand(0);
		if (!ISD::isBuildVectorAllZeros(Cond0.getNode()))
		return SDValue();

		// Bail out unless the compared elements are exactly as wide as the selected
		// elements.
		// NOTE(review): presumably this keeps each element's sign bit of X lined up
		// with the corresponding blend element - confirm.
		EVT CondOpVT = Cond0.getValueType();
		EVT VT = N->getValueType(0);
		if (CondOpVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
		return SDValue();

		// vselect (pcmpgt 0, X), Y, Z --> shrunkblend X, Y, Z
		// X itself becomes the blend condition, so the compare is no longer needed
		// by this select.
		SDValue Cond1 = Cond.getOperand(1);
		return DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), VT, Cond1, N->getOperand(1),
		N->getOperand(2));
		}

/// Do target-specific dag combines on SELECT and VSELECT nodes.		/// Do target-specific dag combines on SELECT and VSELECT nodes.
static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,		static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,		TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {		const X86Subtarget &Subtarget) {
SDLoc DL(N);		SDLoc DL(N);
SDValue Cond = N->getOperand(0);		SDValue Cond = N->getOperand(0);
// Get the LHS/RHS of the select.		// Get the LHS/RHS of the select.
SDValue LHS = N->getOperand(1);		SDValue LHS = N->getOperand(1);
▲ Show 20 Lines • Show All 395 Lines • ▼ Show 20 Lines	static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// Custom action for SELECT MMX		// Custom action for SELECT MMX
if (VT == MVT::x86mmx) {		if (VT == MVT::x86mmx) {
LHS = DAG.getBitcast(MVT::i64, LHS);		LHS = DAG.getBitcast(MVT::i64, LHS);
RHS = DAG.getBitcast(MVT::i64, RHS);		RHS = DAG.getBitcast(MVT::i64, RHS);
SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS);		SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS);
return DAG.getBitcast(VT, newSelect);		return DAG.getBitcast(VT, newSelect);
}		}

		if (SDValue V = combineVSelectSignBitCmp(N, DAG))
		return V;

return SDValue();		return SDValue();
}		}

/// Combine:		/// Combine:
/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)		/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
/// to:		/// to:
/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)		/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
/// i.e., reusing the EFLAGS produced by the LOCKed instruction.		/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
▲ Show 20 Lines • Show All 7,567 Lines • Show Last 20 Lines

test/CodeGen/X86/vsel-cmp-load.ll

	Show First 20 Lines • Show All 122 Lines • ▼ Show 20 Lines
	; AVX1-NEXT: vpcmpgtd %xmm3, %xmm4, %xmm3			; AVX1-NEXT: vpcmpgtd %xmm3, %xmm4, %xmm3
	; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2			; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
	; AVX1-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0			; AVX1-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: slt_zero:			; AVX2-LABEL: slt_zero:
	; AVX2: # %bb.0:			; AVX2: # %bb.0:
	; AVX2-NEXT: vpmovsxbd (%rdi), %ymm2			; AVX2-NEXT: vpmovsxbd (%rdi), %ymm2
	; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; AVX2-NEXT: vpcmpgtd %ymm2, %ymm3, %ymm2
	; AVX2-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0			; AVX2-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	;			;
	; AVX512-LABEL: slt_zero:			; AVX512-LABEL: slt_zero:
	; AVX512: # %bb.0:			; AVX512: # %bb.0:
	; AVX512-NEXT: vpmovsxbw (%rdi), %xmm2			; AVX512-NEXT: vpmovsxbw (%rdi), %xmm2
	; AVX512-NEXT: vpmovw2m %xmm2, %k1			; AVX512-NEXT: vpmovw2m %xmm2, %k1
	; AVX512-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}			; AVX512-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
	▲ Show 20 Lines • Show All 104 Lines • ▼ Show 20 Lines
	; AVX512-NEXT: vpcmpgtd %xmm3, %xmm2, %k1			; AVX512-NEXT: vpcmpgtd %xmm3, %xmm2, %k1
	; AVX512-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}			; AVX512-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
	; AVX512-NEXT: retq			; AVX512-NEXT: retq
	%load = load <4 x i8>, <4 x i8>* %p			%load = load <4 x i8>, <4 x i8>* %p
	%cmp = icmp sgt <4 x i8> %load, zeroinitializer			%cmp = icmp sgt <4 x i8> %load, zeroinitializer
	%sel = select <4 x i1> %cmp, <4 x double> %x, <4 x double> %y			%sel = select <4 x i1> %cmp, <4 x double> %x, <4 x double> %y
	ret <4 x double> %sel			ret <4 x double> %sel
	}			}

	; FIXME: The compare with 0 for AVX2 should be eliminated.

	define <8 x float> @slt_zero_fp_select(<8 x i16>* %p, <8 x float> %x, <8 x float> %y) {			define <8 x float> @slt_zero_fp_select(<8 x i16>* %p, <8 x float> %x, <8 x float> %y) {
				RKSimonUnsubmitted Done Reply Inline Actions This can go RKSimon: This can go
	; AVX1-LABEL: slt_zero_fp_select:			; AVX1-LABEL: slt_zero_fp_select:
	; AVX1: # %bb.0:			; AVX1: # %bb.0:
	; AVX1-NEXT: vpmovsxwd 8(%rdi), %xmm2			; AVX1-NEXT: vpmovsxwd 8(%rdi), %xmm2
	; AVX1-NEXT: vpmovsxwd (%rdi), %xmm3			; AVX1-NEXT: vpmovsxwd (%rdi), %xmm3
	; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2			; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
	; AVX1-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0			; AVX1-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
				RKSimonUnsubmitted Not Done Reply Inline Actions How come this folds but the AVX1 case in slt_zero above doesn't? RKSimon: How come this folds but the AVX1 case in slt_zero above doesn't?
				spatelAuthorUnsubmitted Not Done Reply Inline Actions AVX1 is more complicated due to ISA limitations, so I was planning to catch that one next. There, we've split the PCMPGT into halves, so I'll need to match a pattern with a concat: t41: v4i32 = X86ISD::PCMPGT t37, t32 t31: v8i16 = vector_shuffle<4,5,6,7,u,u,u,u> t28, undef:v8i16 t33: v4i32 = sign_extend_vector_inreg t31 t42: v4i32 = X86ISD::PCMPGT t37, t33 t40: v8i32 = concat_vectors t41, t42 t4: v8i32,ch = CopyFromReg t0, Register:v8i32 %1 t6: v8i32,ch = CopyFromReg t0, Register:v8i32 %2 t23: v8i32 = vselect t40, t4, t6 spatel: AVX1 is more complicated due to ISA limitations, so I was planning to catch that one next.
				spatelAuthorUnsubmitted Not Done Reply Inline Actions Or probably easier - we match the pattern after type legalization, but before vector op legalization: t21: v8i32 = BUILD_VECTOR Constant:i32<0>, Constant:i32<0>, Constant:i32<0>, Constant:i32<0>, Constant:i32<0>, Constant:i32<0>, Constant:i32<0>, Constant:i32<0> t22: v8i32 = setcc t24, t21, setlt:ch t4: v8i32,ch = CopyFromReg t0, Register:v8i32 %1 t6: v8i32,ch = CopyFromReg t0, Register:v8i32 %2 t23: v8i32 = vselect t22, t4, t6 spatel: Or probably easier - we match the pattern after type legalization, but before vector op…
				spatelAuthorUnsubmitted Not Done Reply Inline Actions See D48078 for an implementation of that suggestion. spatel: See D48078 for an implementation of that suggestion.
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: slt_zero_fp_select:			; AVX2-LABEL: slt_zero_fp_select:
	; AVX2: # %bb.0:			; AVX2: # %bb.0:
	; AVX2-NEXT: vpmovsxwd (%rdi), %ymm2			; AVX2-NEXT: vpmovsxwd (%rdi), %ymm2
	; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; AVX2-NEXT: vpcmpgtd %ymm2, %ymm3, %ymm2
	; AVX2-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0			; AVX2-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	;			;
	; AVX512-LABEL: slt_zero_fp_select:			; AVX512-LABEL: slt_zero_fp_select:
	; AVX512: # %bb.0:			; AVX512: # %bb.0:
	; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2			; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; AVX512-NEXT: vpcmpgtw (%rdi), %xmm2, %k1			; AVX512-NEXT: vpcmpgtw (%rdi), %xmm2, %k1
	; AVX512-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}			; AVX512-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
	; AVX512-NEXT: retq			; AVX512-NEXT: retq
	%load = load <8 x i16>, <8 x i16>* %p			%load = load <8 x i16>, <8 x i16>* %p
	%cmp = icmp slt <8 x i16> %load, zeroinitializer			%cmp = icmp slt <8 x i16> %load, zeroinitializer
	%sel = select <8 x i1> %cmp, <8 x float> %x, <8 x float> %y			%sel = select <8 x i1> %cmp, <8 x float> %x, <8 x float> %y
	ret <8 x float> %sel			ret <8 x float> %sel
	}			}

test/CodeGen/X86/vselect-pcmp.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx \| FileCheck %s --check-prefix=AVX --check-prefix=AVX12F --check-prefix=AVX12 --check-prefix=AVX1			; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx \| FileCheck %s --check-prefix=AVX --check-prefix=AVX12F --check-prefix=AVX12 --check-prefix=AVX1
	; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 \| FileCheck %s --check-prefix=AVX --check-prefix=AVX12F --check-prefix=AVX12 --check-prefix=AVX2			; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 \| FileCheck %s --check-prefix=AVX --check-prefix=AVX12F --check-prefix=AVX12 --check-prefix=AVX2
	; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f \| FileCheck %s --check-prefix=AVX --check-prefix=AVX12F --check-prefix=AVX512 --check-prefix=AVX512F			; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f \| FileCheck %s --check-prefix=AVX --check-prefix=AVX12F --check-prefix=AVX512 --check-prefix=AVX512F
	; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl \| FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512VL			; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl \| FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512VL

	; The condition vector for BLENDV* only cares about the sign bit of each element.			; The condition vector for BLENDV* only cares about the sign bit of each element.
	; So in these tests, if we generate BLENDV*, we should be able to remove the redundant cmp op.			; So in these tests, if we generate BLENDV*, we should be able to remove the redundant cmp op.

	; Test 128-bit vectors for all legal element types.			; Test 128-bit vectors for all legal element types.

	; FIXME: Why didn't AVX-512 optimize too?

	define <16 x i8> @signbit_sel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) {			define <16 x i8> @signbit_sel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) {
	; AVX12-LABEL: signbit_sel_v16i8:			; AVX-LABEL: signbit_sel_v16i8:
	; AVX12: # %bb.0:			; AVX: # %bb.0:
	; AVX12-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0			; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
	; AVX12-NEXT: retq			; AVX-NEXT: retq
	;
	; AVX512-LABEL: signbit_sel_v16i8:
	; AVX512: # %bb.0:
	; AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; AVX512-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2
	; AVX512-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
	; AVX512-NEXT: retq
	%tr = icmp slt <16 x i8> %mask, zeroinitializer			%tr = icmp slt <16 x i8> %mask, zeroinitializer
	%z = select <16 x i1> %tr, <16 x i8> %x, <16 x i8> %y			%z = select <16 x i1> %tr, <16 x i8> %x, <16 x i8> %y
	ret <16 x i8> %z			ret <16 x i8> %z
	}			}

	; Sorry 16-bit, you're not important enough to support?			; Sorry 16-bit, you're not important enough to support?

	define <8 x i16> @signbit_sel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) {			define <8 x i16> @signbit_sel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) {
	▲ Show 20 Lines • Show All 141 Lines • ▼ Show 20 Lines
	;			;
	; AVX2-LABEL: signbit_sel_v32i8:			; AVX2-LABEL: signbit_sel_v32i8:
	; AVX2: # %bb.0:			; AVX2: # %bb.0:
	; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0			; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	;			;
	; AVX512-LABEL: signbit_sel_v32i8:			; AVX512-LABEL: signbit_sel_v32i8:
	; AVX512: # %bb.0:			; AVX512: # %bb.0:
	; AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; AVX512-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm2
	; AVX512-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0			; AVX512-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
	; AVX512-NEXT: retq			; AVX512-NEXT: retq
	%tr = icmp slt <32 x i8> %mask, zeroinitializer			%tr = icmp slt <32 x i8> %mask, zeroinitializer
	%z = select <32 x i1> %tr, <32 x i8> %x, <32 x i8> %y			%z = select <32 x i1> %tr, <32 x i8> %x, <32 x i8> %y
	ret <32 x i8> %z			ret <32 x i8> %z
	}			}

	; Sorry 16-bit, you'll never be important enough to support?			; Sorry 16-bit, you'll never be important enough to support?
	▲ Show 20 Lines • Show All 211 Lines • Show Last 20 Lines

test/CodeGen/X86/vselect.ll

	Show First 20 Lines • Show All 516 Lines • ▼ Show 20 Lines
	; AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0			; AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
	; AVX-NEXT: retq			; AVX-NEXT: retq
	%x = select <2 x i1> %cond, <2 x i64> %a, <2 x i64> %b			%x = select <2 x i1> %cond, <2 x i64> %a, <2 x i64> %b
	%y = select <2 x i1> %cond, <2 x i64> %c, <2 x i64> %d			%y = select <2 x i1> %cond, <2 x i64> %c, <2 x i64> %d
	%z = add <2 x i64> %x, %y			%z = add <2 x i64> %x, %y
	ret <2 x i64> %z			ret <2 x i64> %z
	}			}

	; Similar to above, but condition has a use that isn't a condition of a vselect so we can't optimize.			; Similar to above, but condition has a use that isn't a condition of a vselect.
				; The blend does not require a cmp, but we may produce one anyway for the math op.
	define <2 x i64> @shrunkblend_nonvselectuse(<2 x i1> %cond, <2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) {			define <2 x i64> @shrunkblend_nonvselectuse(<2 x i1> %cond, <2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) {
	; SSE2-LABEL: shrunkblend_nonvselectuse:			; SSE2-LABEL: shrunkblend_nonvselectuse:
	; SSE2: # %bb.0:			; SSE2: # %bb.0:
	; SSE2-NEXT: psllq $63, %xmm0			; SSE2-NEXT: psllq $63, %xmm0
	; SSE2-NEXT: psrad $31, %xmm0			; SSE2-NEXT: psrad $31, %xmm0
	; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]			; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
	; SSE2-NEXT: movdqa %xmm3, %xmm0			; SSE2-NEXT: movdqa %xmm3, %xmm0
	; SSE2-NEXT: pandn %xmm2, %xmm0			; SSE2-NEXT: pandn %xmm2, %xmm0
	Show All 9 Lines
	; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]			; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
	; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2			; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
	; SSE41-NEXT: paddq %xmm2, %xmm0			; SSE41-NEXT: paddq %xmm2, %xmm0
	; SSE41-NEXT: retq			; SSE41-NEXT: retq
	;			;
	; AVX-LABEL: shrunkblend_nonvselectuse:			; AVX-LABEL: shrunkblend_nonvselectuse:
	; AVX: # %bb.0:			; AVX: # %bb.0:
	; AVX-NEXT: vpsllq $63, %xmm0, %xmm0			; AVX-NEXT: vpsllq $63, %xmm0, %xmm0
	; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
	; AVX-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
	; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm1			; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm1
				; AVX-NEXT: vxorpd %xmm2, %xmm2, %xmm2
				; AVX-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0
	; AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0			; AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
	; AVX-NEXT: retq			; AVX-NEXT: retq
	%x = select <2 x i1> %cond, <2 x i64> %a, <2 x i64> %b			%x = select <2 x i1> %cond, <2 x i64> %a, <2 x i64> %b
	%y = sext <2 x i1> %cond to <2 x i64>			%y = sext <2 x i1> %cond to <2 x i64>
	%z = add <2 x i64> %x, %y			%z = add <2 x i64> %x, %y
	ret <2 x i64> %z			ret <2 x i64> %z
	}			}

This is an archive of the discontinued LLVM Phabricator instance.

[x86] eliminate more sign-bit tests with vector select
AbandonedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 150837

lib/Target/X86/X86ISelLowering.cpp

test/CodeGen/X86/vsel-cmp-load.ll

test/CodeGen/X86/vselect-pcmp.ll

test/CodeGen/X86/vselect.ll

[x86] eliminate more sign-bit tests with vector select
AbandonedPublic