This is an archive of the discontinued LLVM Phabricator instance.

[X86] Change signatures of avx512 packed fp compare intrinsics to return a vXi1 mask type to be closer to an fcmp.
ClosedPublic

Authored by craig.topper on Feb 9 2018, 11:34 AM.

Download Raw Diff

Details

Reviewers

spatel
delena
RKSimon
zvi

Commits

rG4dccffc84a71: [X86] Change signatures of avx512 packed fp compare intrinsics to return a vXi1…
rL324827: [X86] Change signatures of avx512 packed fp compare intrinsics to return a vXi1…

Summary

This patch changes the signature of the avx512 packed fp compare intrinsics to return a vXi1 vector and no longer take a mask as input. The casts to scalar type will now need to be explicit in the IR. The masking will now be an explicit `and` instruction in the IR.

This makes the intrinsic look much more similar to an fcmp instruction that we wish we could use for these but can't. We already use icmp instructions for integer compares.

Previously the lowering step of isel would turn the intrinsic into an X86-specific ISD node and emit the masking nodes as well as some bitcasts. This means DAG combines can't see the vXi1 type until somewhat late, making it more difficult to combine out gpr<->mask transition sequences. By exposing the vXi1 type explicitly in the IR and initial SelectionDAG we give earlier DAG combines and even InstCombine the chance to see it and optimize it.

This should make any issues with gpr<->mask sequences the same between integer and fp, meaning we only have to fix them once.

I'll post a clang patch for CGBuiltin.cpp soon.

Diff Detail

Repository: rL LLVM

Event Timeline

craig.topper created this revision. · Feb 9 2018, 11:34 AM

craig.topper mentioned this in D43143: [X86] Change the signature of the AVX512 packed fp compare intrinsics to return vXi1 mask. Make bitcasts to scalar explicit in IR. · Feb 9 2018, 2:06 PM

Can we remove CMP_MASK_CC completely with this?

I think we still need CMP_MASK_CC. I don't see another type that supports optional rounding mode and truncating the immediate from i32 to i8.

OK, LGTM

This revision is now accepted and ready to land. · Feb 10 2018, 9:59 AM

Closed by commit rL324827: [X86] Change signatures of avx512 packed fp compare intrinsics to return a vXi1… (authored by ctopper). · Explain Why · Feb 10 2018, 3:36 PM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

trunk/

include/

llvm/

IR/

IntrinsicsX86.td

31 lines

lib/

IR/

AutoUpgrade.cpp

59 lines

Target/

X86/

X86ISelLowering.cpp

51 lines

test/

CodeGen/

X86/

avx512-cmp-kor-sequence.ll

50 lines

avx512-intrinsics-upgrade.ll

24 lines

avx512-intrinsics.ll

59 lines

avx512vl-intrinsics-upgrade.ll

47 lines

avx512vl-intrinsics.ll

39 lines

avx512vl-vec-masked-cmp.ll

136 lines

stack-folding-fp-avx512.ll

14 lines

stack-folding-fp-avx512vl.ll

31 lines

Transforms/

InstCombine/

X86/

X86FsubCmpCombine.ll

168 lines

Diff 133773

llvm/trunk/include/llvm/IR/IntrinsicsX86.td

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 6,380 Lines • ▼ Show 20 Lines	def int_x86_avx512_maskz_pternlog_q_512 :
Intrinsic<[llvm_v8i64_ty],		Intrinsic<[llvm_v8i64_ty],
[llvm_v8i64_ty, llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty,		[llvm_v8i64_ty, llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty,
llvm_i8_ty], [IntrNoMem]>;		llvm_i8_ty], [IntrNoMem]>;
}		}

// Misc.		// Misc.
let TargetPrefix = "x86" in {		let TargetPrefix = "x86" in {
def int_x86_avx512_mask_cmp_ps_512 :		def int_x86_avx512_mask_cmp_ps_512 :
GCCBuiltin<"__builtin_ia32_cmpps512_mask">,		Intrinsic<[llvm_v16i1_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
Intrinsic<[llvm_i16_ty], [llvm_v16f32_ty, llvm_v16f32_ty,		llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
llvm_i32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_cmp_pd_512 :		def int_x86_avx512_mask_cmp_pd_512 :
GCCBuiltin<"__builtin_ia32_cmppd512_mask">,		Intrinsic<[llvm_v8i1_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
Intrinsic<[llvm_i8_ty], [llvm_v8f64_ty, llvm_v8f64_ty,		llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
llvm_i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_cmp_ps_256 :		def int_x86_avx512_mask_cmp_ps_256 :
GCCBuiltin<"__builtin_ia32_cmpps256_mask">,		Intrinsic<[llvm_v8i1_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
Intrinsic<[llvm_i8_ty], [llvm_v8f32_ty, llvm_v8f32_ty,		llvm_i32_ty], [IntrNoMem]>;
llvm_i32_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_cmp_pd_256 :		def int_x86_avx512_mask_cmp_pd_256 :
GCCBuiltin<"__builtin_ia32_cmppd256_mask">,		Intrinsic<[llvm_v4i1_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
Intrinsic<[llvm_i8_ty], [llvm_v4f64_ty, llvm_v4f64_ty,		llvm_i32_ty], [IntrNoMem]>;
llvm_i32_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_cmp_ps_128 :		def int_x86_avx512_mask_cmp_ps_128 :
GCCBuiltin<"__builtin_ia32_cmpps128_mask">,		Intrinsic<[llvm_v4i1_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
Intrinsic<[llvm_i8_ty], [llvm_v4f32_ty, llvm_v4f32_ty,		llvm_i32_ty], [IntrNoMem]>;
llvm_i32_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_cmp_pd_128 :		def int_x86_avx512_mask_cmp_pd_128 :
GCCBuiltin<"__builtin_ia32_cmppd128_mask">,		Intrinsic<[llvm_v2i1_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
Intrinsic<[llvm_i8_ty], [llvm_v2f64_ty, llvm_v2f64_ty,		llvm_i32_ty], [IntrNoMem]>;
llvm_i32_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_cmp_ss :		def int_x86_avx512_mask_cmp_ss :
GCCBuiltin<"__builtin_ia32_cmpss_mask">,		GCCBuiltin<"__builtin_ia32_cmpss_mask">,
Intrinsic<[llvm_i8_ty], [llvm_v4f32_ty, llvm_v4f32_ty,		Intrinsic<[llvm_i8_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
llvm_i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;		llvm_i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_cmp_sd :		def int_x86_avx512_mask_cmp_sd :
GCCBuiltin<"__builtin_ia32_cmpsd_mask">,		GCCBuiltin<"__builtin_ia32_cmpsd_mask">,
Intrinsic<[llvm_i8_ty], [llvm_v2f64_ty, llvm_v2f64_ty,		Intrinsic<[llvm_i8_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
llvm_i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;		llvm_i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
Show All 40 Lines

llvm/trunk/lib/IR/AutoUpgrade.cpp

Show First 20 Lines • Show All 59 Lines • ▼ Show 20 Lines	if (!LastArgType->isIntegerTy(32))
return false;		return false;

// Move this function aside and map down.		// Move this function aside and map down.
rename(F);		rename(F);
NewFn = Intrinsic::getDeclaration(F->getParent(), IID);		NewFn = Intrinsic::getDeclaration(F->getParent(), IID);
return true;		return true;
}		}

		// Upgrade the declaration of fp compare intrinsics that change return type
		// from scalar to vXi1 mask.
		static bool UpgradeX86MaskedFPCompare(Function *F, Intrinsic::ID IID,
		Function *&NewFn) {
		// Check if the return type is a vector.
		if (F->getReturnType()->isVectorTy())
		return false;

		rename(F);
		NewFn = Intrinsic::getDeclaration(F->getParent(), IID);
		return true;
		}

static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {		static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
// All of the intrinsics matches below should be marked with which llvm		// All of the intrinsics matches below should be marked with which llvm
// version started autoupgrading them. At some point in the future we would		// version started autoupgrading them. At some point in the future we would
// like to use this information to remove upgrade code for some older		// like to use this information to remove upgrade code for some older
// intrinsics. It is currently undecided how we will determine that future		// intrinsics. It is currently undecided how we will determine that future
// point.		// point.
if (Name=="ssse3.pabs.b.128" \|\| // Added in 6.0		if (Name=="ssse3.pabs.b.128" \|\| // Added in 6.0
Name=="ssse3.pabs.w.128" \|\| // Added in 6.0		Name=="ssse3.pabs.w.128" \|\| // Added in 6.0
▲ Show 20 Lines • Show All 241 Lines • ▼ Show 20 Lines	if (Name == "sse41.mpsadbw") // Added in 3.6
return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_sse41_mpsadbw,		return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_sse41_mpsadbw,
NewFn);		NewFn);
if (Name == "avx.dp.ps.256") // Added in 3.6		if (Name == "avx.dp.ps.256") // Added in 3.6
return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_avx_dp_ps_256,		return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_avx_dp_ps_256,
NewFn);		NewFn);
if (Name == "avx2.mpsadbw") // Added in 3.6		if (Name == "avx2.mpsadbw") // Added in 3.6
return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_avx2_mpsadbw,		return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_avx2_mpsadbw,
NewFn);		NewFn);
		if (Name == "avx512.mask.cmp.pd.128") // Added in 7.0
		return UpgradeX86MaskedFPCompare(F, Intrinsic::x86_avx512_mask_cmp_pd_128,
		NewFn);
		if (Name == "avx512.mask.cmp.pd.256") // Added in 7.0
		return UpgradeX86MaskedFPCompare(F, Intrinsic::x86_avx512_mask_cmp_pd_256,
		NewFn);
		if (Name == "avx512.mask.cmp.pd.512") // Added in 7.0
		return UpgradeX86MaskedFPCompare(F, Intrinsic::x86_avx512_mask_cmp_pd_512,
		NewFn);
		if (Name == "avx512.mask.cmp.ps.128") // Added in 7.0
		return UpgradeX86MaskedFPCompare(F, Intrinsic::x86_avx512_mask_cmp_ps_128,
		NewFn);
		if (Name == "avx512.mask.cmp.ps.256") // Added in 7.0
		return UpgradeX86MaskedFPCompare(F, Intrinsic::x86_avx512_mask_cmp_ps_256,
		NewFn);
		if (Name == "avx512.mask.cmp.ps.512") // Added in 7.0
		return UpgradeX86MaskedFPCompare(F, Intrinsic::x86_avx512_mask_cmp_ps_512,
		NewFn);

// frcz.ss/sd may need to have an argument dropped. Added in 3.2		// frcz.ss/sd may need to have an argument dropped. Added in 3.2
if (Name.startswith("xop.vfrcz.ss") && F->arg_size() == 2) {		if (Name.startswith("xop.vfrcz.ss") && F->arg_size() == 2) {
rename(F);		rename(F);
NewFn = Intrinsic::getDeclaration(F->getParent(),		NewFn = Intrinsic::getDeclaration(F->getParent(),
Intrinsic::x86_xop_vfrcz_ss);		Intrinsic::x86_xop_vfrcz_ss);
return true;		return true;
}		}
▲ Show 20 Lines • Show All 2,079 Lines • ▼ Show 20 Lines	SmallVector<Value *, 4> Args(CI->arg_operands().begin(),
CI->arg_operands().end());		CI->arg_operands().end());

// Replace the last argument with a trunc.		// Replace the last argument with a trunc.
Args.back() = Builder.CreateTrunc(Args.back(), Type::getInt8Ty(C), "trunc");		Args.back() = Builder.CreateTrunc(Args.back(), Type::getInt8Ty(C), "trunc");
NewCall = Builder.CreateCall(NewFn, Args);		NewCall = Builder.CreateCall(NewFn, Args);
break;		break;
}		}

		case Intrinsic::x86_avx512_mask_cmp_pd_128:
		case Intrinsic::x86_avx512_mask_cmp_pd_256:
		case Intrinsic::x86_avx512_mask_cmp_pd_512:
		case Intrinsic::x86_avx512_mask_cmp_ps_128:
		case Intrinsic::x86_avx512_mask_cmp_ps_256:
		case Intrinsic::x86_avx512_mask_cmp_ps_512: {
		SmallVector<Value *, 4> Args;
		Args.push_back(CI->getArgOperand(0));
		Args.push_back(CI->getArgOperand(1));
		Args.push_back(CI->getArgOperand(2));
		if (CI->getNumArgOperands() == 5)
		Args.push_back(CI->getArgOperand(4));

		NewCall = Builder.CreateCall(NewFn, Args);
		unsigned NumElts = Args[0]->getType()->getVectorNumElements();
		Value *Res = ApplyX86MaskOn1BitsVec(Builder, NewCall, CI->getArgOperand(3),
		NumElts);

		std::string Name = CI->getName();
		if (!Name.empty()) {
		CI->setName(Name + ".old");
		NewCall->setName(Name);
		}
		CI->replaceAllUsesWith(Res);
		CI->eraseFromParent();
		return;
		}

case Intrinsic::thread_pointer: {		case Intrinsic::thread_pointer: {
NewCall = Builder.CreateCall(NewFn, {});		NewCall = Builder.CreateCall(NewFn, {});
break;		break;
}		}

case Intrinsic::invariant_start:		case Intrinsic::invariant_start:
case Intrinsic::invariant_end:		case Intrinsic::invariant_end:
case Intrinsic::masked_load:		case Intrinsic::masked_load:
▲ Show 20 Lines • Show All 314 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 20,349 Lines • ▼ Show 20 Lines	case FPCLASSS: {
SDValue Imm = Op.getOperand(2);		SDValue Imm = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);		SDValue Mask = Op.getOperand(3);
SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);		SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),		SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
Subtarget, DAG);		Subtarget, DAG);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, FPclassMask,		return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, FPclassMask,
DAG.getIntPtrConstant(0, dl));		DAG.getIntPtrConstant(0, dl));
}		}
case CMP_MASK:		case CMP_MASK: {
case CMP_MASK_CC: {
// Comparison intrinsics with masks.		// Comparison intrinsics with masks.
// Example of transformation:		// Example of transformation:
// (i8 (int_x86_avx512_mask_pcmpeq_q_128		// (i8 (int_x86_avx512_mask_pcmpeq_q_128
// (v2i64 %a), (v2i64 %b), (i8 %mask))) ->		// (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
// (i8 (bitcast		// (i8 (bitcast
// (v8i1 (insert_subvector undef,		// (v8i1 (insert_subvector undef,
// (v2i1 (and (PCMPEQM %a, %b),		// (v2i1 (and (PCMPEQM %a, %b),
// (extract_subvector		// (extract_subvector
// (v8i1 (bitcast %mask)), 0))), 0))))		// (v8i1 (bitcast %mask)), 0))), 0))))
MVT VT = Op.getOperand(1).getSimpleValueType();		MVT VT = Op.getOperand(1).getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());		MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);		SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
MVT BitcastVT = MVT::getVectorVT(MVT::i1,		MVT BitcastVT = MVT::getVectorVT(MVT::i1,
Mask.getSimpleValueType().getSizeInBits());		Mask.getSimpleValueType().getSizeInBits());
		SDValue Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
		Op.getOperand(2));
		SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, SDValue(),
		Subtarget, DAG);
		SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
		DAG.getUNDEF(BitcastVT), CmpMask,
		DAG.getIntPtrConstant(0, dl));
		return DAG.getBitcast(Op.getValueType(), Res);
		}

		case CMP_MASK_CC: {
		MVT VT = Op.getOperand(1).getSimpleValueType();
		MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
SDValue Cmp;		SDValue Cmp;
if (IntrData->Type == CMP_MASK_CC) {
SDValue CC = Op.getOperand(3);		SDValue CC = Op.getOperand(3);
CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);		CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
// We specify 2 possible opcodes for intrinsics with rounding modes.		// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check if the intrinsic may have non-default rounding mode,		// First, we check if the intrinsic may have non-default rounding mode,
// (IntrData->Opc1 != 0), then we check the rounding mode operand.		// (IntrData->Opc1 != 0), then we check the rounding mode operand.
if (IntrData->Opc1 != 0) {		if (IntrData->Opc1 != 0) {
SDValue Rnd = Op.getOperand(5);		SDValue Rnd = Op.getOperand(4);
if (!isRoundModeCurDirection(Rnd))		if (!isRoundModeCurDirection(Rnd))
Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),		Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
Op.getOperand(2), CC, Rnd);		Op.getOperand(2), CC, Rnd);
}		}
//default rounding mode		//default rounding mode
if(!Cmp.getNode())		if (!Cmp.getNode())
Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),		Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
Op.getOperand(2), CC);		Op.getOperand(2), CC);

} else {		return Cmp;
assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
Op.getOperand(2));
}
SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, SDValue(),
Subtarget, DAG);
SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
DAG.getUNDEF(BitcastVT), CmpMask,
DAG.getIntPtrConstant(0, dl));
return DAG.getBitcast(Op.getValueType(), Res);
}		}
case CMP_MASK_SCALAR_CC: {		case CMP_MASK_SCALAR_CC: {
SDValue Src1 = Op.getOperand(1);		SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);		SDValue Src2 = Op.getOperand(2);
SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));		SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
SDValue Mask = Op.getOperand(4);		SDValue Mask = Op.getOperand(4);

SDValue Cmp;		SDValue Cmp;
▲ Show 20 Lines • Show All 18,645 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/avx512-cmp-kor-sequence.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc < %s \| FileCheck %s			; RUN: llc < %s \| FileCheck %s

	; This test checks optimal passing values between "cmp" and "kor" intrinsics			; This test checks optimal passing values between "cmp" and "kor" intrinsics
	; PR28839			; PR28839

	target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"			target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
	target triple = "x86_64-unknown-linux-gnu"			target triple = "x86_64-unknown-linux-gnu"

	; Function Attrs: nounwind readnone uwtable			; Function Attrs: nounwind readnone uwtable
	define zeroext i16 @cmp_kor_seq_16(<16 x float> %a, <16 x float> %b, <16 x float> %c, <16 x float> %d, <16 x float> %x) local_unnamed_addr #0 {			define zeroext i16 @cmp_kor_seq_16(<16 x float> %a, <16 x float> %b, <16 x float> %c, <16 x float> %d, <16 x float> %x) local_unnamed_addr #0 {
	; CHECK-LABEL: cmp_kor_seq_16:			; CHECK-LABEL: cmp_kor_seq_16:
	; CHECK: # %bb.0: # %entry			; CHECK: # %bb.0: # %entry
	; CHECK-NEXT: vcmpgeps %zmm4, %zmm0, %k0			; CHECK-NEXT: vcmpgeps %zmm4, %zmm0, %k0
	; CHECK-NEXT: kmovw %k0, %ecx			; CHECK-NEXT: vcmpgeps %zmm4, %zmm1, %k1
	; CHECK-NEXT: vcmpgeps %zmm4, %zmm1, %k0			; CHECK-NEXT: korw %k1, %k0, %k0
	; CHECK-NEXT: kmovw %k0, %edx			; CHECK-NEXT: vcmpgeps %zmm4, %zmm2, %k1
	; CHECK-NEXT: vcmpgeps %zmm4, %zmm2, %k0			; CHECK-NEXT: vcmpgeps %zmm4, %zmm3, %k2
	; CHECK-NEXT: kmovw %k0, %esi			; CHECK-NEXT: korw %k2, %k1, %k1
	; CHECK-NEXT: vcmpgeps %zmm4, %zmm3, %k0			; CHECK-NEXT: korw %k1, %k0, %k0
	; CHECK-NEXT: kmovw %k0, %eax			; CHECK-NEXT: kmovw %k0, %eax
	; CHECK-NEXT: orl %ecx, %edx
	; CHECK-NEXT: orl %esi, %eax
	; CHECK-NEXT: orl %edx, %eax
	; CHECK-NEXT: # kill: def $ax killed $ax killed $eax			; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	entry:			entry:
	%0 = tail call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %x, i32 13, i16 -1, i32 4)			%0 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %x, i32 13, i32 4)
	%1 = tail call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %b, <16 x float> %x, i32 13, i16 -1, i32 4)			%1 = bitcast <16 x i1> %0 to i16
	%2 = tail call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %c, <16 x float> %x, i32 13, i16 -1, i32 4)			%2 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %b, <16 x float> %x, i32 13, i32 4)
	%3 = tail call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %d, <16 x float> %x, i32 13, i16 -1, i32 4)			%3 = bitcast <16 x i1> %2 to i16
	%4 = tail call i16 @llvm.x86.avx512.kor.w(i16 %0, i16 %1) #2			%4 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %c, <16 x float> %x, i32 13, i32 4)
	%5 = tail call i16 @llvm.x86.avx512.kor.w(i16 %2, i16 %3) #2			%5 = bitcast <16 x i1> %4 to i16
	%6 = tail call i16 @llvm.x86.avx512.kor.w(i16 %4, i16 %5) #2			%6 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %d, <16 x float> %x, i32 13, i32 4)
	ret i16 %6			%7 = bitcast <16 x i1> %6 to i16
				%8 = bitcast i16 %1 to <16 x i1>
				%9 = bitcast i16 %3 to <16 x i1>
				%10 = or <16 x i1> %8, %9
				%11 = bitcast <16 x i1> %10 to i16
				%12 = bitcast i16 %5 to <16 x i1>
				%13 = bitcast i16 %7 to <16 x i1>
				%14 = or <16 x i1> %12, %13
				%15 = bitcast <16 x i1> %14 to i16
				%16 = bitcast i16 %11 to <16 x i1>
				%17 = bitcast i16 %15 to <16 x i1>
				%18 = or <16 x i1> %16, %17
				%19 = bitcast <16 x i1> %18 to i16
				ret i16 %19
	}			}

	; Function Attrs: nounwind readnone			; Function Attrs: nounwind readnone
	declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, i16, i32) #1			declare <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, i32) #1

	; Function Attrs: nounwind readnone
	declare i16 @llvm.x86.avx512.kor.w(i16, i16) #1

	attributes #0 = { nounwind readnone uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="knl" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512er,+avx512f,+avx512pf,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prefetchwt1,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" }			attributes #0 = { nounwind readnone uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="knl" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512er,+avx512f,+avx512pf,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prefetchwt1,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" }
	attributes #1 = { nounwind readnone }			attributes #1 = { nounwind readnone }
	attributes #2 = { nounwind }

llvm/trunk/test/CodeGen/X86/avx512-intrinsics-upgrade.ll

Show First 20 Lines • Show All 3,854 Lines • ▼ Show 20 Lines	entry:
%3 = bitcast <8 x i64> %C to <16 x i32>		%3 = bitcast <8 x i64> %C to <16 x i32>
%4 = bitcast <8 x i64> %D to <16 x i32>		%4 = bitcast <8 x i64> %D to <16 x i32>
%5 = icmp ne <16 x i32> %3, %4		%5 = icmp ne <16 x i32> %3, %4
%6 = bitcast <16 x i1> %2 to i16		%6 = bitcast <16 x i1> %2 to i16
%7 = bitcast <16 x i1> %5 to i16		%7 = bitcast <16 x i1> %5 to i16
%res = call i32 @llvm.x86.avx512.kortestz.w(i16 %6, i16 %7)		%res = call i32 @llvm.x86.avx512.kortestz.w(i16 %6, i16 %7)
ret i32 %res		ret i32 %res
}		}

		define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) {
		; CHECK-LABEL: test_cmpps:
		; CHECK: ## %bb.0:
		; CHECK-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
		; CHECK-NEXT: kmovw %k0, %eax
		; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax
		; CHECK-NEXT: retq
		%res = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i16 -1, i32 8)
		ret i16 %res
		}
		declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> , <16 x float> , i32, i16, i32)

		define i8 @test_cmppd(<8 x double> %a, <8 x double> %b) {
		; CHECK-LABEL: test_cmppd:
		; CHECK: ## %bb.0:
		; CHECK-NEXT: vcmpneqpd %zmm1, %zmm0, %k0
		; CHECK-NEXT: kmovw %k0, %eax
		; CHECK-NEXT: ## kill: def $al killed $al killed $eax
		; CHECK-NEXT: retq
		%res = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, i8 -1, i32 4)
		ret i8 %res
		}
		declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> , <8 x double> , i32, i8, i32)

llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll

	Show First 20 Lines • Show All 659 Lines • ▼ Show 20 Lines
	; CHECK: ## %bb.0:			; CHECK: ## %bb.0:
	; CHECK-NEXT: vbroadcastsd (%rdi), %zmm0			; CHECK-NEXT: vbroadcastsd (%rdi), %zmm0
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	%res = call <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(i8* %a0) ; <<8 x double>> [#uses=1]			%res = call <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(i8* %a0) ; <<8 x double>> [#uses=1]
	ret <8 x double> %res			ret <8 x double> %res
	}			}
	declare <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(i8*) nounwind readonly			declare <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(i8*) nounwind readonly

	define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) {			define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) {
	; CHECK-LABEL: test_cmpps:			; CHECK-LABEL: test_cmpps:
	; CHECK: ## %bb.0:			; CHECK: ## %bb.0:
	; CHECK-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0			; CHECK-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
	; CHECK-NEXT: kmovw %k0, %eax			; CHECK-NEXT: kmovw %k0, %eax
	; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax			; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	%res = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i16 -1, i32 8)			%res = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i32 8)
	ret i16 %res			%1 = bitcast <16 x i1> %res to i16
				ret i16 %1
	}			}
	declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> , <16 x float> , i32, i16, i32)			declare <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, i32)

	define i8 @test_cmppd(<8 x double> %a, <8 x double> %b) {			define i8 @test_cmppd(<8 x double> %a, <8 x double> %b) {
	; CHECK-LABEL: test_cmppd:			; CHECK-LABEL: test_cmppd:
	; CHECK: ## %bb.0:			; CHECK: ## %bb.0:
	; CHECK-NEXT: vcmpneqpd %zmm1, %zmm0, %k0			; CHECK-NEXT: vcmpneqpd %zmm1, %zmm0, %k0
	; CHECK-NEXT: kmovw %k0, %eax			; CHECK-NEXT: kmovw %k0, %eax
	; CHECK-NEXT: ## kill: def $al killed $al killed $eax			; CHECK-NEXT: ## kill: def $al killed $al killed $eax
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	%res = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, i8 -1, i32 4)			%res = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, i32 4)
	ret i8 %res			%1 = bitcast <8 x i1> %res to i8
				ret i8 %1
	}			}
	declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> , <8 x double> , i32, i8, i32)			declare <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double>, <8 x double>, i32, i32)

				; Function Attrs: nounwind readnone

	; fp min - max			; fp min - max
	define <8 x double> @test_vmaxpd(<8 x double> %a0, <8 x double> %a1) {			define <8 x double> @test_vmaxpd(<8 x double> %a0, <8 x double> %a1) {
	; CHECK-LABEL: test_vmaxpd:			; CHECK-LABEL: test_vmaxpd:
	; CHECK: ## %bb.0:			; CHECK: ## %bb.0:
	; CHECK-NEXT: vmaxpd %zmm1, %zmm0, %zmm0			; CHECK-NEXT: vmaxpd %zmm1, %zmm0, %zmm0
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	%res = call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %a0, <8 x double> %a1,			%res = call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %a0, <8 x double> %a1,
	▲ Show 20 Lines • Show All 4,297 Lines • ▼ Show 20 Lines
	; CHECK-NEXT: movzbl %al, %eax			; CHECK-NEXT: movzbl %al, %eax
	; CHECK-NEXT: movzbl %cl, %ecx			; CHECK-NEXT: movzbl %cl, %ecx
	; CHECK-NEXT: kmovw %eax, %k0			; CHECK-NEXT: kmovw %eax, %k0
	; CHECK-NEXT: kmovw %ecx, %k1			; CHECK-NEXT: kmovw %ecx, %k1
	; CHECK-NEXT: kunpckbw %k0, %k1, %k1			; CHECK-NEXT: kunpckbw %k0, %k1, %k1
	; CHECK-NEXT: vblendmps %zmm5, %zmm4, %zmm0 {%k1}			; CHECK-NEXT: vblendmps %zmm5, %zmm4, %zmm0 {%k1}
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	entry:			entry:
	%0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 17, i8 -1, i32 4)			%0 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 17, i32 4)
	%1 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %c, <8 x double> %d, i32 17, i8 -1, i32 4)			%1 = bitcast <8 x i1> %0 to i8
	%conv = zext i8 %0 to i16			%2 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %c, <8 x double> %d, i32 17, i32 4)
	%conv2 = zext i8 %1 to i16			%3 = bitcast <8 x i1> %2 to i8
	%2 = bitcast i16 %conv to <16 x i1>			%conv = zext i8 %1 to i16
	%3 = bitcast i16 %conv2 to <16 x i1>			%conv2 = zext i8 %3 to i16
	%4 = shufflevector <16 x i1> %2, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>			%4 = bitcast i16 %conv to <16 x i1>
	%5 = shufflevector <16 x i1> %3, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>			%5 = bitcast i16 %conv2 to <16 x i1>
	%6 = shufflevector <8 x i1> %4, <8 x i1> %5, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>			%6 = shufflevector <16 x i1> %4, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
	%7 = select <16 x i1> %6, <16 x float> %f, <16 x float> %e			%7 = shufflevector <16 x i1> %5, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
	ret <16 x float> %7			%8 = shufflevector <8 x i1> %6, <8 x i1> %7, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
				%9 = select <16 x i1> %8, <16 x float> %f, <16 x float> %e
				ret <16 x float> %9
	}			}

	define <16 x float> @bad_mask_transition_2(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x double> %d, <16 x float> %e, <16 x float> %f) {			define <16 x float> @bad_mask_transition_2(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x double> %d, <16 x float> %e, <16 x float> %f) {
	; CHECK-LABEL: bad_mask_transition_2:			; CHECK-LABEL: bad_mask_transition_2:
	; CHECK: ## %bb.0: ## %entry			; CHECK: ## %bb.0: ## %entry
	; CHECK-NEXT: vcmplt_oqpd %zmm1, %zmm0, %k0			; CHECK-NEXT: vcmplt_oqpd %zmm1, %zmm0, %k0
	; CHECK-NEXT: kmovw %k0, %eax			; CHECK-NEXT: kmovw %k0, %eax
	; CHECK-NEXT: movzbl %al, %eax			; CHECK-NEXT: movzbl %al, %eax
	; CHECK-NEXT: kmovw %eax, %k1			; CHECK-NEXT: kmovw %eax, %k1
	; CHECK-NEXT: vblendmps %zmm5, %zmm4, %zmm0 {%k1}			; CHECK-NEXT: vblendmps %zmm5, %zmm4, %zmm0 {%k1}
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	entry:			entry:
	%0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 17, i8 -1, i32 4)			%0 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 17, i32 4)
	%conv = zext i8 %0 to i16			%1 = bitcast <8 x i1> %0 to i8
	%1 = bitcast i16 %conv to <16 x i1>			%conv = zext i8 %1 to i16
	%2 = select <16 x i1> %1, <16 x float> %f, <16 x float> %e			%2 = bitcast i16 %conv to <16 x i1>
	ret <16 x float> %2			%3 = select <16 x i1> %2, <16 x float> %f, <16 x float> %e
				ret <16 x float> %3
	}			}

llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll

This file is larger than 256 KB, so syntax highlighting is disabled by default.

	Show First 20 Lines • Show All 6,112 Lines • ▼ Show 20 Lines
	; CHECK-NEXT: ## kill: def $al killed $al killed $eax			; CHECK-NEXT: ## kill: def $al killed $al killed $eax
	; CHECK-NEXT: retq ## encoding: [0xc3]			; CHECK-NEXT: retq ## encoding: [0xc3]
	%res = call i8 @llvm.x86.avx512.ptestnm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2)			%res = call i8 @llvm.x86.avx512.ptestnm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2)
	%res1 = call i8 @llvm.x86.avx512.ptestnm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8-1)			%res1 = call i8 @llvm.x86.avx512.ptestnm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8-1)
	%res2 = add i8 %res, %res1			%res2 = add i8 %res, %res1
	ret i8 %res2			ret i8 %res2
	}			}

				define i8 @test_cmpps_256(<8 x float> %a, <8 x float> %b) {
				; CHECK-LABEL: test_cmpps_256:
				; CHECK: ## %bb.0:
				; CHECK-NEXT: vcmpleps %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7c,0x28,0xc2,0xc1,0x02]
				; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
				; CHECK-NEXT: ## kill: def $al killed $al killed $eax
				; CHECK-NEXT: retq ## encoding: [0xc3]
				%res = call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a, <8 x float> %b, i32 2, i8 -1)
				ret i8 %res
				}
				declare i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> , <8 x float> , i32, i8)

				define i8 @test_cmpps_128(<4 x float> %a, <4 x float> %b) {
				; CHECK-LABEL: test_cmpps_128:
				; CHECK: ## %bb.0:
				; CHECK-NEXT: vcmpleps %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x02]
				; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
				; CHECK-NEXT: ## kill: def $al killed $al killed $eax
				; CHECK-NEXT: retq ## encoding: [0xc3]
				%res = call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %a, <4 x float> %b, i32 2, i8 -1)
				ret i8 %res
				}
				declare i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> , <4 x float> , i32, i8)

				define i8 @test_cmppd_256(<4 x double> %a, <4 x double> %b) {
				; CHECK-LABEL: test_cmppd_256:
				; CHECK: ## %bb.0:
				; CHECK-NEXT: vcmplepd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0xfd,0x28,0xc2,0xc1,0x02]
				; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
				; CHECK-NEXT: ## kill: def $al killed $al killed $eax
				; CHECK-NEXT: retq ## encoding: [0xc3]
				%res = call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %a, <4 x double> %b, i32 2, i8 -1)
				ret i8 %res
				}
				declare i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> , <4 x double> , i32, i8)

				define i8 @test_cmppd_128(<2 x double> %a, <2 x double> %b) {
				; CHECK-LABEL: test_cmppd_128:
				; CHECK: ## %bb.0:
				; CHECK-NEXT: vcmplepd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0xfd,0x08,0xc2,0xc1,0x02]
				; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
				; CHECK-NEXT: ## kill: def $al killed $al killed $eax
				; CHECK-NEXT: retq ## encoding: [0xc3]
				%res = call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %a, <2 x double> %b, i32 2, i8 -1)
				ret i8 %res
				}
				declare i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> , <2 x double> , i32, i8)

llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll

This file is larger than 256 KB, so syntax highlighting is disabled by default.

	Show First 20 Lines • Show All 714 Lines • ▼ Show 20 Lines

	define i8 @test_cmpps_256(<8 x float> %a, <8 x float> %b) {			define i8 @test_cmpps_256(<8 x float> %a, <8 x float> %b) {
	; CHECK-LABEL: test_cmpps_256:			; CHECK-LABEL: test_cmpps_256:
	; CHECK: ## %bb.0:			; CHECK: ## %bb.0:
	; CHECK-NEXT: vcmpleps %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7c,0x28,0xc2,0xc1,0x02]			; CHECK-NEXT: vcmpleps %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7c,0x28,0xc2,0xc1,0x02]
	; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]			; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
	; CHECK-NEXT: ## kill: def $al killed $al killed $eax			; CHECK-NEXT: ## kill: def $al killed $al killed $eax
	; CHECK-NEXT: retq ## encoding: [0xc3]			; CHECK-NEXT: retq ## encoding: [0xc3]
	%res = call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a, <8 x float> %b, i32 2, i8 -1)			%res = call <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a, <8 x float> %b, i32 2)
	ret i8 %res			%1 = bitcast <8 x i1> %res to i8
				ret i8 %1
	}			}
	declare i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> , <8 x float> , i32, i8)			declare <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float>, <8 x float>, i32)

	define i8 @test_cmpps_128(<4 x float> %a, <4 x float> %b) {			define i8 @test_cmpps_128(<4 x float> %a, <4 x float> %b) {
	; CHECK-LABEL: test_cmpps_128:			; CHECK-LABEL: test_cmpps_128:
	; CHECK: ## %bb.0:			; CHECK: ## %bb.0:
	; CHECK-NEXT: vcmpleps %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x02]			; CHECK-NEXT: vcmpleps %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x02]
	; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]			; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
	; CHECK-NEXT: ## kill: def $al killed $al killed $eax			; CHECK-NEXT: ## kill: def $al killed $al killed $eax
	; CHECK-NEXT: retq ## encoding: [0xc3]			; CHECK-NEXT: retq ## encoding: [0xc3]
	%res = call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %a, <4 x float> %b, i32 2, i8 -1)			%res = call <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %a, <4 x float> %b, i32 2)
	ret i8 %res			%1 = shufflevector <4 x i1> %res, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
				%2 = bitcast <8 x i1> %1 to i8
				ret i8 %2
	}			}
	declare i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> , <4 x float> , i32, i8)			declare <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float>, <4 x float>, i32)

	define i8 @test_cmppd_256(<4 x double> %a, <4 x double> %b) {			define i8 @test_cmppd_256(<4 x double> %a, <4 x double> %b) {
	; CHECK-LABEL: test_cmppd_256:			; CHECK-LABEL: test_cmppd_256:
	; CHECK: ## %bb.0:			; CHECK: ## %bb.0:
	; CHECK-NEXT: vcmplepd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0xfd,0x28,0xc2,0xc1,0x02]			; CHECK-NEXT: vcmplepd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0xfd,0x28,0xc2,0xc1,0x02]
	; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]			; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
	; CHECK-NEXT: ## kill: def $al killed $al killed $eax			; CHECK-NEXT: ## kill: def $al killed $al killed $eax
	; CHECK-NEXT: retq ## encoding: [0xc3]			; CHECK-NEXT: retq ## encoding: [0xc3]
	%res = call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %a, <4 x double> %b, i32 2, i8 -1)			%res = call <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %a, <4 x double> %b, i32 2)
	ret i8 %res			%1 = shufflevector <4 x i1> %res, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
				%2 = bitcast <8 x i1> %1 to i8
				ret i8 %2
	}			}
	declare i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> , <4 x double> , i32, i8)			declare <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double>, <4 x double>, i32)

	define i8 @test_cmppd_128(<2 x double> %a, <2 x double> %b) {			define i8 @test_cmppd_128(<2 x double> %a, <2 x double> %b) {
	; CHECK-LABEL: test_cmppd_128:			; CHECK-LABEL: test_cmppd_128:
	; CHECK: ## %bb.0:			; CHECK: ## %bb.0:
	; CHECK-NEXT: vcmplepd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0xfd,0x08,0xc2,0xc1,0x02]			; CHECK-NEXT: vcmplepd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0xfd,0x08,0xc2,0xc1,0x02]
	; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]			; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
	; CHECK-NEXT: ## kill: def $al killed $al killed $eax			; CHECK-NEXT: ## kill: def $al killed $al killed $eax
	; CHECK-NEXT: retq ## encoding: [0xc3]			; CHECK-NEXT: retq ## encoding: [0xc3]
	%res = call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %a, <2 x double> %b, i32 2, i8 -1)			%res = call <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %a, <2 x double> %b, i32 2)
	ret i8 %res			%1 = shufflevector <2 x i1> %res, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
				%2 = bitcast <8 x i1> %1 to i8
				ret i8 %2
	}			}
	declare i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> , <2 x double> , i32, i8)			declare <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double>, <2 x double>, i32)

	define <8 x float> @test_mm512_maskz_max_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {			define <8 x float> @test_mm512_maskz_max_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
	; CHECK-LABEL: test_mm512_maskz_max_ps_256:			; CHECK-LABEL: test_mm512_maskz_max_ps_256:
	; CHECK: ## %bb.0:			; CHECK: ## %bb.0:
	; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]			; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
	; CHECK-NEXT: vmaxps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x5f,0xc1]			; CHECK-NEXT: vmaxps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x5f,0xc1]
	; CHECK-NEXT: retq ## encoding: [0xc3]			; CHECK-NEXT: retq ## encoding: [0xc3]
	%1 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1)			%1 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1)
	▲ Show 20 Lines • Show All 4,421 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 21,788 Lines • ▼ Show 20 Lines	entry:
%3 = bitcast i8 %__u to <8 x i1>		%3 = bitcast i8 %__u to <8 x i1>
%4 = and <8 x i1> %3, %2		%4 = and <8 x i1> %3, %2
%5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>		%5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%6 = bitcast <64 x i1> %5 to i64		%6 = bitcast <64 x i1> %5 to i64
ret i64 %6		ret i64 %6
}		}


declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> , <16 x float> , i32, i16, i32)		declare <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, i32)
define zeroext i8 @test_vcmpoeqps_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {		define zeroext i8 @test_vcmpoeqps_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqps_v4i1_v8i1_mask:		; VLX-LABEL: test_vcmpoeqps_v4i1_v8i1_mask:
; VLX: # %bb.0: # %entry		; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0		; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax		; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: # kill: def $al killed $al killed $eax		; VLX-NEXT: # kill: def $al killed $al killed $eax
; VLX-NEXT: retq		; VLX-NEXT: retq
;		;
▲ Show 20 Lines • Show All 1,450 Lines • ▼ Show 20 Lines
; CHECK: # %bb.0: # %entry		; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0		; CHECK-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax		; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: vzeroupper		; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq		; CHECK-NEXT: retq
entry:		entry:
%0 = bitcast <8 x i64> %__a to <16 x float>		%0 = bitcast <8 x i64> %__a to <16 x float>
%1 = bitcast <8 x i64> %__b to <16 x float>		%1 = bitcast <8 x i64> %__b to <16 x float>
%2 = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i16 -1, i32 8)		%2 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i32 8)
%3 = zext i16 %2 to i32		%3 = bitcast <16 x i1> %2 to i16
ret i32 %3		%4 = zext i16 %3 to i32
		ret i32 %4
}		}

define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_sae_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {		define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_sae_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_sae_mask:		; VLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_sae_mask:
; VLX: # %bb.0: # %entry		; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1		; VLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
; VLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 {%k1}		; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: kmovw %k0, %eax		; VLX-NEXT: andl %edi, %eax
; VLX-NEXT: vzeroupper		; VLX-NEXT: vzeroupper
; VLX-NEXT: retq		; VLX-NEXT: retq
;		;
; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_sae_mask:		; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_sae_mask:
; NoVLX: # %bb.0: # %entry		; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: kmovw %edi, %k1		; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kmovw %k0, %eax		; NoVLX-NEXT: kmovw %k0, %eax
		; NoVLX-NEXT: andl %edi, %eax
; NoVLX-NEXT: vzeroupper		; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq		; NoVLX-NEXT: retq
entry:		entry:
%0 = bitcast <8 x i64> %__a to <16 x float>		%0 = bitcast <8 x i64> %__a to <16 x float>
%1 = bitcast <8 x i64> %__b to <16 x float>		%1 = bitcast <8 x i64> %__b to <16 x float>
%2 = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i16 %__u, i32 8)		%2 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i32 8)
%3 = zext i16 %2 to i32		%3 = bitcast i16 %__u to <16 x i1>
ret i32 %3		%4 = and <16 x i1> %2, %3
		%5 = bitcast <16 x i1> %4 to i16
		%6 = zext i16 %5 to i32
		ret i32 %6
}		}



define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {		define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask:		; VLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask:
; VLX: # %bb.0: # %entry		; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0		; VLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
▲ Show 20 Lines • Show All 168 Lines • ▼ Show 20 Lines
; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0		; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax		; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movzwl %ax, %eax		; NoVLX-NEXT: movzwl %ax, %eax
; NoVLX-NEXT: vzeroupper		; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq		; NoVLX-NEXT: retq
entry:		entry:
%0 = bitcast <8 x i64> %__a to <16 x float>		%0 = bitcast <8 x i64> %__a to <16 x float>
%1 = bitcast <8 x i64> %__b to <16 x float>		%1 = bitcast <8 x i64> %__b to <16 x float>
%2 = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i16 -1, i32 8)		%2 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i32 8)
%3 = zext i16 %2 to i64		%3 = bitcast <16 x i1> %2 to i16
ret i64 %3		%4 = zext i16 %3 to i64
		ret i64 %4
}		}

define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_sae_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {		define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_sae_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_sae_mask:		; VLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_sae_mask:
; VLX: # %bb.0: # %entry		; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1		; VLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
; VLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax		; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: movzwl %ax, %eax		; VLX-NEXT: andl %edi, %eax
; VLX-NEXT: vzeroupper		; VLX-NEXT: vzeroupper
; VLX-NEXT: retq		; VLX-NEXT: retq
;		;
; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_sae_mask:		; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_sae_mask:
; NoVLX: # %bb.0: # %entry		; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: kmovw %edi, %k1		; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kmovw %k0, %eax		; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movzwl %ax, %eax		; NoVLX-NEXT: andl %edi, %eax
; NoVLX-NEXT: vzeroupper		; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq		; NoVLX-NEXT: retq
entry:		entry:
%0 = bitcast <8 x i64> %__a to <16 x float>		%0 = bitcast <8 x i64> %__a to <16 x float>
%1 = bitcast <8 x i64> %__b to <16 x float>		%1 = bitcast <8 x i64> %__b to <16 x float>
%2 = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i16 %__u, i32 8)		%2 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i32 8)
%3 = zext i16 %2 to i64		%3 = bitcast i16 %__u to <16 x i1>
ret i64 %3		%4 = and <16 x i1> %2, %3
		%5 = bitcast <16 x i1> %4 to i16
		%6 = zext i16 %5 to i64
		ret i64 %6
}		}



declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> , <8 x double> , i32, i8, i32)		declare <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double>, <8 x double>, i32, i32)
define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {		define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask:		; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask:
; VLX: # %bb.0: # %entry		; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0		; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
; VLX-NEXT: kmovb %k0, %eax		; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq		; VLX-NEXT: retq
;		;
; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask:		; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask:
▲ Show 20 Lines • Show All 1,855 Lines • ▼ Show 20 Lines
; NoVLX-NEXT: kmovw %k0, %eax		; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movzbl %al, %eax		; NoVLX-NEXT: movzbl %al, %eax
; NoVLX-NEXT: # kill: def $ax killed $ax killed $eax		; NoVLX-NEXT: # kill: def $ax killed $ax killed $eax
; NoVLX-NEXT: vzeroupper		; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq		; NoVLX-NEXT: retq
entry:		entry:
%0 = bitcast <8 x i64> %__a to <8 x double>		%0 = bitcast <8 x i64> %__a to <8 x double>
%1 = bitcast <8 x i64> %__b to <8 x double>		%1 = bitcast <8 x i64> %__b to <8 x double>
%2 = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i8 -1, i32 8)		%2 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i32 8)
%3 = zext i8 %2 to i16		%3 = bitcast <8 x i1> %2 to i8
ret i16 %3		%4 = zext i8 %3 to i16
		ret i16 %4
}		}

define zeroext i16 @test_masked_vcmpoeqpd_v8i1_v16i1_sae_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {		define zeroext i16 @test_masked_vcmpoeqpd_v8i1_v16i1_sae_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v16i1_sae_mask:		; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v16i1_sae_mask:
; VLX: # %bb.0: # %entry		; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1		; VLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
; VLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax		; VLX-NEXT: kmovd %k0, %eax
		; VLX-NEXT: andb %dil, %al
; VLX-NEXT: movzbl %al, %eax		; VLX-NEXT: movzbl %al, %eax
; VLX-NEXT: # kill: def $ax killed $ax killed $eax		; VLX-NEXT: # kill: def $ax killed $ax killed $eax
; VLX-NEXT: vzeroupper		; VLX-NEXT: vzeroupper
; VLX-NEXT: retq		; VLX-NEXT: retq
;		;
; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v16i1_sae_mask:		; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v16i1_sae_mask:
; NoVLX: # %bb.0: # %entry		; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: kmovw %edi, %k1		; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kmovw %k0, %eax		; NoVLX-NEXT: kmovw %k0, %eax
		; NoVLX-NEXT: andb %dil, %al
; NoVLX-NEXT: movzbl %al, %eax		; NoVLX-NEXT: movzbl %al, %eax
; NoVLX-NEXT: # kill: def $ax killed $ax killed $eax		; NoVLX-NEXT: # kill: def $ax killed $ax killed $eax
; NoVLX-NEXT: vzeroupper		; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq		; NoVLX-NEXT: retq
entry:		entry:
%0 = bitcast <8 x i64> %__a to <8 x double>		%0 = bitcast <8 x i64> %__a to <8 x double>
%1 = bitcast <8 x i64> %__b to <8 x double>		%1 = bitcast <8 x i64> %__b to <8 x double>
%2 = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i8 %__u, i32 8)		%2 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i32 8)
%3 = zext i8 %2 to i16		%3 = bitcast i8 %__u to <8 x i1>
ret i16 %3		%4 = and <8 x i1> %2, %3
		%5 = bitcast <8 x i1> %4 to i8
		%6 = zext i8 %5 to i16
		ret i16 %6
}		}



define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {		define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask:		; VLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask:
; VLX: # %bb.0: # %entry		; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0		; VLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
▲ Show 20 Lines • Show All 164 Lines • ▼ Show 20 Lines
; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0		; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax		; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movzbl %al, %eax		; NoVLX-NEXT: movzbl %al, %eax
; NoVLX-NEXT: vzeroupper		; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq		; NoVLX-NEXT: retq
entry:		entry:
%0 = bitcast <8 x i64> %__a to <8 x double>		%0 = bitcast <8 x i64> %__a to <8 x double>
%1 = bitcast <8 x i64> %__b to <8 x double>		%1 = bitcast <8 x i64> %__b to <8 x double>
%2 = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i8 -1, i32 8)		%2 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i32 8)
%3 = zext i8 %2 to i32		%3 = bitcast <8 x i1> %2 to i8
ret i32 %3		%4 = zext i8 %3 to i32
		ret i32 %4
}		}

define zeroext i32 @test_masked_vcmpoeqpd_v8i1_v32i1_sae_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {		define zeroext i32 @test_masked_vcmpoeqpd_v8i1_v32i1_sae_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_sae_mask:		; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_sae_mask:
; VLX: # %bb.0: # %entry		; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1		; VLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
; VLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1}		; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: kmovb %k0, %eax		; VLX-NEXT: andb %dil, %al
		; VLX-NEXT: movzbl %al, %eax
; VLX-NEXT: vzeroupper		; VLX-NEXT: vzeroupper
; VLX-NEXT: retq		; VLX-NEXT: retq
;		;
; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_sae_mask:		; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_sae_mask:
; NoVLX: # %bb.0: # %entry		; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: kmovw %edi, %k1		; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kmovw %k0, %eax		; NoVLX-NEXT: kmovw %k0, %eax
		; NoVLX-NEXT: andb %dil, %al
; NoVLX-NEXT: movzbl %al, %eax		; NoVLX-NEXT: movzbl %al, %eax
; NoVLX-NEXT: vzeroupper		; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq		; NoVLX-NEXT: retq
entry:		entry:
%0 = bitcast <8 x i64> %__a to <8 x double>		%0 = bitcast <8 x i64> %__a to <8 x double>
%1 = bitcast <8 x i64> %__b to <8 x double>		%1 = bitcast <8 x i64> %__b to <8 x double>
%2 = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i8 %__u, i32 8)		%2 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i32 8)
%3 = zext i8 %2 to i32		%3 = bitcast i8 %__u to <8 x i1>
ret i32 %3		%4 = and <8 x i1> %2, %3
		%5 = bitcast <8 x i1> %4 to i8
		%6 = zext i8 %5 to i32
		ret i32 %6
}		}



define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {		define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask:		; VLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask:
; VLX: # %bb.0: # %entry		; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0		; VLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
▲ Show 20 Lines • Show All 171 Lines • ▼ Show 20 Lines
; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0		; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax		; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movzbl %al, %eax		; NoVLX-NEXT: movzbl %al, %eax
; NoVLX-NEXT: vzeroupper		; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq		; NoVLX-NEXT: retq
entry:		entry:
%0 = bitcast <8 x i64> %__a to <8 x double>		%0 = bitcast <8 x i64> %__a to <8 x double>
%1 = bitcast <8 x i64> %__b to <8 x double>		%1 = bitcast <8 x i64> %__b to <8 x double>
%2 = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i8 -1, i32 8)		%2 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i32 8)
%3 = zext i8 %2 to i64		%3 = bitcast <8 x i1> %2 to i8
ret i64 %3		%4 = zext i8 %3 to i64
		ret i64 %4
}		}

define zeroext i64 @test_masked_vcmpoeqpd_v8i1_v64i1_sae_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {		define zeroext i64 @test_masked_vcmpoeqpd_v8i1_v64i1_sae_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_sae_mask:		; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_sae_mask:
; VLX: # %bb.0: # %entry		; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1		; VLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
; VLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax		; VLX-NEXT: kmovd %k0, %eax
		; VLX-NEXT: andb %dil, %al
; VLX-NEXT: movzbl %al, %eax		; VLX-NEXT: movzbl %al, %eax
; VLX-NEXT: vzeroupper		; VLX-NEXT: vzeroupper
; VLX-NEXT: retq		; VLX-NEXT: retq
;		;
; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_sae_mask:		; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_sae_mask:
; NoVLX: # %bb.0: # %entry		; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: kmovw %edi, %k1		; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kmovw %k0, %eax		; NoVLX-NEXT: kmovw %k0, %eax
		; NoVLX-NEXT: andb %dil, %al
; NoVLX-NEXT: movzbl %al, %eax		; NoVLX-NEXT: movzbl %al, %eax
; NoVLX-NEXT: vzeroupper		; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq		; NoVLX-NEXT: retq
entry:		entry:
%0 = bitcast <8 x i64> %__a to <8 x double>		%0 = bitcast <8 x i64> %__a to <8 x double>
%1 = bitcast <8 x i64> %__b to <8 x double>		%1 = bitcast <8 x i64> %__b to <8 x double>
%2 = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i8 %__u, i32 8)		%2 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i32 8)
%3 = zext i8 %2 to i64		%3 = bitcast i8 %__u to <8 x i1>
ret i64 %3		%4 = and <8 x i1> %2, %3
		%5 = bitcast <8 x i1> %4 to i8
		%6 = zext i8 %5 to i64
		ret i64 %6
}		}

; Test that we understand that cmpps with rounding zeros the upper bits of the mask register.		; Test that we understand that cmpps with rounding zeros the upper bits of the mask register.
define i32 @test_cmpm_rnd_zero(<16 x float> %a, <16 x float> %b) {		define i32 @test_cmpm_rnd_zero(<16 x float> %a, <16 x float> %b) {
; VLX-LABEL: test_cmpm_rnd_zero:		; VLX-LABEL: test_cmpm_rnd_zero:
; VLX: # %bb.0:		; VLX: # %bb.0:
; VLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0		; VLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax		; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper		; VLX-NEXT: vzeroupper
; VLX-NEXT: retq		; VLX-NEXT: retq
;		;
; NoVLX-LABEL: test_cmpm_rnd_zero:		; NoVLX-LABEL: test_cmpm_rnd_zero:
; NoVLX: # %bb.0:		; NoVLX: # %bb.0:
; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0		; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax		; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vzeroupper		; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq		; NoVLX-NEXT: retq
%res = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i16 -1, i32 8)		%res = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i32 8)
%cast = bitcast i16 %res to <16 x i1>		%1 = bitcast <16 x i1> %res to i16
		%cast = bitcast i16 %1 to <16 x i1>
%shuffle = shufflevector <16 x i1> %cast, <16 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>		%shuffle = shufflevector <16 x i1> %cast, <16 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
%cast2 = bitcast <32 x i1> %shuffle to i32		%cast2 = bitcast <32 x i1> %shuffle to i32
ret i32 %cast2		ret i32 %cast2
}		}

define i8 @mask_zero_lower(<4 x i32> %a) {		define i8 @mask_zero_lower(<4 x i32> %a) {
; VLX-LABEL: mask_zero_lower:		; VLX-LABEL: mask_zero_lower:
; VLX: # %bb.0:		; VLX: # %bb.0:
Show All 21 Lines

llvm/trunk/test/CodeGen/X86/stack-folding-fp-avx512.ll

Show First 20 Lines • Show All 134 Lines • ▼ Show 20 Lines	define <16 x float> @stack_fold_andps_zmm(<16 x float> %a0, <16 x float> %a1) {
%6 = fadd <16 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>		%6 = fadd <16 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
ret <16 x float> %6		ret <16 x float> %6
}		}

define i8 @stack_fold_cmppd(<8 x double> %a0, <8 x double> %a1) {		define i8 @stack_fold_cmppd(<8 x double> %a0, <8 x double> %a1) {
;CHECK-LABEL: stack_fold_cmppd		;CHECK-LABEL: stack_fold_cmppd
;CHECK: vcmpeqpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-9]*}} {{.*#+}} 64-byte Folded Reload		;CHECK: vcmpeqpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-9]*}} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%res = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a0, <8 x double> %a1, i32 0, i8 -1, i32 4)		%res = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a0, <8 x double> %a1, i32 0, i32 4)
ret i8 %res		%2 = bitcast <8 x i1> %res to i8
		ret i8 %2
}		}
declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> , <8 x double> , i32, i8, i32)		declare <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double>, <8 x double>, i32, i32)

define i16 @stack_fold_cmpps(<16 x float> %a0, <16 x float> %a1) {		define i16 @stack_fold_cmpps(<16 x float> %a0, <16 x float> %a1) {
;CHECK-LABEL: stack_fold_cmpps		;CHECK-LABEL: stack_fold_cmpps
;CHECK: vcmpeqps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-9]*}} {{.*#+}} 64-byte Folded Reload		;CHECK: vcmpeqps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-9]*}} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%res = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a0, <16 x float> %a1, i32 0, i16 -1, i32 4)		%res = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a0, <16 x float> %a1, i32 0, i32 4)
ret i16 %res		%2 = bitcast <16 x i1> %res to i16
		ret i16 %2
}		}
declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> , <16 x float> , i32, i16, i32)		declare <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, i32)

define <2 x double> @stack_fold_divsd_int(<2 x double> %a0, <2 x double> %a1) {		define <2 x double> @stack_fold_divsd_int(<2 x double> %a0, <2 x double> %a1) {
;CHECK-LABEL: stack_fold_divsd_int		;CHECK-LABEL: stack_fold_divsd_int
;CHECK: vdivsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: vdivsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = extractelement <2 x double> %a0, i32 0		%2 = extractelement <2 x double> %a0, i32 0
%3 = extractelement <2 x double> %a1, i32 0		%3 = extractelement <2 x double> %a1, i32 0
%4 = fdiv double %2, %3		%4 = fdiv double %2, %3
▲ Show 20 Lines • Show All 691 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/stack-folding-fp-avx512vl.ll

Show First 20 Lines • Show All 146 Lines • ▼ Show 20 Lines	define <8 x float> @stack_fold_andps_ymm(<8 x float> %a0, <8 x float> %a1) {
%6 = fadd <8 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>		%6 = fadd <8 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
ret <8 x float> %6		ret <8 x float> %6
}		}

define i8 @stack_fold_cmppd(<2 x double> %a0, <2 x double> %a1) {		define i8 @stack_fold_cmppd(<2 x double> %a0, <2 x double> %a1) {
;CHECK-LABEL: stack_fold_cmppd		;CHECK-LABEL: stack_fold_cmppd
;CHECK: vcmpeqpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%k[0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: vcmpeqpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%k[0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%res = call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %a0, <2 x double> %a1, i32 0, i8 -1)		%res = call <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %a0, <2 x double> %a1, i32 0)
ret i8 %res		%2 = shufflevector <2 x i1> %res, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
		%3 = bitcast <8 x i1> %2 to i8
		ret i8 %3
}		}
declare i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> , <2 x double> , i32, i8)		declare <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double>, <2 x double>, i32)

define i8 @stack_fold_cmppd_ymm(<4 x double> %a0, <4 x double> %a1) {		define i8 @stack_fold_cmppd_ymm(<4 x double> %a0, <4 x double> %a1) {
;CHECK-LABEL: stack_fold_cmppd_ymm		;CHECK-LABEL: stack_fold_cmppd_ymm
;CHECK: vcmpeqpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%k[0-9]*}} {{.*#+}} 32-byte Folded Reload		;CHECK: vcmpeqpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%k[0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%res = call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %a0, <4 x double> %a1, i32 0, i8 -1)		%res = call <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %a0, <4 x double> %a1, i32 0)
ret i8 %res		%2 = shufflevector <4 x i1> %res, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
		%3 = bitcast <8 x i1> %2 to i8
		ret i8 %3
}		}
declare i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> , <4 x double> , i32, i8)		declare <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double>, <4 x double>, i32)

define i8 @stack_fold_cmpps(<4 x float> %a0, <4 x float> %a1) {		define i8 @stack_fold_cmpps(<4 x float> %a0, <4 x float> %a1) {
;CHECK-LABEL: stack_fold_cmpps		;CHECK-LABEL: stack_fold_cmpps
;CHECK: vcmpeqps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%k[0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: vcmpeqps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%k[0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%res = call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %a0, <4 x float> %a1, i32 0, i8 -1)		%res = call <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %a0, <4 x float> %a1, i32 0)
ret i8 %res		%2 = shufflevector <4 x i1> %res, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
		%3 = bitcast <8 x i1> %2 to i8
		ret i8 %3
}		}
declare i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> , <4 x float> , i32, i8)		declare <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float>, <4 x float>, i32)

define i8 @stack_fold_cmpps_ymm(<8 x float> %a0, <8 x float> %a1) {		define i8 @stack_fold_cmpps_ymm(<8 x float> %a0, <8 x float> %a1) {
;CHECK-LABEL: stack_fold_cmpps_ymm		;CHECK-LABEL: stack_fold_cmpps_ymm
;CHECK: vcmpeqps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%k[0-9]*}} {{.*#+}} 32-byte Folded Reload		;CHECK: vcmpeqps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%k[0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%res = call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a0, <8 x float> %a1, i32 0, i8 -1)		%res = call <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a0, <8 x float> %a1, i32 0)
ret i8 %res		%2 = bitcast <8 x i1> %res to i8
		ret i8 %2
}		}
declare i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> , <8 x float> , i32, i8)		declare <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float>, <8 x float>, i32)

define <2 x double> @stack_fold_divpd(<2 x double> %a0, <2 x double> %a1) {		define <2 x double> @stack_fold_divpd(<2 x double> %a0, <2 x double> %a1) {
;CHECK-LABEL: stack_fold_divpd		;CHECK-LABEL: stack_fold_divpd
;CHECK: vdivpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: vdivpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fdiv <2 x double> %a0, %a1		%2 = fdiv <2 x double> %a0, %a1
ret <2 x double> %2		ret <2 x double> %2
}		}
▲ Show 20 Lines • Show All 619 Lines • Show Last 20 Lines

llvm/trunk/test/Transforms/InstCombine/X86/X86FsubCmpCombine.ll

	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	; RUN: opt < %s -instcombine -S | FileCheck %s			; RUN: opt < %s -instcombine -S | FileCheck %s

	; The test checks the folding of cmp(sub(a,b),0) into cmp(a,b).			; The test checks the folding of cmp(sub(a,b),0) into cmp(a,b).

	define i8 @sub_compare_foldingPD128_safe(<2 x double> %a, <2 x double> %b){			define i8 @sub_compare_foldingPD128_safe(<2 x double> %a, <2 x double> %b){
	; CHECK-LABEL: @sub_compare_foldingPD128_safe(			; CHECK-LABEL: @sub_compare_foldingPD128_safe(
	; CHECK-NEXT: entry:			; CHECK-NEXT: entry:
	; CHECK-NEXT: [[SUB_SAFE:%.*]] = fsub <2 x double> [[A:%.*]], [[B:%.*]]			; CHECK-NEXT: [[SUB_SAFE:%.*]] = fsub <2 x double> [[A:%.*]], [[B:%.*]]
	; CHECK-NEXT: [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> [[SUB_SAFE]], <2 x double> zeroinitializer, i32 5, i8 -1)			; CHECK-NEXT: [[TMP0:%.*]] = call <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> [[SUB_SAFE]], <2 x double> zeroinitializer, i32 5)
	; CHECK-NEXT: ret i8 [[TMP0]]			; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i1> [[TMP0]], <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
				; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i1> [[TMP1]] to i8
				; CHECK-NEXT: ret i8 [[TMP2]]
	;			;
	entry:			entry:
	%sub.safe = fsub <2 x double> %a, %b			%sub.safe = fsub <2 x double> %a, %b
	%0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %sub.safe , <2 x double> zeroinitializer, i32 5, i8 -1)			%0 = call <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %sub.safe, <2 x double> zeroinitializer, i32 5)
	ret i8 %0			%1 = shufflevector <2 x i1> %0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
				%2 = bitcast <8 x i1> %1 to i8
				ret i8 %2
	}			}


	define i8 @sub_compare_foldingPD128(<2 x double> %a, <2 x double> %b){			define i8 @sub_compare_foldingPD128(<2 x double> %a, <2 x double> %b){
	; CHECK-LABEL: @sub_compare_foldingPD128(			; CHECK-LABEL: @sub_compare_foldingPD128(
	; CHECK-NEXT: entry:			; CHECK-NEXT: entry:
	; CHECK-NEXT: [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], i32 5, i8 -1)			; CHECK-NEXT: [[TMP0:%.*]] = call <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], i32 5)
	; CHECK-NEXT: ret i8 [[TMP0]]			; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i1> [[TMP0]], <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
				; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i1> [[TMP1]] to i8
				; CHECK-NEXT: ret i8 [[TMP2]]
	;			;
	entry:			entry:
	%sub.i = fsub ninf <2 x double> %a, %b			%sub.i = fsub ninf <2 x double> %a, %b
	%0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %sub.i , <2 x double> zeroinitializer, i32 5, i8 -1)			%0 = call <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %sub.i, <2 x double> zeroinitializer, i32 5)
	ret i8 %0			%1 = shufflevector <2 x i1> %0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
				%2 = bitcast <8 x i1> %1 to i8
				ret i8 %2
	}			}


	define i8 @sub_compare_foldingPD256(<4 x double> %a, <4 x double> %b){			define i8 @sub_compare_foldingPD256(<4 x double> %a, <4 x double> %b){
	; CHECK-LABEL: @sub_compare_foldingPD256(			; CHECK-LABEL: @sub_compare_foldingPD256(
	; CHECK-NEXT: entry:			; CHECK-NEXT: entry:
	; CHECK-NEXT: [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> [[A:%.*]], <4 x double> [[B:%.*]], i32 5, i8 -1)			; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> [[A:%.*]], <4 x double> [[B:%.*]], i32 5)
	; CHECK-NEXT: ret i8 [[TMP0]]			; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i1> [[TMP0]], <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
				; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i1> [[TMP1]] to i8
				; CHECK-NEXT: ret i8 [[TMP2]]
	;			;
	entry:			entry:
	%sub.i1 = fsub ninf <4 x double> %a, %b			%sub.i1 = fsub ninf <4 x double> %a, %b
	%0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %sub.i1, <4 x double> zeroinitializer, i32 5, i8 -1)			%0 = call <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %sub.i1, <4 x double> zeroinitializer, i32 5)
	ret i8 %0			%1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
				%2 = bitcast <8 x i1> %1 to i8
				ret i8 %2
	}			}


	define i8 @sub_compare_foldingPD512(<8 x double> %a, <8 x double> %b){			define i8 @sub_compare_foldingPD512(<8 x double> %a, <8 x double> %b){
	; CHECK-LABEL: @sub_compare_foldingPD512(			; CHECK-LABEL: @sub_compare_foldingPD512(
	; CHECK-NEXT: entry:			; CHECK-NEXT: entry:
	; CHECK-NEXT: [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 11, i8 -1, i32 4)			; CHECK-NEXT: [[TMP0:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 11, i32 4)
	; CHECK-NEXT: ret i8 [[TMP0]]			; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i1> [[TMP0]] to i8
				; CHECK-NEXT: ret i8 [[TMP1]]
	;			;
	entry:			entry:
	%sub.i2 = fsub ninf <8 x double> %a, %b			%sub.i2 = fsub ninf <8 x double> %a, %b
	%0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %sub.i2, <8 x double> zeroinitializer, i32 11, i8 -1, i32 4)			%0 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %sub.i2, <8 x double> zeroinitializer, i32 11, i32 4)
	ret i8 %0			%1 = bitcast <8 x i1> %0 to i8
				ret i8 %1
	}			}


	define i8 @sub_compare_foldingPS128(<4 x float> %a, <4 x float> %b){			define i8 @sub_compare_foldingPS128(<4 x float> %a, <4 x float> %b){
	; CHECK-LABEL: @sub_compare_foldingPS128(			; CHECK-LABEL: @sub_compare_foldingPS128(
	; CHECK-NEXT: entry:			; CHECK-NEXT: entry:
	; CHECK-NEXT: [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i32 12, i8 -1)			; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i32 12)
	; CHECK-NEXT: ret i8 [[TMP0]]			; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i1> [[TMP0]], <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
				; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i1> [[TMP1]] to i8
				; CHECK-NEXT: ret i8 [[TMP2]]
	;			;
	entry:			entry:
	%sub.i3 = fsub ninf <4 x float> %a, %b			%sub.i3 = fsub ninf <4 x float> %a, %b
	%0 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %sub.i3, <4 x float> zeroinitializer, i32 12, i8 -1)			%0 = call <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %sub.i3, <4 x float> zeroinitializer, i32 12)
	ret i8 %0			%1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
				%2 = bitcast <8 x i1> %1 to i8
				ret i8 %2
	}			}


	define i8 @sub_compare_foldingPS256(<8 x float> %a, <8 x float> %b){			define i8 @sub_compare_foldingPS256(<8 x float> %a, <8 x float> %b){
	; CHECK-LABEL: @sub_compare_foldingPS256(			; CHECK-LABEL: @sub_compare_foldingPS256(
	; CHECK-NEXT: entry:			; CHECK-NEXT: entry:
	; CHECK-NEXT: [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> [[A:%.*]], <8 x float> [[B:%.*]], i32 5, i8 -1)			; CHECK-NEXT: [[TMP0:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> [[A:%.*]], <8 x float> [[B:%.*]], i32 5)
	; CHECK-NEXT: ret i8 [[TMP0]]			; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i1> [[TMP0]] to i8
				; CHECK-NEXT: ret i8 [[TMP1]]
	;			;
	entry:			entry:
	%sub.i4 = fsub ninf <8 x float> %a, %b			%sub.i4 = fsub ninf <8 x float> %a, %b
	%0 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %sub.i4, <8 x float> zeroinitializer, i32 5, i8 -1)			%0 = call <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %sub.i4, <8 x float> zeroinitializer, i32 5)
	ret i8 %0			%1 = bitcast <8 x i1> %0 to i8
				ret i8 %1
	}			}


	define i16 @sub_compare_foldingPS512(<16 x float> %a, <16 x float> %b){			define i16 @sub_compare_foldingPS512(<16 x float> %a, <16 x float> %b){
	; CHECK-LABEL: @sub_compare_foldingPS512(			; CHECK-LABEL: @sub_compare_foldingPS512(
	; CHECK-NEXT: entry:			; CHECK-NEXT: entry:
	; CHECK-NEXT: [[TMP0:%.*]] = tail call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 11, i16 -1, i32 4)			; CHECK-NEXT: [[TMP0:%.*]] = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 11, i32 4)
	; CHECK-NEXT: ret i16 [[TMP0]]			; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i1> [[TMP0]] to i16
				; CHECK-NEXT: ret i16 [[TMP1]]
	;			;
	entry:			entry:
	%sub.i5 = fsub ninf <16 x float> %a, %b			%sub.i5 = fsub ninf <16 x float> %a, %b
	%0 = tail call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %sub.i5, <16 x float> zeroinitializer, i32 11, i16 -1, i32 4)			%0 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %sub.i5, <16 x float> zeroinitializer, i32 11, i32 4)
	ret i16 %0			%1 = bitcast <16 x i1> %0 to i16
				ret i16 %1
	}			}



	define i8 @sub_compare_folding_swapPD128(<2 x double> %a, <2 x double> %b){			define i8 @sub_compare_folding_swapPD128(<2 x double> %a, <2 x double> %b){
	; CHECK-LABEL: @sub_compare_folding_swapPD128(			; CHECK-LABEL: @sub_compare_folding_swapPD128(
	; CHECK-NEXT: entry:			; CHECK-NEXT: entry:
	; CHECK-NEXT: [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> [[B:%.*]], <2 x double> [[A:%.*]], i32 5, i8 -1)			; CHECK-NEXT: [[TMP0:%.*]] = call <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> [[B:%.*]], <2 x double> [[A:%.*]], i32 5)
	; CHECK-NEXT: ret i8 [[TMP0]]			; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i1> [[TMP0]], <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
				; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i1> [[TMP1]] to i8
				; CHECK-NEXT: ret i8 [[TMP2]]
	;			;
	entry:			entry:
	%sub.i = fsub ninf <2 x double> %a, %b			%sub.i = fsub ninf <2 x double> %a, %b
	%0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> zeroinitializer, <2 x double> %sub.i, i32 5, i8 -1)			%0 = call <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> zeroinitializer, <2 x double> %sub.i, i32 5)
	ret i8 %0			%1 = shufflevector <2 x i1> %0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
				%2 = bitcast <8 x i1> %1 to i8
				ret i8 %2
	}			}


	define i8 @sub_compare_folding_swapPD256(<4 x double> %a, <4 x double> %b){			define i8 @sub_compare_folding_swapPD256(<4 x double> %a, <4 x double> %b){
	; CHECK-LABEL: @sub_compare_folding_swapPD256(			; CHECK-LABEL: @sub_compare_folding_swapPD256(
	; CHECK-NEXT: entry:			; CHECK-NEXT: entry:
	; CHECK-NEXT: [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> [[B:%.*]], <4 x double> [[A:%.*]], i32 5, i8 -1)			; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> [[B:%.*]], <4 x double> [[A:%.*]], i32 5)
	; CHECK-NEXT: ret i8 [[TMP0]]			; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i1> [[TMP0]], <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
				; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i1> [[TMP1]] to i8
				; CHECK-NEXT: ret i8 [[TMP2]]
	;			;
	entry:			entry:
	%sub.i = fsub ninf <4 x double> %a, %b			%sub.i = fsub ninf <4 x double> %a, %b
	%0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> %sub.i, i32 5, i8 -1)			%0 = call <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> %sub.i, i32 5)
	ret i8 %0			%1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
				%2 = bitcast <8 x i1> %1 to i8
				ret i8 %2
	}			}


	define i8 @sub_compare_folding_swapPD256_undef(<4 x double> %a, <4 x double> %b) {			define i8 @sub_compare_folding_swapPD256_undef(<4 x double> %a, <4 x double> %b) {
	; CHECK-LABEL: @sub_compare_folding_swapPD256_undef(			; CHECK-LABEL: @sub_compare_folding_swapPD256_undef(
	; CHECK-NEXT: entry:			; CHECK-NEXT: entry:
	; CHECK-NEXT: [[TMP:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> fsub (<4 x double> undef, <4 x double> undef), <4 x double> zeroinitializer, i32 5, i8 -1)			; CHECK-NEXT: [[TMP:%.*]] = call <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> fsub (<4 x double> undef, <4 x double> undef), <4 x double> zeroinitializer, i32 5)
	; CHECK-NEXT: ret i8 [[TMP]]			; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i1> [[TMP]], <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
				; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i1> [[TMP0]] to i8
				; CHECK-NEXT: ret i8 [[TMP1]]
	;			;
	entry:			entry:
	%sub.i1 = fsub ninf <4 x double> undef, undef			%sub.i1 = fsub ninf <4 x double> undef, undef
	%tmp = tail call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %sub.i1, <4 x double> zeroinitializer, i32 5, i8 -1)			%tmp = call <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %sub.i1, <4 x double> zeroinitializer, i32 5)
	ret i8 %tmp			%0 = shufflevector <4 x i1> %tmp, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
				%1 = bitcast <8 x i1> %0 to i8
				ret i8 %1
	}			}


	define i8 @sub_compare_folding_swapPD512(<8 x double> %a, <8 x double> %b){			define i8 @sub_compare_folding_swapPD512(<8 x double> %a, <8 x double> %b){
	; CHECK-LABEL: @sub_compare_folding_swapPD512(			; CHECK-LABEL: @sub_compare_folding_swapPD512(
	; CHECK-NEXT: entry:			; CHECK-NEXT: entry:
	; CHECK-NEXT: [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[B:%.*]], <8 x double> [[A:%.*]], i32 11, i8 -1, i32 4)			; CHECK-NEXT: [[TMP0:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[B:%.*]], <8 x double> [[A:%.*]], i32 11, i32 4)
	; CHECK-NEXT: ret i8 [[TMP0]]			; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i1> [[TMP0]] to i8
				; CHECK-NEXT: ret i8 [[TMP1]]
	;			;
	entry:			entry:
	%sub.i = fsub ninf <8 x double> %a, %b			%sub.i = fsub ninf <8 x double> %a, %b
	%0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> zeroinitializer, <8 x double> %sub.i, i32 11, i8 -1, i32 4)			%0 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> zeroinitializer, <8 x double> %sub.i, i32 11, i32 4)
	ret i8 %0			%1 = bitcast <8 x i1> %0 to i8
				ret i8 %1
	}			}


	define i8 @sub_compare_folding_swapPS128(<4 x float> %a, <4 x float> %b){			define i8 @sub_compare_folding_swapPS128(<4 x float> %a, <4 x float> %b){
	; CHECK-LABEL: @sub_compare_folding_swapPS128(			; CHECK-LABEL: @sub_compare_folding_swapPS128(
	; CHECK-NEXT: entry:			; CHECK-NEXT: entry:
	; CHECK-NEXT: [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> [[B:%.*]], <4 x float> [[A:%.*]], i32 12, i8 -1)			; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> [[B:%.*]], <4 x float> [[A:%.*]], i32 12)
	; CHECK-NEXT: ret i8 [[TMP0]]			; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i1> [[TMP0]], <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
				; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i1> [[TMP1]] to i8
				; CHECK-NEXT: ret i8 [[TMP2]]
	;			;
	entry:			entry:
	%sub.i = fsub ninf <4 x float> %a, %b			%sub.i = fsub ninf <4 x float> %a, %b
	%0 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> zeroinitializer, <4 x float> %sub.i, i32 12, i8 -1)			%0 = call <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> zeroinitializer, <4 x float> %sub.i, i32 12)
	ret i8 %0			%1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
				%2 = bitcast <8 x i1> %1 to i8
				ret i8 %2
	}			}


	define i8 @sub_compare_folding_swapPS256(<8 x float> %a, <8 x float> %b){			define i8 @sub_compare_folding_swapPS256(<8 x float> %a, <8 x float> %b){
	; CHECK-LABEL: @sub_compare_folding_swapPS256(			; CHECK-LABEL: @sub_compare_folding_swapPS256(
	; CHECK-NEXT: entry:			; CHECK-NEXT: entry:
	; CHECK-NEXT: [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> [[B:%.*]], <8 x float> [[A:%.*]], i32 5, i8 -1)			; CHECK-NEXT: [[TMP0:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> [[B:%.*]], <8 x float> [[A:%.*]], i32 5)
	; CHECK-NEXT: ret i8 [[TMP0]]			; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i1> [[TMP0]] to i8
				; CHECK-NEXT: ret i8 [[TMP1]]
	;			;
	entry:			entry:
	%sub.i = fsub ninf <8 x float> %a, %b			%sub.i = fsub ninf <8 x float> %a, %b
	%0 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> %sub.i, i32 5, i8 -1)			%0 = call <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> %sub.i, i32 5)
	ret i8 %0			%1 = bitcast <8 x i1> %0 to i8
				ret i8 %1
	}			}


	define i16 @sub_compare_folding_swapPS512(<16 x float> %a, <16 x float> %b){			define i16 @sub_compare_folding_swapPS512(<16 x float> %a, <16 x float> %b){
	; CHECK-LABEL: @sub_compare_folding_swapPS512(			; CHECK-LABEL: @sub_compare_folding_swapPS512(
	; CHECK-NEXT: entry:			; CHECK-NEXT: entry:
	; CHECK-NEXT: [[TMP0:%.*]] = tail call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> [[B:%.*]], <16 x float> [[A:%.*]], i32 11, i16 -1, i32 4)			; CHECK-NEXT: [[TMP0:%.*]] = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> [[B:%.*]], <16 x float> [[A:%.*]], i32 11, i32 4)
	; CHECK-NEXT: ret i16 [[TMP0]]			; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i1> [[TMP0]] to i16
				; CHECK-NEXT: ret i16 [[TMP1]]
	;			;
	entry:			entry:
	%sub.i = fsub ninf <16 x float> %a, %b			%sub.i = fsub ninf <16 x float> %a, %b
	%0 = tail call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> zeroinitializer, <16 x float> %sub.i, i32 11, i16 -1, i32 4)			%0 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> zeroinitializer, <16 x float> %sub.i, i32 11, i32 4)
	ret i16 %0			%1 = bitcast <16 x i1> %0 to i16
				ret i16 %1
	}			}

	declare i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double>, <2 x double>, i32, i8)			declare <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double>, <2 x double>, i32)
	declare i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double>, <4 x double>, i32, i8)			declare <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double>, <4 x double>, i32)
	declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double>, <8 x double>, i32, i8, i32)			declare <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double>, <8 x double>, i32, i32)
	declare i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float>, <4 x float>, i32, i8)			declare <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float>, <4 x float>, i32)
	declare i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float>, <8 x float>, i32, i8)			declare <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float>, <8 x float>, i32)
	declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, i16, i32)			declare <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, i32)

This is an archive of the discontinued LLVM Phabricator instance.

[X86] Change signatures of avx512 packed fp compare intrinsics to return a vXi1 mask type to be closer to an fcmp.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 133773

llvm/trunk/include/llvm/IR/IntrinsicsX86.td

llvm/trunk/lib/IR/AutoUpgrade.cpp

llvm/trunk/lib/Target/X86/X86ISelLowering.cpp

llvm/trunk/test/CodeGen/X86/avx512-cmp-kor-sequence.ll

llvm/trunk/test/CodeGen/X86/avx512-intrinsics-upgrade.ll

llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll

llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll

llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll

llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll

llvm/trunk/test/CodeGen/X86/stack-folding-fp-avx512.ll

llvm/trunk/test/CodeGen/X86/stack-folding-fp-avx512vl.ll

llvm/trunk/test/Transforms/InstCombine/X86/X86FsubCmpCombine.ll

[X86] Change signatures of avx512 packed fp compare intrinsics to return a vXi1 mask type to be closer to an fcmp.
ClosedPublic