Index: llvm/trunk/include/llvm/IR/IntrinsicsX86.td
===================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicsX86.td
+++ llvm/trunk/include/llvm/IR/IntrinsicsX86.td
@@ -6386,29 +6386,24 @@
 // Misc.
 let TargetPrefix = "x86" in {
   def int_x86_avx512_mask_cmp_ps_512 :
-         GCCBuiltin<"__builtin_ia32_cmpps512_mask">,
-         Intrinsic<[llvm_i16_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
-                    llvm_i32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
+         Intrinsic<[llvm_v16i1_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
+                    llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_cmp_pd_512 :
-         GCCBuiltin<"__builtin_ia32_cmppd512_mask">,
-         Intrinsic<[llvm_i8_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
-                    llvm_i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
+         Intrinsic<[llvm_v8i1_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
+                    llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_cmp_ps_256 :
-         GCCBuiltin<"__builtin_ia32_cmpps256_mask">,
-         Intrinsic<[llvm_i8_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
-                    llvm_i32_ty, llvm_i8_ty], [IntrNoMem]>;
+         Intrinsic<[llvm_v8i1_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
+                    llvm_i32_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_cmp_pd_256 :
-         GCCBuiltin<"__builtin_ia32_cmppd256_mask">,
-         Intrinsic<[llvm_i8_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
-                    llvm_i32_ty, llvm_i8_ty], [IntrNoMem]>;
+         Intrinsic<[llvm_v4i1_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
+                    llvm_i32_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_cmp_ps_128 :
-         GCCBuiltin<"__builtin_ia32_cmpps128_mask">,
-         Intrinsic<[llvm_i8_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
-                    llvm_i32_ty, llvm_i8_ty], [IntrNoMem]>;
+         Intrinsic<[llvm_v4i1_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
+                    llvm_i32_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_cmp_pd_128 :
-         GCCBuiltin<"__builtin_ia32_cmppd128_mask">,
-         Intrinsic<[llvm_i8_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
-                    llvm_i32_ty, llvm_i8_ty], [IntrNoMem]>;
+         Intrinsic<[llvm_v2i1_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
+                    llvm_i32_ty], [IntrNoMem]>;
+
   def int_x86_avx512_mask_cmp_ss :
          GCCBuiltin<"__builtin_ia32_cmpss_mask">,
          Intrinsic<[llvm_i8_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
Index: llvm/trunk/lib/IR/AutoUpgrade.cpp
===================================================================
--- llvm/trunk/lib/IR/AutoUpgrade.cpp
+++ llvm/trunk/lib/IR/AutoUpgrade.cpp
@@ -65,6 +65,19 @@
   return true;
 }
 
+// Upgrade the declaration of fp compare intrinsics that change return type
+// from scalar to vXi1 mask.
+static bool UpgradeX86MaskedFPCompare(Function *F, Intrinsic::ID IID,
+                                      Function *&NewFn) {
+  // Check if the return type is a vector.
+  if (F->getReturnType()->isVectorTy())
+    return false;
+
+  rename(F);
+  NewFn = Intrinsic::getDeclaration(F->getParent(), IID);
+  return true;
+}
+
 static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
   // All of the intrinsics matches below should be marked with which llvm
   // version started autoupgrading them.
   // At some point in the future we would
@@ -322,6 +335,24 @@
   if (Name == "avx2.mpsadbw") // Added in 3.6
     return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_avx2_mpsadbw,
                                             NewFn);
+  if (Name == "avx512.mask.cmp.pd.128") // Added in 7.0
+    return UpgradeX86MaskedFPCompare(F, Intrinsic::x86_avx512_mask_cmp_pd_128,
+                                     NewFn);
+  if (Name == "avx512.mask.cmp.pd.256") // Added in 7.0
+    return UpgradeX86MaskedFPCompare(F, Intrinsic::x86_avx512_mask_cmp_pd_256,
+                                     NewFn);
+  if (Name == "avx512.mask.cmp.pd.512") // Added in 7.0
+    return UpgradeX86MaskedFPCompare(F, Intrinsic::x86_avx512_mask_cmp_pd_512,
+                                     NewFn);
+  if (Name == "avx512.mask.cmp.ps.128") // Added in 7.0
+    return UpgradeX86MaskedFPCompare(F, Intrinsic::x86_avx512_mask_cmp_ps_128,
+                                     NewFn);
+  if (Name == "avx512.mask.cmp.ps.256") // Added in 7.0
+    return UpgradeX86MaskedFPCompare(F, Intrinsic::x86_avx512_mask_cmp_ps_256,
+                                     NewFn);
+  if (Name == "avx512.mask.cmp.ps.512") // Added in 7.0
+    return UpgradeX86MaskedFPCompare(F, Intrinsic::x86_avx512_mask_cmp_ps_512,
+                                     NewFn);
 
   // frcz.ss/sd may need to have an argument dropped. Added in 3.2
   if (Name.startswith("xop.vfrcz.ss") && F->arg_size() == 2) {
@@ -2417,6 +2448,34 @@
     break;
   }
 
+  case Intrinsic::x86_avx512_mask_cmp_pd_128:
+  case Intrinsic::x86_avx512_mask_cmp_pd_256:
+  case Intrinsic::x86_avx512_mask_cmp_pd_512:
+  case Intrinsic::x86_avx512_mask_cmp_ps_128:
+  case Intrinsic::x86_avx512_mask_cmp_ps_256:
+  case Intrinsic::x86_avx512_mask_cmp_ps_512: {
+    SmallVector<Value *, 4> Args;
+    Args.push_back(CI->getArgOperand(0));
+    Args.push_back(CI->getArgOperand(1));
+    Args.push_back(CI->getArgOperand(2));
+    if (CI->getNumArgOperands() == 5)
+      Args.push_back(CI->getArgOperand(4));
+
+    NewCall = Builder.CreateCall(NewFn, Args);
+    unsigned NumElts = Args[0]->getType()->getVectorNumElements();
+    Value *Res = ApplyX86MaskOn1BitsVec(Builder, NewCall, CI->getArgOperand(3),
+                                        NumElts);
+
+    std::string Name = CI->getName();
+    if (!Name.empty()) {
+      CI->setName(Name + ".old");
+      NewCall->setName(Name);
+    }
+    CI->replaceAllUsesWith(Res);
+    CI->eraseFromParent();
+    return;
+  }
+
   case Intrinsic::thread_pointer: {
     NewCall = Builder.CreateCall(NewFn, {});
     break;
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -20355,8 +20354,7 @@
     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, FPclassMask,
                        DAG.getIntPtrConstant(0, dl));
   }
-  case CMP_MASK:
-  case CMP_MASK_CC: {
+  case CMP_MASK: {
     // Comparison intrinsics with masks.
     // Example of transformation:
     // (i8 (int_x86_avx512_mask_pcmpeq_q_128
@@ -20371,29 +20370,8 @@
     SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
     MVT BitcastVT = MVT::getVectorVT(MVT::i1,
                                      Mask.getSimpleValueType().getSizeInBits());
-    SDValue Cmp;
-    if (IntrData->Type == CMP_MASK_CC) {
-      SDValue CC = Op.getOperand(3);
-      CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
-      // We specify 2 possible opcodes for intrinsics with rounding modes.
-      // First, we check if the intrinsic may have non-default rounding mode,
-      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
-      if (IntrData->Opc1 != 0) {
-        SDValue Rnd = Op.getOperand(5);
-        if (!isRoundModeCurDirection(Rnd))
-          Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
-                            Op.getOperand(2), CC, Rnd);
-      }
-      //default rounding mode
-      if(!Cmp.getNode())
-        Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
-                          Op.getOperand(2), CC);
-
-    } else {
-      assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
-      Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
-                        Op.getOperand(2));
-    }
+    SDValue Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
+                              Op.getOperand(2));
     SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, SDValue(),
                                            Subtarget, DAG);
     SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
@@ -20401,6 +20379,29 @@
                               DAG.getIntPtrConstant(0, dl));
     return DAG.getBitcast(Op.getValueType(), Res);
   }
+
+  case CMP_MASK_CC: {
+    MVT VT = Op.getOperand(1).getSimpleValueType();
+    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+    SDValue Cmp;
+    SDValue CC = Op.getOperand(3);
+    CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
+    // We specify 2 possible opcodes for intrinsics with rounding modes.
+    // First, we check if the intrinsic may have non-default rounding mode,
+    // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+    if (IntrData->Opc1 != 0) {
+      SDValue Rnd = Op.getOperand(4);
+      if (!isRoundModeCurDirection(Rnd))
+        Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
+                          Op.getOperand(2), CC, Rnd);
+    }
+    //default rounding mode
+    if (!Cmp.getNode())
+      Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
+                        Op.getOperand(2), CC);
+
+    return Cmp;
+  }
   case CMP_MASK_SCALAR_CC: {
     SDValue Src1 = Op.getOperand(1);
     SDValue Src2 = Op.getOperand(2);
Index: llvm/trunk/test/CodeGen/X86/avx512-cmp-kor-sequence.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-cmp-kor-sequence.ll
+++ llvm/trunk/test/CodeGen/X86/avx512-cmp-kor-sequence.ll
@@ -12,35 +12,41 @@
 ; CHECK-LABEL: cmp_kor_seq_16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vcmpgeps %zmm4, %zmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %ecx
-; CHECK-NEXT:    vcmpgeps %zmm4, %zmm1, %k0
-; CHECK-NEXT:    kmovw %k0, %edx
-; CHECK-NEXT:    vcmpgeps %zmm4, %zmm2, %k0
-; CHECK-NEXT:    kmovw %k0, %esi
-; CHECK-NEXT:    vcmpgeps %zmm4, %zmm3, %k0
+; CHECK-NEXT:    vcmpgeps %zmm4, %zmm1, %k1
+; CHECK-NEXT:    korw %k1, %k0, %k0
+; CHECK-NEXT:    vcmpgeps %zmm4, %zmm2, %k1
+; CHECK-NEXT:    vcmpgeps %zmm4, %zmm3, %k2
+; CHECK-NEXT:    korw %k2, %k1, %k1
+; CHECK-NEXT:    korw %k1, %k0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    orl %ecx, %edx
-; CHECK-NEXT:    orl %esi, %eax
-; CHECK-NEXT:    orl %edx, %eax
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
 entry:
-  %0 = tail call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %x, i32 13, i16 -1, i32 4)
-  %1 = tail call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %b, <16 x float> %x, i32 13, i16 -1, i32 4)
-  %2 = tail call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %c, <16 x float> %x, i32 13, i16 -1, i32 4)
-  %3 = tail call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %d, <16 x float> %x, i32 13, i16 -1, i32 4)
-  %4 = tail call i16 @llvm.x86.avx512.kor.w(i16 %0, i16 %1) #2
-  %5 = tail call i16 @llvm.x86.avx512.kor.w(i16 %2, i16 %3) #2
-  %6 = tail call i16 @llvm.x86.avx512.kor.w(i16 %4, i16 %5) #2
-  ret i16 %6
+  %0 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %x, i32 13, i32 4)
+  %1 = bitcast <16 x i1> %0
to i16 + %2 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %b, <16 x float> %x, i32 13, i32 4) + %3 = bitcast <16 x i1> %2 to i16 + %4 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %c, <16 x float> %x, i32 13, i32 4) + %5 = bitcast <16 x i1> %4 to i16 + %6 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %d, <16 x float> %x, i32 13, i32 4) + %7 = bitcast <16 x i1> %6 to i16 + %8 = bitcast i16 %1 to <16 x i1> + %9 = bitcast i16 %3 to <16 x i1> + %10 = or <16 x i1> %8, %9 + %11 = bitcast <16 x i1> %10 to i16 + %12 = bitcast i16 %5 to <16 x i1> + %13 = bitcast i16 %7 to <16 x i1> + %14 = or <16 x i1> %12, %13 + %15 = bitcast <16 x i1> %14 to i16 + %16 = bitcast i16 %11 to <16 x i1> + %17 = bitcast i16 %15 to <16 x i1> + %18 = or <16 x i1> %16, %17 + %19 = bitcast <16 x i1> %18 to i16 + ret i16 %19 } ; Function Attrs: nounwind readnone -declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, i16, i32) #1 - -; Function Attrs: nounwind readnone -declare i16 @llvm.x86.avx512.kor.w(i16, i16) #1 +declare <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, i32) #1 attributes #0 = { nounwind readnone uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="knl" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512er,+avx512f,+avx512pf,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prefetchwt1,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind readnone } -attributes #2 = { nounwind } Index: llvm/trunk/test/CodeGen/X86/avx512-intrinsics-upgrade.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ llvm/trunk/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -3860,3 +3860,27 @@ %res = call i32 @llvm.x86.avx512.kortestz.w(i16 %6, i16 %7) ret i32 %res } + +define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) { +; CHECK-LABEL: test_cmpps: +; CHECK: ## %bb.0: +; CHECK-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq + %res = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i16 -1, i32 8) + ret i16 %res +} +declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> , <16 x float> , i32, i16, i32) + +define i8 @test_cmppd(<8 x double> %a, <8 x double> %b) { +; CHECK-LABEL: test_cmppd: +; CHECK: ## %bb.0: +; CHECK-NEXT: vcmpneqpd %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: ## kill: def $al killed $al killed $eax +; CHECK-NEXT: retq + %res = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, i8 -1, i32 4) + ret i8 %res +} +declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> , <8 x double> , i32, i8, i32) Index: llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll +++ llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll @@ -665,29 +665,33 @@ } declare <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(i8*) nounwind readonly - define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) { +define i16 
@test_cmpps(<16 x float> %a, <16 x float> %b) { ; CHECK-LABEL: test_cmpps: ; CHECK: ## %bb.0: ; CHECK-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq - %res = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i16 -1, i32 8) - ret i16 %res - } - declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> , <16 x float> , i32, i16, i32) + %res = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i32 8) + %1 = bitcast <16 x i1> %res to i16 + ret i16 %1 +} +declare <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, i32) - define i8 @test_cmppd(<8 x double> %a, <8 x double> %b) { +define i8 @test_cmppd(<8 x double> %a, <8 x double> %b) { ; CHECK-LABEL: test_cmppd: ; CHECK: ## %bb.0: ; CHECK-NEXT: vcmpneqpd %zmm1, %zmm0, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: ## kill: def $al killed $al killed $eax ; CHECK-NEXT: retq - %res = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, i8 -1, i32 4) - ret i8 %res - } - declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> , <8 x double> , i32, i8, i32) + %res = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, i32 4) + %1 = bitcast <8 x i1> %res to i8 + ret i8 %1 +} +declare <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double>, <8 x double>, i32, i32) + +; Function Attrs: nounwind readnone ; fp min - max define <8 x double> @test_vmaxpd(<8 x double> %a0, <8 x double> %a1) { @@ -5001,17 +5005,19 @@ ; CHECK-NEXT: vblendmps %zmm5, %zmm4, %zmm0 {%k1} ; CHECK-NEXT: retq entry: - %0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 17, i8 -1, i32 4) - %1 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %c, <8 x double> %d, i32 17, i8 -1, i32 4) - %conv = zext i8 %0 to i16 - %conv2 = zext i8 %1 to i16 - %2 = bitcast i16 %conv to <16 x i1> - %3 = bitcast i16 %conv2 to <16 x i1> - %4 = shufflevector <16 x i1> %2, <16 x i1> undef, <8 x i32> - %5 = shufflevector <16 x i1> %3, <16 x i1> undef, <8 x i32> - %6 = shufflevector <8 x i1> %4, <8 x i1> %5, <16 x i32> - %7 = select <16 x i1> %6, <16 x float> %f, <16 x float> %e - ret <16 x float> %7 + %0 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 17, i32 4) + %1 = bitcast <8 x i1> %0 to i8 + %2 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %c, <8 x double> %d, i32 17, i32 4) + %3 = bitcast <8 x i1> %2 to i8 + %conv = zext i8 %1 to i16 + %conv2 = zext i8 %3 to i16 + %4 = bitcast i16 %conv to <16 x i1> + %5 = bitcast i16 %conv2 to <16 x i1> + %6 = shufflevector <16 x i1> %4, <16 x i1> undef, <8 x i32> + %7 = shufflevector <16 x i1> %5, <16 x i1> undef, <8 x i32> + %8 = shufflevector <8 x i1> %6, <8 x i1> %7, <16 x i32> + %9 = select <16 x i1> %8, <16 x float> %f, <16 x float> %e + ret <16 x float> %9 } define <16 x float> @bad_mask_transition_2(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x double> %d, <16 x float> %e, <16 x float> %f) { @@ -5024,9 +5030,10 @@ ; CHECK-NEXT: vblendmps %zmm5, %zmm4, %zmm0 {%k1} ; CHECK-NEXT: retq entry: - %0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 17, i8 -1, i32 4) - %conv = zext i8 %0 to i16 - %1 = bitcast i16 %conv to <16 x i1> - %2 = select <16 x i1> %1, <16 x float> %f, <16 x float> %e - ret <16 x float> %2 + %0 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x 
double> %a, <8 x double> %b, i32 17, i32 4) + %1 = bitcast <8 x i1> %0 to i8 + %conv = zext i8 %1 to i16 + %2 = bitcast i16 %conv to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %f, <16 x float> %e + ret <16 x float> %3 } Index: llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -6118,3 +6118,50 @@ ret i8 %res2 } +define i8 @test_cmpps_256(<8 x float> %a, <8 x float> %b) { +; CHECK-LABEL: test_cmpps_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vcmpleps %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7c,0x28,0xc2,0xc1,0x02] +; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: ## kill: def $al killed $al killed $eax +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a, <8 x float> %b, i32 2, i8 -1) + ret i8 %res +} +declare i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> , <8 x float> , i32, i8) + +define i8 @test_cmpps_128(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test_cmpps_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vcmpleps %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x02] +; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: ## kill: def $al killed $al killed $eax +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %a, <4 x float> %b, i32 2, i8 -1) + ret i8 %res +} +declare i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> , <4 x float> , i32, i8) + +define i8 @test_cmppd_256(<4 x double> %a, <4 x double> %b) { +; CHECK-LABEL: test_cmppd_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vcmplepd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0xfd,0x28,0xc2,0xc1,0x02] +; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: ## kill: def $al killed $al killed $eax +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %a, <4 x double> %b, i32 2, i8 -1) + ret i8 %res +} +declare i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> , <4 x double> , i32, i8) + +define i8 @test_cmppd_128(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: test_cmppd_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vcmplepd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0xfd,0x08,0xc2,0xc1,0x02] +; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: ## kill: def $al killed $al killed $eax +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %a, <2 x double> %b, i32 2, i8 -1) + ret i8 %res +} +declare i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> , <2 x double> , i32, i8) Index: llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll +++ llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -720,10 +720,11 @@ ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: def $al killed $al killed $eax ; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a, <8 x float> %b, i32 2, i8 -1) - ret i8 %res - } - declare i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> , <8 x float> , i32, i8) + %res = call <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a, <8 x float> %b, i32 2) + %1 = bitcast <8 x i1> %res to i8 + ret i8 %1 +} +declare 
<8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float>, <8 x float>, i32) define i8 @test_cmpps_128(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: test_cmpps_128: @@ -732,10 +733,12 @@ ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: def $al killed $al killed $eax ; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %a, <4 x float> %b, i32 2, i8 -1) - ret i8 %res - } - declare i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> , <4 x float> , i32, i8) + %res = call <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %a, <4 x float> %b, i32 2) + %1 = shufflevector <4 x i1> %res, <4 x i1> zeroinitializer, <8 x i32> + %2 = bitcast <8 x i1> %1 to i8 + ret i8 %2 +} +declare <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float>, <4 x float>, i32) define i8 @test_cmppd_256(<4 x double> %a, <4 x double> %b) { ; CHECK-LABEL: test_cmppd_256: @@ -744,10 +747,12 @@ ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: def $al killed $al killed $eax ; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %a, <4 x double> %b, i32 2, i8 -1) - ret i8 %res - } - declare i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> , <4 x double> , i32, i8) + %res = call <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %a, <4 x double> %b, i32 2) + %1 = shufflevector <4 x i1> %res, <4 x i1> zeroinitializer, <8 x i32> + %2 = bitcast <8 x i1> %1 to i8 + ret i8 %2 +} +declare <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double>, <4 x double>, i32) define i8 @test_cmppd_128(<2 x double> %a, <2 x double> %b) { ; CHECK-LABEL: test_cmppd_128: @@ -756,10 +761,12 @@ ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: def $al killed $al killed $eax ; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %a, <2 x double> %b, i32 2, i8 -1) - ret i8 %res - } - declare i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> , <2 x double> , i32, i8) + %res = call <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %a, <2 x double> %b, i32 2) + %1 = shufflevector <2 x i1> %res, <2 x i1> zeroinitializer, <8 x i32> + %2 = bitcast <8 x i1> %1 to i8 + ret i8 %2 +} +declare <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double>, <2 x double>, i32) define <8 x float> @test_mm512_maskz_max_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) { ; CHECK-LABEL: test_mm512_maskz_max_ps_256: Index: llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll +++ llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll @@ -21794,7 +21794,7 @@ } -declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> , <16 x float> , i32, i16, i32) +declare <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, i32) define zeroext i8 @test_vcmpoeqps_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { ; VLX-LABEL: test_vcmpoeqps_v4i1_v8i1_mask: ; VLX: # %bb.0: # %entry @@ -23261,33 +23261,37 @@ entry: %0 = bitcast <8 x i64> %__a to <16 x float> %1 = bitcast <8 x i64> %__b to <16 x float> - %2 = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i16 -1, i32 8) - %3 = zext i16 %2 to i32 - ret i32 %3 + %2 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i32 8) + %3 = bitcast <16 x i1> %2 to i16 
+ %4 = zext i16 %3 to i32 + ret i32 %4 } define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_sae_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_sae_mask: ; VLX: # %bb.0: # %entry -; VLX-NEXT: kmovd %edi, %k1 -; VLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 {%k1} -; VLX-NEXT: kmovw %k0, %eax +; VLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: andl %edi, %eax ; VLX-NEXT: vzeroupper ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_sae_mask: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: kmovw %edi, %k1 -; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl %edi, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x float> %1 = bitcast <8 x i64> %__b to <16 x float> - %2 = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i16 %__u, i32 8) - %3 = zext i16 %2 to i32 - ret i32 %3 + %2 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i32 8) + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = bitcast <16 x i1> %4 to i16 + %6 = zext i16 %5 to i32 + ret i32 %6 } @@ -23472,40 +23476,42 @@ entry: %0 = bitcast <8 x i64> %__a to <16 x float> %1 = bitcast <8 x i64> %__b to <16 x float> - %2 = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i16 -1, i32 8) - %3 = zext i16 %2 to i64 - ret i64 %3 + %2 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i32 8) + %3 = bitcast <16 x i1> %2 to i16 + %4 = zext i16 %3 to i64 + ret i64 %4 } define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_sae_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_sae_mask: ; VLX: # %bb.0: # %entry -; VLX-NEXT: kmovd %edi, %k1 -; VLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 {%k1} +; VLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 ; VLX-NEXT: kmovd %k0, %eax -; VLX-NEXT: movzwl %ax, %eax +; VLX-NEXT: andl %edi, %eax ; VLX-NEXT: vzeroupper ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_sae_mask: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: kmovw %edi, %k1 -; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: movzwl %ax, %eax +; NoVLX-NEXT: andl %edi, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x float> %1 = bitcast <8 x i64> %__b to <16 x float> - %2 = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i16 %__u, i32 8) - %3 = zext i16 %2 to i64 - ret i64 %3 + %2 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i32 8) + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = bitcast <16 x i1> %4 to i16 + %6 = zext i16 %5 to i64 + ret i64 %6 } -declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> , <8 x double> , i32, i8, i32) +declare <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double>, <8 x double>, i32, i32) define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { ; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask: ; VLX: # %bb.0: # %entry @@ -25377,17 +25383,18 @@ entry: %0 = bitcast <8 x i64> %__a to <8 x double> %1 = bitcast <8 x i64> %__b to <8 x 
double> - %2 = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i8 -1, i32 8) - %3 = zext i8 %2 to i16 - ret i16 %3 + %2 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i32 8) + %3 = bitcast <8 x i1> %2 to i8 + %4 = zext i8 %3 to i16 + ret i16 %4 } define zeroext i16 @test_masked_vcmpoeqpd_v8i1_v16i1_sae_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v16i1_sae_mask: ; VLX: # %bb.0: # %entry -; VLX-NEXT: kmovd %edi, %k1 -; VLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1} +; VLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 ; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: andb %dil, %al ; VLX-NEXT: movzbl %al, %eax ; VLX-NEXT: # kill: def $ax killed $ax killed $eax ; VLX-NEXT: vzeroupper @@ -25395,9 +25402,9 @@ ; ; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v16i1_sae_mask: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: kmovw %edi, %k1 -; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andb %dil, %al ; NoVLX-NEXT: movzbl %al, %eax ; NoVLX-NEXT: # kill: def $ax killed $ax killed $eax ; NoVLX-NEXT: vzeroupper @@ -25405,9 +25412,12 @@ entry: %0 = bitcast <8 x i64> %__a to <8 x double> %1 = bitcast <8 x i64> %__b to <8 x double> - %2 = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i8 %__u, i32 8) - %3 = zext i8 %2 to i16 - ret i16 %3 + %2 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i32 8) + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = bitcast <8 x i1> %4 to i8 + %6 = zext i8 %5 to i16 + ret i16 %6 } @@ -25588,34 +25598,39 @@ entry: %0 = bitcast <8 x i64> %__a to <8 x double> %1 = bitcast <8 x i64> %__b to <8 x double> - %2 = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i8 -1, i32 8) - %3 = zext i8 %2 to i32 - ret i32 %3 + %2 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i32 8) + %3 = bitcast <8 x i1> %2 to i8 + %4 = zext i8 %3 to i32 + ret i32 %4 } define zeroext i32 @test_masked_vcmpoeqpd_v8i1_v32i1_sae_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_sae_mask: ; VLX: # %bb.0: # %entry -; VLX-NEXT: kmovd %edi, %k1 -; VLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1} -; VLX-NEXT: kmovb %k0, %eax +; VLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: andb %dil, %al +; VLX-NEXT: movzbl %al, %eax ; VLX-NEXT: vzeroupper ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_sae_mask: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: kmovw %edi, %k1 -; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andb %dil, %al ; NoVLX-NEXT: movzbl %al, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> %1 = bitcast <8 x i64> %__b to <8 x double> - %2 = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i8 %__u, i32 8) - %3 = zext i8 %2 to i32 - ret i32 %3 + %2 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i32 8) + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = bitcast <8 x i1> %4 to i8 + %6 = zext i8 %5 to i32 + ret i32 %6 } @@ -25803,35 +25818,39 @@ entry: %0 = bitcast 
<8 x i64> %__a to <8 x double> %1 = bitcast <8 x i64> %__b to <8 x double> - %2 = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i8 -1, i32 8) - %3 = zext i8 %2 to i64 - ret i64 %3 + %2 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i32 8) + %3 = bitcast <8 x i1> %2 to i8 + %4 = zext i8 %3 to i64 + ret i64 %4 } define zeroext i64 @test_masked_vcmpoeqpd_v8i1_v64i1_sae_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_sae_mask: ; VLX: # %bb.0: # %entry -; VLX-NEXT: kmovd %edi, %k1 -; VLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1} +; VLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 ; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: andb %dil, %al ; VLX-NEXT: movzbl %al, %eax ; VLX-NEXT: vzeroupper ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_sae_mask: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: kmovw %edi, %k1 -; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andb %dil, %al ; NoVLX-NEXT: movzbl %al, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> %1 = bitcast <8 x i64> %__b to <8 x double> - %2 = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i8 %__u, i32 8) - %3 = zext i8 %2 to i64 - ret i64 %3 + %2 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i32 8) + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = bitcast <8 x i1> %4 to i8 + %6 = zext i8 %5 to i64 + ret i64 %6 } ; Test that we understand that cmpps with rounding zeros the upper bits of the mask register. 
@@ -25849,8 +25868,9 @@ ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq - %res = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i16 -1, i32 8) - %cast = bitcast i16 %res to <16 x i1> + %res = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i32 8) + %1 = bitcast <16 x i1> %res to i16 + %cast = bitcast i16 %1 to <16 x i1> %shuffle = shufflevector <16 x i1> %cast, <16 x i1> zeroinitializer, <32 x i32> %cast2 = bitcast <32 x i1> %shuffle to i32 ret i32 %cast2 Index: llvm/trunk/test/CodeGen/X86/stack-folding-fp-avx512.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/stack-folding-fp-avx512.ll +++ llvm/trunk/test/CodeGen/X86/stack-folding-fp-avx512.ll @@ -140,19 +140,21 @@ ;CHECK-LABEL: stack_fold_cmppd ;CHECK: vcmpeqpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-9]}} {{.*#+}} 64-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %res = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a0, <8 x double> %a1, i32 0, i8 -1, i32 4) - ret i8 %res + %res = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a0, <8 x double> %a1, i32 0, i32 4) + %2 = bitcast <8 x i1> %res to i8 + ret i8 %2 } -declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> , <8 x double> , i32, i8, i32) +declare <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double>, <8 x double>, i32, i32) define i16 @stack_fold_cmpps(<16 x float> %a0, <16 x float> %a1) { ;CHECK-LABEL: stack_fold_cmpps ;CHECK: vcmpeqps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-9]*}} {{.*#+}} 64-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %res = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a0, <16 x float> %a1, i32 0, i16 -1, i32 4) - ret i16 %res + %res = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a0, <16 x float> %a1, i32 0, i32 4) + %2 = bitcast <16 x i1> %res to i16 + ret i16 %2 } -declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> , <16 x float> , i32, i16, i32) +declare <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, i32) define <2 x double> @stack_fold_divsd_int(<2 x double> %a0, <2 x double> %a1) { ;CHECK-LABEL: stack_fold_divsd_int Index: llvm/trunk/test/CodeGen/X86/stack-folding-fp-avx512vl.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/stack-folding-fp-avx512vl.ll +++ llvm/trunk/test/CodeGen/X86/stack-folding-fp-avx512vl.ll @@ -152,37 +152,44 @@ ;CHECK-LABEL: stack_fold_cmppd ;CHECK: vcmpeqpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%k[0-9]}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %res = call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %a0, <2 x double> %a1, i32 0, i8 -1) - ret i8 %res + %res = call <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %a0, <2 x double> %a1, i32 0) + %2 = shufflevector <2 x i1> %res, <2 x i1> zeroinitializer, <8 x i32> + %3 = bitcast <8 x i1> %2 to i8 + ret i8 %3 } -declare i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> , <2 x double> , i32, i8) +declare <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double>, <2 x double>, i32) define i8 @stack_fold_cmppd_ymm(<4 x double> %a0, <4 x double> %a1) { ;CHECK-LABEL: stack_fold_cmppd_ymm ;CHECK: vcmpeqpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%k[0-9]}} {{.*#+}} 32-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %res = call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %a0, <4 x double> %a1, i32 0, i8 -1) - ret i8 %res + %res = call <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %a0, <4 x double> %a1, i32 0) + %2 = shufflevector <4 x i1> %res, <4 x i1> zeroinitializer, <8 x i32> + %3 = bitcast <8 x i1> %2 to i8 + ret i8 %3 } -declare i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> , <4 x double> , i32, i8) +declare <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double>, <4 x double>, i32) define i8 @stack_fold_cmpps(<4 x float> %a0, <4 x float> %a1) { ;CHECK-LABEL: stack_fold_cmpps ;CHECK: vcmpeqps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%k[0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %res = call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %a0, <4 x float> %a1, i32 0, i8 -1) - ret i8 %res + %res = call <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %a0, <4 x float> %a1, i32 0) + %2 = shufflevector <4 x i1> %res, <4 x i1> zeroinitializer, <8 x i32> + %3 = bitcast <8 x i1> %2 to i8 + ret i8 %3 } -declare i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> , <4 x float> , i32, i8) +declare <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float>, <4 x float>, i32) define i8 @stack_fold_cmpps_ymm(<8 x float> %a0, <8 x float> %a1) { ;CHECK-LABEL: stack_fold_cmpps_ymm ;CHECK: vcmpeqps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%k[0-9]*}} {{.*#+}} 32-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %res = call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a0, <8 x float> %a1, i32 0, i8 -1) - ret i8 %res + %res = call <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a0, <8 x float> %a1, 
i32 0) + %2 = bitcast <8 x i1> %res to i8 + ret i8 %2 } -declare i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> , <8 x float> , i32, i8) +declare <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float>, <8 x float>, i32) define <2 x double> @stack_fold_divpd(<2 x double> %a0, <2 x double> %a1) { ;CHECK-LABEL: stack_fold_divpd Index: llvm/trunk/test/Transforms/InstCombine/X86/X86FsubCmpCombine.ll =================================================================== --- llvm/trunk/test/Transforms/InstCombine/X86/X86FsubCmpCombine.ll +++ llvm/trunk/test/Transforms/InstCombine/X86/X86FsubCmpCombine.ll @@ -7,91 +7,113 @@ ; CHECK-LABEL: @sub_compare_foldingPD128_safe( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SUB_SAFE:%.*]] = fsub <2 x double> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> [[SUB_SAFE]], <2 x double> zeroinitializer, i32 5, i8 -1) -; CHECK-NEXT: ret i8 [[TMP0]] +; CHECK-NEXT: [[TMP0:%.*]] = call <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> [[SUB_SAFE]], <2 x double> zeroinitializer, i32 5) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i1> [[TMP0]], <2 x i1> zeroinitializer, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i1> [[TMP1]] to i8 +; CHECK-NEXT: ret i8 [[TMP2]] ; entry: %sub.safe = fsub <2 x double> %a, %b - %0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %sub.safe , <2 x double> zeroinitializer, i32 5, i8 -1) - ret i8 %0 + %0 = call <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %sub.safe, <2 x double> zeroinitializer, i32 5) + %1 = shufflevector <2 x i1> %0, <2 x i1> zeroinitializer, <8 x i32> + %2 = bitcast <8 x i1> %1 to i8 + ret i8 %2 } define i8 @sub_compare_foldingPD128(<2 x double> %a, <2 x double> %b){ ; CHECK-LABEL: @sub_compare_foldingPD128( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], i32 5, i8 -1) -; CHECK-NEXT: ret i8 [[TMP0]] +; CHECK-NEXT: [[TMP0:%.*]] = call <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], i32 5) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i1> [[TMP0]], <2 x i1> zeroinitializer, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i1> [[TMP1]] to i8 +; CHECK-NEXT: ret i8 [[TMP2]] ; entry: %sub.i = fsub ninf <2 x double> %a, %b - %0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %sub.i , <2 x double> zeroinitializer, i32 5, i8 -1) - ret i8 %0 + %0 = call <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %sub.i, <2 x double> zeroinitializer, i32 5) + %1 = shufflevector <2 x i1> %0, <2 x i1> zeroinitializer, <8 x i32> + %2 = bitcast <8 x i1> %1 to i8 + ret i8 %2 } define i8 @sub_compare_foldingPD256(<4 x double> %a, <4 x double> %b){ ; CHECK-LABEL: @sub_compare_foldingPD256( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> [[A:%.*]], <4 x double> [[B:%.*]], i32 5, i8 -1) -; CHECK-NEXT: ret i8 [[TMP0]] +; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> [[A:%.*]], <4 x double> [[B:%.*]], i32 5) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i1> [[TMP0]], <4 x i1> zeroinitializer, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i1> [[TMP1]] to i8 +; CHECK-NEXT: ret i8 [[TMP2]] ; entry: %sub.i1 = fsub ninf <4 x double> %a, %b - %0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %sub.i1, <4 x double> zeroinitializer, i32 5, i8 -1) - ret i8 %0 + %0 = call <4 x i1> 
@llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %sub.i1, <4 x double> zeroinitializer, i32 5) + %1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> + %2 = bitcast <8 x i1> %1 to i8 + ret i8 %2 } define i8 @sub_compare_foldingPD512(<8 x double> %a, <8 x double> %b){ ; CHECK-LABEL: @sub_compare_foldingPD512( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 11, i8 -1, i32 4) -; CHECK-NEXT: ret i8 [[TMP0]] +; CHECK-NEXT: [[TMP0:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 11, i32 4) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i1> [[TMP0]] to i8 +; CHECK-NEXT: ret i8 [[TMP1]] ; entry: %sub.i2 = fsub ninf <8 x double> %a, %b - %0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %sub.i2, <8 x double> zeroinitializer, i32 11, i8 -1, i32 4) - ret i8 %0 + %0 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %sub.i2, <8 x double> zeroinitializer, i32 11, i32 4) + %1 = bitcast <8 x i1> %0 to i8 + ret i8 %1 } define i8 @sub_compare_foldingPS128(<4 x float> %a, <4 x float> %b){ ; CHECK-LABEL: @sub_compare_foldingPS128( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i32 12, i8 -1) -; CHECK-NEXT: ret i8 [[TMP0]] +; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i32 12) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i1> [[TMP0]], <4 x i1> zeroinitializer, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i1> [[TMP1]] to i8 +; CHECK-NEXT: ret i8 [[TMP2]] ; entry: %sub.i3 = fsub ninf <4 x float> %a, %b - %0 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %sub.i3, <4 x float> zeroinitializer, i32 12, i8 -1) - ret i8 %0 + %0 = call <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %sub.i3, <4 x float> zeroinitializer, i32 12) + %1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> + %2 = bitcast <8 x i1> %1 to i8 + ret i8 %2 } define i8 @sub_compare_foldingPS256(<8 x float> %a, <8 x float> %b){ ; CHECK-LABEL: @sub_compare_foldingPS256( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> [[A:%.*]], <8 x float> [[B:%.*]], i32 5, i8 -1) -; CHECK-NEXT: ret i8 [[TMP0]] +; CHECK-NEXT: [[TMP0:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> [[A:%.*]], <8 x float> [[B:%.*]], i32 5) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i1> [[TMP0]] to i8 +; CHECK-NEXT: ret i8 [[TMP1]] ; entry: %sub.i4 = fsub ninf <8 x float> %a, %b - %0 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %sub.i4, <8 x float> zeroinitializer, i32 5, i8 -1) - ret i8 %0 + %0 = call <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %sub.i4, <8 x float> zeroinitializer, i32 5) + %1 = bitcast <8 x i1> %0 to i8 + ret i8 %1 } define i16 @sub_compare_foldingPS512(<16 x float> %a, <16 x float> %b){ ; CHECK-LABEL: @sub_compare_foldingPS512( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = tail call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 11, i16 -1, i32 4) -; CHECK-NEXT: ret i16 [[TMP0]] +; CHECK-NEXT: [[TMP0:%.*]] = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 11, i32 4) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i1> [[TMP0]] to i16 +; CHECK-NEXT: ret i16 [[TMP1]] ; 
entry: %sub.i5 = fsub ninf <16 x float> %a, %b - %0 = tail call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %sub.i5, <16 x float> zeroinitializer, i32 11, i16 -1, i32 4) - ret i16 %0 + %0 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %sub.i5, <16 x float> zeroinitializer, i32 11, i32 4) + %1 = bitcast <16 x i1> %0 to i16 + ret i16 %1 } @@ -99,96 +121,118 @@ define i8 @sub_compare_folding_swapPD128(<2 x double> %a, <2 x double> %b){ ; CHECK-LABEL: @sub_compare_folding_swapPD128( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> [[B:%.*]], <2 x double> [[A:%.*]], i32 5, i8 -1) -; CHECK-NEXT: ret i8 [[TMP0]] +; CHECK-NEXT: [[TMP0:%.*]] = call <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> [[B:%.*]], <2 x double> [[A:%.*]], i32 5) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i1> [[TMP0]], <2 x i1> zeroinitializer, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i1> [[TMP1]] to i8 +; CHECK-NEXT: ret i8 [[TMP2]] ; entry: %sub.i = fsub ninf <2 x double> %a, %b - %0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> zeroinitializer, <2 x double> %sub.i, i32 5, i8 -1) - ret i8 %0 + %0 = call <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> zeroinitializer, <2 x double> %sub.i, i32 5) + %1 = shufflevector <2 x i1> %0, <2 x i1> zeroinitializer, <8 x i32> + %2 = bitcast <8 x i1> %1 to i8 + ret i8 %2 } define i8 @sub_compare_folding_swapPD256(<4 x double> %a, <4 x double> %b){ ; CHECK-LABEL: @sub_compare_folding_swapPD256( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> [[B:%.*]], <4 x double> [[A:%.*]], i32 5, i8 -1) -; CHECK-NEXT: ret i8 [[TMP0]] +; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> [[B:%.*]], <4 x double> [[A:%.*]], i32 5) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i1> [[TMP0]], <4 x i1> zeroinitializer, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i1> [[TMP1]] to i8 +; CHECK-NEXT: ret i8 [[TMP2]] ; entry: %sub.i = fsub ninf <4 x double> %a, %b - %0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> %sub.i, i32 5, i8 -1) - ret i8 %0 + %0 = call <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> %sub.i, i32 5) + %1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> + %2 = bitcast <8 x i1> %1 to i8 + ret i8 %2 } define i8 @sub_compare_folding_swapPD256_undef(<4 x double> %a, <4 x double> %b) { ; CHECK-LABEL: @sub_compare_folding_swapPD256_undef( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> fsub (<4 x double> undef, <4 x double> undef), <4 x double> zeroinitializer, i32 5, i8 -1) -; CHECK-NEXT: ret i8 [[TMP]] +; CHECK-NEXT: [[TMP:%.*]] = call <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> fsub (<4 x double> undef, <4 x double> undef), <4 x double> zeroinitializer, i32 5) +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i1> [[TMP]], <4 x i1> zeroinitializer, <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i1> [[TMP0]] to i8 +; CHECK-NEXT: ret i8 [[TMP1]] ; entry: %sub.i1 = fsub ninf <4 x double> undef, undef - %tmp = tail call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %sub.i1, <4 x double> zeroinitializer, i32 5, i8 -1) - ret i8 %tmp + %tmp = call <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %sub.i1, <4 x double> zeroinitializer, i32 5) + %0 = shufflevector <4 x i1> %tmp, <4 x i1> 
zeroinitializer, <8 x i32> + %1 = bitcast <8 x i1> %0 to i8 + ret i8 %1 } define i8 @sub_compare_folding_swapPD512(<8 x double> %a, <8 x double> %b){ ; CHECK-LABEL: @sub_compare_folding_swapPD512( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[B:%.*]], <8 x double> [[A:%.*]], i32 11, i8 -1, i32 4) -; CHECK-NEXT: ret i8 [[TMP0]] +; CHECK-NEXT: [[TMP0:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[B:%.*]], <8 x double> [[A:%.*]], i32 11, i32 4) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i1> [[TMP0]] to i8 +; CHECK-NEXT: ret i8 [[TMP1]] ; entry: %sub.i = fsub ninf <8 x double> %a, %b - %0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> zeroinitializer, <8 x double> %sub.i, i32 11, i8 -1, i32 4) - ret i8 %0 + %0 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> zeroinitializer, <8 x double> %sub.i, i32 11, i32 4) + %1 = bitcast <8 x i1> %0 to i8 + ret i8 %1 } define i8 @sub_compare_folding_swapPS128(<4 x float> %a, <4 x float> %b){ ; CHECK-LABEL: @sub_compare_folding_swapPS128( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> [[B:%.*]], <4 x float> [[A:%.*]], i32 12, i8 -1) -; CHECK-NEXT: ret i8 [[TMP0]] +; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> [[B:%.*]], <4 x float> [[A:%.*]], i32 12) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i1> [[TMP0]], <4 x i1> zeroinitializer, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i1> [[TMP1]] to i8 +; CHECK-NEXT: ret i8 [[TMP2]] ; entry: %sub.i = fsub ninf <4 x float> %a, %b - %0 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> zeroinitializer, <4 x float> %sub.i, i32 12, i8 -1) - ret i8 %0 + %0 = call <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> zeroinitializer, <4 x float> %sub.i, i32 12) + %1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> + %2 = bitcast <8 x i1> %1 to i8 + ret i8 %2 } define i8 @sub_compare_folding_swapPS256(<8 x float> %a, <8 x float> %b){ ; CHECK-LABEL: @sub_compare_folding_swapPS256( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> [[B:%.*]], <8 x float> [[A:%.*]], i32 5, i8 -1) -; CHECK-NEXT: ret i8 [[TMP0]] +; CHECK-NEXT: [[TMP0:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> [[B:%.*]], <8 x float> [[A:%.*]], i32 5) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i1> [[TMP0]] to i8 +; CHECK-NEXT: ret i8 [[TMP1]] ; entry: %sub.i = fsub ninf <8 x float> %a, %b - %0 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> %sub.i, i32 5, i8 -1) - ret i8 %0 + %0 = call <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> %sub.i, i32 5) + %1 = bitcast <8 x i1> %0 to i8 + ret i8 %1 } define i16 @sub_compare_folding_swapPS512(<16 x float> %a, <16 x float> %b){ ; CHECK-LABEL: @sub_compare_folding_swapPS512( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = tail call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> [[B:%.*]], <16 x float> [[A:%.*]], i32 11, i16 -1, i32 4) -; CHECK-NEXT: ret i16 [[TMP0]] +; CHECK-NEXT: [[TMP0:%.*]] = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> [[B:%.*]], <16 x float> [[A:%.*]], i32 11, i32 4) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i1> [[TMP0]] to i16 +; CHECK-NEXT: ret i16 [[TMP1]] ; entry: %sub.i = fsub ninf <16 x float> %a, %b - %0 = tail call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x 
float> zeroinitializer, <16 x float> %sub.i, i32 11, i16 -1, i32 4) - ret i16 %0 + %0 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> zeroinitializer, <16 x float> %sub.i, i32 11, i32 4) + %1 = bitcast <16 x i1> %0 to i16 + ret i16 %1 } -declare i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double>, <2 x double>, i32, i8) -declare i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double>, <4 x double>, i32, i8) -declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double>, <8 x double>, i32, i8, i32) -declare i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float>, <4 x float>, i32, i8) -declare i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float>, <8 x float>, i32, i8) -declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, i16, i32) +declare <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double>, <2 x double>, i32) +declare <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double>, <4 x double>, i32) +declare <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double>, <8 x double>, i32, i32) +declare <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float>, <4 x float>, i32) +declare <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float>, <8 x float>, i32) +declare <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, i32)
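
The sketch below is not part of the patch; it is a minimal, hypothetical illustration (the function @old_style_masked_cmp and the %m argument are invented names) of the IR the AutoUpgrade handler above consumes, with the conceptual post-upgrade form shown in comments. It follows the same bitcast/and pattern the updated tests check for when the old mask operand is not all-ones.

; Old-style module: the compare returns a scalar i16 and takes an explicit mask operand.
define i16 @old_style_masked_cmp(<16 x float> %a, <16 x float> %b, i16 %m) {
  %r = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i16 %m, i32 4)
  ret i16 %r
}
declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, i16, i32)

; After upgrade the call returns <16 x i1>, the rounding operand is kept, and the
; old mask operand is applied as an explicit 'and' on the bit vector:
;   %cmp    = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i32 4)
;   %maskv  = bitcast i16 %m to <16 x i1>
;   %masked = and <16 x i1> %cmp, %maskv
;   %res    = bitcast <16 x i1> %masked to i16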