Index: include/llvm/IR/IntrinsicsX86.td
===================================================================
--- include/llvm/IR/IntrinsicsX86.td
+++ include/llvm/IR/IntrinsicsX86.td
@@ -6386,29 +6386,24 @@
 // Misc.
 let TargetPrefix = "x86" in {
   def int_x86_avx512_mask_cmp_ps_512 :
-          GCCBuiltin<"__builtin_ia32_cmpps512_mask">,
-          Intrinsic<[llvm_i16_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
-                     llvm_i32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
+          Intrinsic<[llvm_v16i1_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
+                     llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_cmp_pd_512 :
-          GCCBuiltin<"__builtin_ia32_cmppd512_mask">,
-          Intrinsic<[llvm_i8_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
-                     llvm_i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
+          Intrinsic<[llvm_v8i1_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
+                     llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_cmp_ps_256 :
-          GCCBuiltin<"__builtin_ia32_cmpps256_mask">,
-          Intrinsic<[llvm_i8_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
-                     llvm_i32_ty, llvm_i8_ty], [IntrNoMem]>;
+          Intrinsic<[llvm_v8i1_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
+                     llvm_i32_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_cmp_pd_256 :
-          GCCBuiltin<"__builtin_ia32_cmppd256_mask">,
-          Intrinsic<[llvm_i8_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
-                     llvm_i32_ty, llvm_i8_ty], [IntrNoMem]>;
+          Intrinsic<[llvm_v4i1_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
+                     llvm_i32_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_cmp_ps_128 :
-          GCCBuiltin<"__builtin_ia32_cmpps128_mask">,
-          Intrinsic<[llvm_i8_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
-                     llvm_i32_ty, llvm_i8_ty], [IntrNoMem]>;
+          Intrinsic<[llvm_v4i1_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
+                     llvm_i32_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_cmp_pd_128 :
-          GCCBuiltin<"__builtin_ia32_cmppd128_mask">,
-          Intrinsic<[llvm_i8_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
-                     llvm_i32_ty, llvm_i8_ty], [IntrNoMem]>;
+          Intrinsic<[llvm_v2i1_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
+                     llvm_i32_ty], [IntrNoMem]>;
+
   def int_x86_avx512_mask_cmp_ss :
         GCCBuiltin<"__builtin_ia32_cmpss_mask">,
         Intrinsic<[llvm_i8_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
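For reference, the prototype change reads as follows in IR. This is only a side-by-side
sketch assembled from the declare lists updated at the end of this patch; the old forms
keep working through the AutoUpgrade changes below.

    ; Old prototypes: scalar integer result plus an explicit mask operand
    ; (the 512-bit forms also carry a trailing rounding/sae operand).
    declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, i16, i32)
    declare i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double>, <2 x double>, i32, i8)

    ; New prototypes: vXi1 result, no mask operand; only the 512-bit forms
    ; keep the sae operand.
    declare <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, i32)
    declare <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double>, <2 x double>, i32)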
Index: lib/IR/AutoUpgrade.cpp
===================================================================
--- lib/IR/AutoUpgrade.cpp
+++ lib/IR/AutoUpgrade.cpp
@@ -65,6 +65,19 @@
   return true;
 }
 
+// Upgrade the declaration of fp compare intrinsics that change return type
+// from scalar to vXi1 mask.
+static bool UpgradeX86MaskedFPCompare(Function *F, Intrinsic::ID IID,
+                                      Function *&NewFn) {
+  // Check if the return type is a vector.
+  if (F->getReturnType()->isVectorTy())
+    return false;
+
+  rename(F);
+  NewFn = Intrinsic::getDeclaration(F->getParent(), IID);
+  return true;
+}
+
 static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
   // All of the intrinsics matches below should be marked with which llvm
   // version started autoupgrading them. At some point in the future we would
@@ -322,6 +335,24 @@
   if (Name == "avx2.mpsadbw") // Added in 3.6
     return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_avx2_mpsadbw,
                                             NewFn);
+  if (Name == "avx512.mask.cmp.pd.128") // Added in 7.0
+    return UpgradeX86MaskedFPCompare(F, Intrinsic::x86_avx512_mask_cmp_pd_128,
+                                     NewFn);
+  if (Name == "avx512.mask.cmp.pd.256") // Added in 7.0
+    return UpgradeX86MaskedFPCompare(F, Intrinsic::x86_avx512_mask_cmp_pd_256,
+                                     NewFn);
+  if (Name == "avx512.mask.cmp.pd.512") // Added in 7.0
+    return UpgradeX86MaskedFPCompare(F, Intrinsic::x86_avx512_mask_cmp_pd_512,
+                                     NewFn);
+  if (Name == "avx512.mask.cmp.ps.128") // Added in 7.0
+    return UpgradeX86MaskedFPCompare(F, Intrinsic::x86_avx512_mask_cmp_ps_128,
+                                     NewFn);
+  if (Name == "avx512.mask.cmp.ps.256") // Added in 7.0
+    return UpgradeX86MaskedFPCompare(F, Intrinsic::x86_avx512_mask_cmp_ps_256,
+                                     NewFn);
+  if (Name == "avx512.mask.cmp.ps.512") // Added in 7.0
+    return UpgradeX86MaskedFPCompare(F, Intrinsic::x86_avx512_mask_cmp_ps_512,
+                                     NewFn);
 
   // frcz.ss/sd may need to have an argument dropped. Added in 3.2
   if (Name.startswith("xop.vfrcz.ss") && F->arg_size() == 2) {
@@ -2417,6 +2448,34 @@
     break;
   }
 
+  case Intrinsic::x86_avx512_mask_cmp_pd_128:
+  case Intrinsic::x86_avx512_mask_cmp_pd_256:
+  case Intrinsic::x86_avx512_mask_cmp_pd_512:
+  case Intrinsic::x86_avx512_mask_cmp_ps_128:
+  case Intrinsic::x86_avx512_mask_cmp_ps_256:
+  case Intrinsic::x86_avx512_mask_cmp_ps_512: {
+    SmallVector<Value *, 4> Args;
+    Args.push_back(CI->getArgOperand(0));
+    Args.push_back(CI->getArgOperand(1));
+    Args.push_back(CI->getArgOperand(2));
+    if (CI->getNumArgOperands() == 5)
+      Args.push_back(CI->getArgOperand(4));
+
+    NewCall = Builder.CreateCall(NewFn, Args);
+    unsigned NumElts = Args[0]->getType()->getVectorNumElements();
+    Value *Res = ApplyX86MaskOn1BitsVec(Builder, NewCall, CI->getArgOperand(3),
+                                        NumElts);
+
+    std::string Name = CI->getName();
+    if (!Name.empty()) {
+      CI->setName(Name + ".old");
+      NewCall->setName(Name);
+    }
+    CI->replaceAllUsesWith(Res);
+    CI->eraseFromParent();
+    return;
+  }
+
   case Intrinsic::thread_pointer: {
     NewCall = Builder.CreateCall(NewFn, {});
     break;
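A minimal sketch of the rewrite the upgrade code above performs, using the 512-bit pd
form with an all-ones mask (the shape the InstCombine test below checks; %a and %b are
placeholder values). When the mask operand is not all-ones, ApplyX86MaskOn1BitsVec
additionally folds it in, which shows up as the andl/andb instructions in the codegen
tests below.

    ; Old-style call found in bitcode:
    ;   %res = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 11, i8 -1, i32 4)
    ; Upgraded form: the mask operand is dropped and the scalar result is
    ; rebuilt from the vXi1 return value.
    %cmp = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 11, i32 4)
    %res = bitcast <8 x i1> %cmp to i8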
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -20309,8 +20309,7 @@
       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, FPclassMask,
                          DAG.getIntPtrConstant(0, dl));
     }
-    case CMP_MASK:
-    case CMP_MASK_CC: {
+    case CMP_MASK: {
       // Comparison intrinsics with masks.
       // Example of transformation:
       // (i8 (int_x86_avx512_mask_pcmpeq_q_128
@@ -20325,29 +20324,8 @@
       SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
       MVT BitcastVT = MVT::getVectorVT(MVT::i1,
                                        Mask.getSimpleValueType().getSizeInBits());
-      SDValue Cmp;
-      if (IntrData->Type == CMP_MASK_CC) {
-        SDValue CC = Op.getOperand(3);
-        CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
-        // We specify 2 possible opcodes for intrinsics with rounding modes.
-        // First, we check if the intrinsic may have non-default rounding mode,
-        // (IntrData->Opc1 != 0), then we check the rounding mode operand.
-        if (IntrData->Opc1 != 0) {
-          SDValue Rnd = Op.getOperand(5);
-          if (!isRoundModeCurDirection(Rnd))
-            Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
-                              Op.getOperand(2), CC, Rnd);
-        }
-        //default rounding mode
-        if(!Cmp.getNode())
-            Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
-                              Op.getOperand(2), CC);
-
-      } else {
-        assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
-        Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
-                          Op.getOperand(2));
-      }
+      SDValue Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
+                                Op.getOperand(2));
       SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, SDValue(),
                                              Subtarget, DAG);
       SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
@@ -20355,6 +20333,29 @@
                                 DAG.getIntPtrConstant(0, dl));
       return DAG.getBitcast(Op.getValueType(), Res);
     }
+
+    case CMP_MASK_CC: {
+      MVT VT = Op.getOperand(1).getSimpleValueType();
+      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+      SDValue Cmp;
+      SDValue CC = Op.getOperand(3);
+      CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
+      // We specify 2 possible opcodes for intrinsics with rounding modes.
+      // First, we check if the intrinsic may have non-default rounding mode,
+      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+      if (IntrData->Opc1 != 0) {
+        SDValue Rnd = Op.getOperand(4);
+        if (!isRoundModeCurDirection(Rnd))
+          Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
+                            Op.getOperand(2), CC, Rnd);
+      }
+      //default rounding mode
+      if (!Cmp.getNode())
+        Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
+                          Op.getOperand(2), CC);
+
+      return Cmp;
+    }
     case CMP_MASK_SCALAR_CC: {
       SDValue Src1 = Op.getOperand(1);
       SDValue Src2 = Op.getOperand(2);
Index: test/CodeGen/X86/avx512-cmp-kor-sequence.ll
===================================================================
--- test/CodeGen/X86/avx512-cmp-kor-sequence.ll
+++ test/CodeGen/X86/avx512-cmp-kor-sequence.ll
@@ -12,16 +12,13 @@
 ; CHECK-LABEL: cmp_kor_seq_16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vcmpgeps %zmm4, %zmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %ecx
-; CHECK-NEXT:    vcmpgeps %zmm4, %zmm1, %k0
-; CHECK-NEXT:    kmovw %k0, %edx
-; CHECK-NEXT:    vcmpgeps %zmm4, %zmm2, %k0
-; CHECK-NEXT:    kmovw %k0, %esi
-; CHECK-NEXT:    vcmpgeps %zmm4, %zmm3, %k0
+; CHECK-NEXT:    vcmpgeps %zmm4, %zmm1, %k1
+; CHECK-NEXT:    korw %k1, %k0, %k0
+; CHECK-NEXT:    vcmpgeps %zmm4, %zmm2, %k1
+; CHECK-NEXT:    vcmpgeps %zmm4, %zmm3, %k2
+; CHECK-NEXT:    korw %k2, %k1, %k1
+; CHECK-NEXT:    korw %k1, %k0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    orl %ecx, %edx
-; CHECK-NEXT:    orl %esi, %eax
-; CHECK-NEXT:    orl %edx, %eax
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
 entry:
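The diff above shows the practical effect on cmp_kor_seq_16: since the compares now
produce vXi1 values, the ors of their results are selected as korw on mask registers
instead of being moved to GPRs first. A rough sketch of the IR shape involved, where
i32 13 is assumed to be the _CMP_GE_OS predicate and i32 4 the default rounding/sae
setting:

    %m0 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %x, i32 13, i32 4)
    %m1 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %b, <16 x float> %x, i32 13, i32 4)
    %or = or <16 x i1> %m0, %m1        ; selected as korw %k1, %k0, %k0
    %r  = bitcast <16 x i1> %or to i16 ; a single kmovw at the end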
Index: test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
===================================================================
--- test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
+++ test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
@@ -23269,17 +23269,17 @@
 define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_sae_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
 ; VLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_sae_mask:
 ; VLX:       # %bb.0: # %entry
-; VLX-NEXT:    kmovd %edi, %k1
-; VLX-NEXT:    vcmpleps {sae}, %zmm1, %zmm0, %k0 {%k1}
-; VLX-NEXT:    kmovw %k0, %eax
+; VLX-NEXT:    vcmpleps {sae}, %zmm1, %zmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    andl %edi, %eax
 ; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_sae_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vcmpleps {sae}, %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vcmpleps {sae}, %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -23480,19 +23480,17 @@
 define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_sae_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
 ; VLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_sae_mask:
 ; VLX:       # %bb.0: # %entry
-; VLX-NEXT:    kmovd %edi, %k1
-; VLX-NEXT:    vcmpleps {sae}, %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT:    vcmpleps {sae}, %zmm1, %zmm0, %k0
 ; VLX-NEXT:    kmovd %k0, %eax
-; VLX-NEXT:    movzwl %ax, %eax
+; VLX-NEXT:    andl %edi, %eax
 ; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_sae_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vcmpleps {sae}, %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vcmpleps {sae}, %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    movzwl %ax, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -25385,9 +25383,9 @@
 define zeroext i16 @test_masked_vcmpoeqpd_v8i1_v16i1_sae_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
 ; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v16i1_sae_mask:
 ; VLX:       # %bb.0: # %entry
-; VLX-NEXT:    kmovd %edi, %k1
-; VLX-NEXT:    vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT:    vcmplepd {sae}, %zmm1, %zmm0, %k0
 ; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    andb %dil, %al
 ; VLX-NEXT:    movzbl %al, %eax
 ; VLX-NEXT:    # kill: def $ax killed $ax killed $eax
 ; VLX-NEXT:    vzeroupper
@@ -25395,9 +25393,9 @@
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v16i1_sae_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vcmplepd {sae}, %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andb %dil, %al
 ; NoVLX-NEXT:    movzbl %al, %eax
 ; NoVLX-NEXT:    # kill: def $ax killed $ax killed $eax
 ; NoVLX-NEXT:    vzeroupper
@@ -25596,17 +25594,18 @@
 define zeroext i32 @test_masked_vcmpoeqpd_v8i1_v32i1_sae_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
 ; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_sae_mask:
 ; VLX:       # %bb.0: # %entry
-; VLX-NEXT:    kmovd %edi, %k1
-; VLX-NEXT:    vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1}
-; VLX-NEXT:    kmovb %k0, %eax
+; VLX-NEXT:    vcmplepd {sae}, %zmm1, %zmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    andb %dil, %al
+; VLX-NEXT:    movzbl %al, %eax
 ; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_sae_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vcmplepd {sae}, %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andb %dil, %al
 ; NoVLX-NEXT:    movzbl %al, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
@@ -25811,18 +25810,18 @@
 define zeroext i64 @test_masked_vcmpoeqpd_v8i1_v64i1_sae_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
 ; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_sae_mask:
 ; VLX:       # %bb.0: # %entry
-; VLX-NEXT:    kmovd %edi, %k1
-; VLX-NEXT:    vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT:    vcmplepd {sae}, %zmm1, %zmm0, %k0
 ; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    andb %dil, %al
 ; VLX-NEXT:    movzbl %al, %eax
 ; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_sae_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vcmplepd {sae}, %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andb %dil, %al
 ; NoVLX-NEXT:    movzbl %al, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
Index: test/Transforms/InstCombine/X86/X86FsubCmpCombine.ll
===================================================================
--- test/Transforms/InstCombine/X86/X86FsubCmpCombine.ll
+++ test/Transforms/InstCombine/X86/X86FsubCmpCombine.ll
@@ -7,91 +7,113 @@
 ; CHECK-LABEL: @sub_compare_foldingPD128_safe(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[SUB_SAFE:%.*]] = fsub <2 x double> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> [[SUB_SAFE]], <2 x double> zeroinitializer, i32 5, i8 -1)
-; CHECK-NEXT:    ret i8 [[TMP0]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> [[SUB_SAFE]], <2 x double> zeroinitializer, i32 5)
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i1> [[TMP0]], <2 x i1> zeroinitializer, <8 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i1> [[TMP1]] to i8
+; CHECK-NEXT:    ret i8 [[TMP2]]
 ;
 entry:
   %sub.safe = fsub <2 x double> %a, %b
-  %0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %sub.safe , <2 x double> zeroinitializer, i32 5, i8 -1)
-  ret i8 %0
+  %0 = call <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %sub.safe, <2 x double> zeroinitializer, i32 5)
+  %1 = shufflevector <2 x i1> %0, <2 x i1> zeroinitializer, <8 x i32>
+  %2 = bitcast <8 x i1> %1 to i8
+  ret i8 %2
 }
 
 define i8 @sub_compare_foldingPD128(<2 x double> %a, <2 x double> %b){
 ; CHECK-LABEL: @sub_compare_foldingPD128(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], i32 5, i8 -1)
-; CHECK-NEXT:    ret i8 [[TMP0]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], i32 5)
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i1> [[TMP0]], <2 x i1> zeroinitializer, <8 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i1> [[TMP1]] to i8
+; CHECK-NEXT:    ret i8 [[TMP2]]
 ;
 entry:
   %sub.i = fsub ninf <2 x double> %a, %b
-  %0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %sub.i , <2 x double> zeroinitializer, i32 5, i8 -1)
-  ret i8 %0
+  %0 = call <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %sub.i, <2 x double> zeroinitializer, i32 5)
+  %1 = shufflevector <2 x i1> %0, <2 x i1> zeroinitializer, <8 x i32>
+  %2 = bitcast <8 x i1> %1 to i8
+  ret i8 %2
 }
 
 define i8 @sub_compare_foldingPD256(<4 x double> %a, <4 x double> %b){
 ; CHECK-LABEL: @sub_compare_foldingPD256(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> [[A:%.*]], <4 x double> [[B:%.*]], i32 5, i8 -1)
-; CHECK-NEXT:    ret i8 [[TMP0]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> [[A:%.*]], <4 x double> [[B:%.*]], i32 5)
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i1> [[TMP0]], <4 x i1> zeroinitializer, <8 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i1> [[TMP1]] to i8
+; CHECK-NEXT:    ret i8 [[TMP2]]
 ;
 entry:
   %sub.i1 = fsub ninf <4 x double> %a, %b
-  %0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %sub.i1, <4 x double> zeroinitializer, i32 5, i8 -1)
-  ret i8 %0
+  %0 = call <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %sub.i1, <4 x i1> zeroinitializer, i32 5)
+  %1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32>
+  %2 = bitcast <8 x i1> %1 to i8
+  ret i8 %2
 }
 
 define i8 @sub_compare_foldingPD512(<8 x double> %a, <8 x double> %b){
 ; CHECK-LABEL: @sub_compare_foldingPD512(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 11, i8 -1, i32 4)
-; CHECK-NEXT:    ret i8 [[TMP0]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 11, i32 4)
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i1> [[TMP0]] to i8
+; CHECK-NEXT:    ret i8 [[TMP1]]
 ;
 entry:
   %sub.i2 = fsub ninf <8 x double> %a, %b
-  %0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %sub.i2, <8 x double> zeroinitializer, i32 11, i8 -1, i32 4)
-  ret i8 %0
+  %0 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %sub.i2, <8 x double> zeroinitializer, i32 11, i32 4)
+  %1 = bitcast <8 x i1> %0 to i8
+  ret i8 %1
 }
 
 define i8 @sub_compare_foldingPS128(<4 x float> %a, <4 x float> %b){
 ; CHECK-LABEL: @sub_compare_foldingPS128(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i32 12, i8 -1)
-; CHECK-NEXT:    ret i8 [[TMP0]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i32 12)
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i1> [[TMP0]], <4 x i1> zeroinitializer, <8 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i1> [[TMP1]] to i8
+; CHECK-NEXT:    ret i8 [[TMP2]]
 ;
 entry:
   %sub.i3 = fsub ninf <4 x float> %a, %b
-  %0 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %sub.i3, <4 x float> zeroinitializer, i32 12, i8 -1)
-  ret i8 %0
+  %0 = call <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %sub.i3, <4 x float> zeroinitializer, i32 12)
+  %1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32>
+  %2 = bitcast <8 x i1> %1 to i8
+  ret i8 %2
 }
 
 define i8 @sub_compare_foldingPS256(<8 x float> %a, <8 x float> %b){
 ; CHECK-LABEL: @sub_compare_foldingPS256(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> [[A:%.*]], <8 x float> [[B:%.*]], i32 5, i8 -1)
-; CHECK-NEXT:    ret i8 [[TMP0]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> [[A:%.*]], <8 x float> [[B:%.*]], i32 5)
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i1> [[TMP0]] to i8
+; CHECK-NEXT:    ret i8 [[TMP1]]
 ;
 entry:
   %sub.i4 = fsub ninf <8 x float> %a, %b
-  %0 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %sub.i4, <8 x float> zeroinitializer, i32 5, i8 -1)
-  ret i8 %0
+  %0 = call <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %sub.i4, <8 x float> zeroinitializer, i32 5)
+  %1 = bitcast <8 x i1> %0 to i8
+  ret i8 %1
 }
 
 define i16 @sub_compare_foldingPS512(<16 x float> %a, <16 x float> %b){
 ; CHECK-LABEL: @sub_compare_foldingPS512(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = tail call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 11, i16 -1, i32 4)
-; CHECK-NEXT:    ret i16 [[TMP0]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 11, i32 4)
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i1> [[TMP0]] to i16
+; CHECK-NEXT:    ret i16 [[TMP1]]
 ;
 entry:
   %sub.i5 = fsub ninf <16 x float> %a, %b
-  %0 = tail call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %sub.i5, <16 x float> zeroinitializer, i32 11, i16 -1, i32 4)
-  ret i16 %0
+  %0 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %sub.i5, <16 x float> zeroinitializer, i32 11, i32 4)
+  %1 = bitcast <16 x i1> %0 to i16
+  ret i16 %1
 }
@@ -99,96 +121,118 @@
 define i8 @sub_compare_folding_swapPD128(<2 x double> %a, <2 x double> %b){
 ; CHECK-LABEL: @sub_compare_folding_swapPD128(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> [[B:%.*]], <2 x double> [[A:%.*]], i32 5, i8 -1)
-; CHECK-NEXT:    ret i8 [[TMP0]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> [[B:%.*]], <2 x double> [[A:%.*]], i32 5)
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i1> [[TMP0]], <2 x i1> zeroinitializer, <8 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i1> [[TMP1]] to i8
+; CHECK-NEXT:    ret i8 [[TMP2]]
 ;
 entry:
   %sub.i = fsub ninf <2 x double> %a, %b
-  %0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> zeroinitializer, <2 x double> %sub.i, i32 5, i8 -1)
-  ret i8 %0
+  %0 = call <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> zeroinitializer, <2 x double> %sub.i, i32 5)
+  %1 = shufflevector <2 x i1> %0, <2 x i1> zeroinitializer, <8 x i32>
+  %2 = bitcast <8 x i1> %1 to i8
+  ret i8 %2
 }
 
 define i8 @sub_compare_folding_swapPD256(<4 x double> %a, <4 x double> %b){
 ; CHECK-LABEL: @sub_compare_folding_swapPD256(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> [[B:%.*]], <4 x double> [[A:%.*]], i32 5, i8 -1)
-; CHECK-NEXT:    ret i8 [[TMP0]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> [[B:%.*]], <4 x double> [[A:%.*]], i32 5)
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i1> [[TMP0]], <4 x i1> zeroinitializer, <8 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i1> [[TMP1]] to i8
+; CHECK-NEXT:    ret i8 [[TMP2]]
 ;
 entry:
   %sub.i = fsub ninf <4 x double> %a, %b
-  %0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> %sub.i, i32 5, i8 -1)
-  ret i8 %0
+  %0 = call <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> %sub.i, i32 5)
+  %1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32>
+  %2 = bitcast <8 x i1> %1 to i8
+  ret i8 %2
 }
 
 define i8 @sub_compare_folding_swapPD256_undef(<4 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: @sub_compare_folding_swapPD256_undef(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> fsub (<4 x double> undef, <4 x double> undef), <4 x double> zeroinitializer, i32 5, i8 -1)
-; CHECK-NEXT:    ret i8 [[TMP]]
+; CHECK-NEXT:    [[TMP:%.*]] = call <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> fsub (<4 x double> undef, <4 x double> undef), <4 x double> zeroinitializer, i32 5)
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i1> [[TMP]], <4 x i1> zeroinitializer, <8 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i1> [[TMP0]] to i8
+; CHECK-NEXT:    ret i8 [[TMP1]]
 ;
 entry:
   %sub.i1 = fsub ninf <4 x double> undef, undef
-  %tmp = tail call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %sub.i1, <4 x double> zeroinitializer, i32 5, i8 -1)
-  ret i8 %tmp
+  %tmp = call <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %sub.i1, <4 x double> zeroinitializer, i32 5)
+  %0 = shufflevector <4 x i1> %tmp, <4 x i1> zeroinitializer, <8 x i32>
+  %1 = bitcast <8 x i1> %0 to i8
+  ret i8 %1
 }
 
 define i8 @sub_compare_folding_swapPD512(<8 x double> %a, <8 x double> %b){
 ; CHECK-LABEL: @sub_compare_folding_swapPD512(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[B:%.*]], <8 x double> [[A:%.*]], i32 11, i8 -1, i32 4)
-; CHECK-NEXT:    ret i8 [[TMP0]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[B:%.*]], <8 x double> [[A:%.*]], i32 11, i32 4)
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i1> [[TMP0]] to i8
+; CHECK-NEXT:    ret i8 [[TMP1]]
 ;
 entry:
   %sub.i = fsub ninf <8 x double> %a, %b
-  %0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> zeroinitializer, <8 x double> %sub.i, i32 11, i8 -1, i32 4)
-  ret i8 %0
+  %0 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> zeroinitializer, <8 x double> %sub.i, i32 11, i32 4)
+  %1 = bitcast <8 x i1> %0 to i8
+  ret i8 %1
 }
 
 define i8 @sub_compare_folding_swapPS128(<4 x float> %a, <4 x float> %b){
 ; CHECK-LABEL: @sub_compare_folding_swapPS128(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> [[B:%.*]], <4 x float> [[A:%.*]], i32 12, i8 -1)
-; CHECK-NEXT:    ret i8 [[TMP0]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> [[B:%.*]], <4 x float> [[A:%.*]], i32 12)
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i1> [[TMP0]], <4 x i1> zeroinitializer, <8 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i1> [[TMP1]] to i8
+; CHECK-NEXT:    ret i8 [[TMP2]]
 ;
 entry:
   %sub.i = fsub ninf <4 x float> %a, %b
-  %0 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> zeroinitializer, <4 x float> %sub.i, i32 12, i8 -1)
-  ret i8 %0
+  %0 = call <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> zeroinitializer, <4 x float> %sub.i, i32 12)
+  %1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32>
+  %2 = bitcast <8 x i1> %1 to i8
+  ret i8 %2
 }
 
 define i8 @sub_compare_folding_swapPS256(<8 x float> %a, <8 x float> %b){
 ; CHECK-LABEL: @sub_compare_folding_swapPS256(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> [[B:%.*]], <8 x float> [[A:%.*]], i32 5, i8 -1)
-; CHECK-NEXT:    ret i8 [[TMP0]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> [[B:%.*]], <8 x float> [[A:%.*]], i32 5)
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i1> [[TMP0]] to i8
+; CHECK-NEXT:    ret i8 [[TMP1]]
 ;
 entry:
   %sub.i = fsub ninf <8 x float> %a, %b
-  %0 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> %sub.i, i32 5, i8 -1)
-  ret i8 %0
+  %0 = call <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> %sub.i, i32 5)
+  %1 = bitcast <8 x i1> %0 to i8
+  ret i8 %1
 }
 
 define i16 @sub_compare_folding_swapPS512(<16 x float> %a, <16 x float> %b){
 ; CHECK-LABEL: @sub_compare_folding_swapPS512(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = tail call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> [[B:%.*]], <16 x float> [[A:%.*]], i32 11, i16 -1, i32 4)
-; CHECK-NEXT:    ret i16 [[TMP0]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> [[B:%.*]], <16 x float> [[A:%.*]], i32 11, i32 4)
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i1> [[TMP0]] to i16
+; CHECK-NEXT:    ret i16 [[TMP1]]
 ;
 entry:
   %sub.i = fsub ninf <16 x float> %a, %b
-  %0 = tail call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> zeroinitializer, <16 x float> %sub.i, i32 11, i16 -1, i32 4)
-  ret i16 %0
+  %0 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> zeroinitializer, <16 x float> %sub.i, i32 11, i32 4)
+  %1 = bitcast <16 x i1> %0 to i16
+  ret i16 %1
 }
 
-declare i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double>, <2 x double>, i32, i8)
-declare i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double>, <4 x double>, i32, i8)
-declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double>, <8 x double>, i32, i8, i32)
-declare i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float>, <4 x float>, i32, i8)
-declare i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float>, <8 x float>, i32, i8)
-declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, i16, i32)
+declare <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double>, <2 x double>, i32)
+declare <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double>, <4 x double>, i32)
+declare <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double>, <8 x double>, i32, i32)
+declare <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float>, <4 x float>, i32)
+declare <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float>, <8 x float>, i32)
+declare <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, i32)
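Finally, a self-contained usage sketch of the new form; per the sae tests above it
should lower to a single vcmplepd {sae} into a mask register followed by a kmovw. The
function name is made up for illustration, i32 2 is assumed to be the _CMP_LE_OS
predicate, and i32 8 is assumed to request suppress-all-exceptions ({sae}):

    define i8 @cmple_pd_sae(<8 x double> %a, <8 x double> %b) {
    entry:
      ; compare under {sae}, result kept as an <8 x i1> mask
      %m = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 2, i32 8)
      %r = bitcast <8 x i1> %m to i8
      ret i8 %r
    }

    declare <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double>, <8 x double>, i32, i32)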