Index: include/llvm/IR/IntrinsicsX86.td
===================================================================
--- include/llvm/IR/IntrinsicsX86.td
+++ include/llvm/IR/IntrinsicsX86.td
@@ -6066,77 +6066,41 @@
 // Bitwise ternary logic
 let TargetPrefix = "x86" in {
-  def int_x86_avx512_mask_pternlog_d_128 :
-      GCCBuiltin<"__builtin_ia32_pternlogd128_mask">,
+  def int_x86_avx512_pternlog_d_128 :
+      GCCBuiltin<"__builtin_ia32_pternlogd128">,
       Intrinsic<[llvm_v4i32_ty],
-                [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty,
-                 llvm_i8_ty], [IntrNoMem]>;
-
-  def int_x86_avx512_maskz_pternlog_d_128 :
-      GCCBuiltin<"__builtin_ia32_pternlogd128_maskz">,
-      Intrinsic<[llvm_v4i32_ty],
-                [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty,
-                 llvm_i8_ty], [IntrNoMem]>;
-
-  def int_x86_avx512_mask_pternlog_d_256 :
-      GCCBuiltin<"__builtin_ia32_pternlogd256_mask">,
-      Intrinsic<[llvm_v8i32_ty],
-                [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty,
-                 llvm_i8_ty], [IntrNoMem]>;
+                [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty],
+                [IntrNoMem]>;
 
-  def int_x86_avx512_maskz_pternlog_d_256 :
-      GCCBuiltin<"__builtin_ia32_pternlogd256_maskz">,
+  def int_x86_avx512_pternlog_d_256 :
+      GCCBuiltin<"__builtin_ia32_pternlogd256">,
       Intrinsic<[llvm_v8i32_ty],
-                [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty,
-                 llvm_i8_ty], [IntrNoMem]>;
-
-  def int_x86_avx512_mask_pternlog_d_512 :
-      GCCBuiltin<"__builtin_ia32_pternlogd512_mask">,
-      Intrinsic<[llvm_v16i32_ty],
-                [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty,
-                 llvm_i16_ty], [IntrNoMem]>;
+                [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty],
+                [IntrNoMem]>;
 
-  def int_x86_avx512_maskz_pternlog_d_512 :
-      GCCBuiltin<"__builtin_ia32_pternlogd512_maskz">,
+  def int_x86_avx512_pternlog_d_512 :
+      GCCBuiltin<"__builtin_ia32_pternlogd512">,
       Intrinsic<[llvm_v16i32_ty],
-                [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty,
-                 llvm_i16_ty], [IntrNoMem]>;
+                [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty,
+                 llvm_i32_ty], [IntrNoMem]>;
 
-  def int_x86_avx512_mask_pternlog_q_128 :
-      GCCBuiltin<"__builtin_ia32_pternlogq128_mask">,
+  def int_x86_avx512_pternlog_q_128 :
+      GCCBuiltin<"__builtin_ia32_pternlogq128">,
       Intrinsic<[llvm_v2i64_ty],
-                [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty,
-                 llvm_i8_ty], [IntrNoMem]>;
-
-  def int_x86_avx512_maskz_pternlog_q_128 :
-      GCCBuiltin<"__builtin_ia32_pternlogq128_maskz">,
-      Intrinsic<[llvm_v2i64_ty],
-                [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty,
-                 llvm_i8_ty], [IntrNoMem]>;
-
-  def int_x86_avx512_mask_pternlog_q_256 :
-      GCCBuiltin<"__builtin_ia32_pternlogq256_mask">,
-      Intrinsic<[llvm_v4i64_ty],
-                [llvm_v4i64_ty, llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty,
-                 llvm_i8_ty], [IntrNoMem]>;
+                [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty],
+                [IntrNoMem]>;
 
-  def int_x86_avx512_maskz_pternlog_q_256 :
-      GCCBuiltin<"__builtin_ia32_pternlogq256_maskz">,
+  def int_x86_avx512_pternlog_q_256 :
+      GCCBuiltin<"__builtin_ia32_pternlogq256">,
       Intrinsic<[llvm_v4i64_ty],
-                [llvm_v4i64_ty, llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty,
-                 llvm_i8_ty], [IntrNoMem]>;
-
-  def int_x86_avx512_mask_pternlog_q_512 :
-      GCCBuiltin<"__builtin_ia32_pternlogq512_mask">,
-      Intrinsic<[llvm_v8i64_ty],
-                [llvm_v8i64_ty, llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty,
-                 llvm_i8_ty], [IntrNoMem]>;
+                [llvm_v4i64_ty, llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty],
+                [IntrNoMem]>;
 
-  def int_x86_avx512_maskz_pternlog_q_512 :
-      GCCBuiltin<"__builtin_ia32_pternlogq512_maskz">,
+  def int_x86_avx512_pternlog_q_512 :
+      GCCBuiltin<"__builtin_ia32_pternlogq512">,
       Intrinsic<[llvm_v8i64_ty],
-                [llvm_v8i64_ty, llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty,
-                 llvm_i8_ty], [IntrNoMem]>;
+                [llvm_v8i64_ty, llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty],
+                [IntrNoMem]>;
 }
 
 // Misc.

Index: lib/IR/AutoUpgrade.cpp
===================================================================
--- lib/IR/AutoUpgrade.cpp
+++ lib/IR/AutoUpgrade.cpp
@@ -257,6 +257,8 @@
       Name.startswith("avx512.mask.pmovsx") || // Added in 4.0
       Name.startswith("avx512.mask.pmovzx") || // Added in 4.0
       Name.startswith("avx512.mask.lzcnt.") || // Added in 5.0
+      Name.startswith("avx512.mask.pternlog.") || // Added in 7.0
+      Name.startswith("avx512.maskz.pternlog.") || // Added in 7.0
       Name == "sse.cvtsi2ss" || // Added in 7.0
       Name == "sse.cvtsi642ss" || // Added in 7.0
       Name == "sse2.cvtsi2sd" || // Added in 7.0
@@ -2530,6 +2532,34 @@
       Value *Ops[] = { Arg0, Arg1, Arg2 };
       Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID),
                                Ops);
+    } else if (IsX86 && (Name.startswith("avx512.mask.pternlog.") ||
+                         Name.startswith("avx512.maskz.pternlog."))) {
+      bool ZeroMask = Name[11] == 'z';
+      unsigned VecWidth = CI->getType()->getPrimitiveSizeInBits();
+      unsigned EltWidth = CI->getType()->getScalarSizeInBits();
+      Intrinsic::ID IID;
+      if (VecWidth == 128 && EltWidth == 32)
+        IID = Intrinsic::x86_avx512_pternlog_d_128;
+      else if (VecWidth == 256 && EltWidth == 32)
+        IID = Intrinsic::x86_avx512_pternlog_d_256;
+      else if (VecWidth == 512 && EltWidth == 32)
+        IID = Intrinsic::x86_avx512_pternlog_d_512;
+      else if (VecWidth == 128 && EltWidth == 64)
+        IID = Intrinsic::x86_avx512_pternlog_q_128;
+      else if (VecWidth == 256 && EltWidth == 64)
+        IID = Intrinsic::x86_avx512_pternlog_q_256;
+      else if (VecWidth == 512 && EltWidth == 64)
+        IID = Intrinsic::x86_avx512_pternlog_q_512;
+      else
+        llvm_unreachable("Unexpected intrinsic");
+
+      Value *Args[] = { CI->getArgOperand(0), CI->getArgOperand(1),
+                        CI->getArgOperand(2), CI->getArgOperand(3) };
+      Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID),
+                               Args);
+      Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType())
+                                 : CI->getArgOperand(0);
+      Rep = EmitX86Select(Builder, CI->getArgOperand(4), Rep, PassThru);
     } else if (IsX86 && Name.startswith("avx512.mask.") &&
                upgradeAVX512MaskToSelect(Name, Builder, *CI, Rep)) {
       // Rep will be updated by the call in the condition.

Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -20627,23 +20627,6 @@
                                               Src2, Src3, Src1),
                                   Mask, PassThru, Subtarget, DAG);
     }
-    case TERLOG_OP_MASK:
-    case TERLOG_OP_MASKZ: {
-      SDValue Src1 = Op.getOperand(1);
-      SDValue Src2 = Op.getOperand(2);
-      SDValue Src3 = Op.getOperand(3);
-      SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
-      SDValue Mask = Op.getOperand(5);
-      MVT VT = Op.getSimpleValueType();
-      SDValue PassThru = Src1;
-      // Set PassThru element.
-      if (IntrData->Type == TERLOG_OP_MASKZ)
-        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
-
-      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
-                                              Src1, Src2, Src3, Src4),
-                                  Mask, PassThru, Subtarget, DAG);
-    }
     case CVTPD2PS:
       // ISD::FP_ROUND has a second argument that indicates if the truncation
       // does not change the value. Set it to 0 since it can change.
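For reference while reading the rest of the patch: the trailing i32 operand of these intrinsics is the VPTERNLOG truth-table immediate. Each result bit is produced by using one bit from each of the three sources as a 3-bit index into the 8-bit immediate. A minimal scalar sketch of one 32-bit lane (illustrative C++, not part of the patch; the helper name is made up):

    #include <cstdint>

    // One 32-bit lane of VPTERNLOGD: the three source bits form a 3-bit
    // index (src1 supplies the high bit) selecting one immediate bit.
    uint32_t ternlogLane(uint32_t A, uint32_t B, uint32_t C, uint8_t Imm) {
      uint32_t R = 0;
      for (unsigned I = 0; I != 32; ++I) {
        unsigned Idx = ((A >> I & 1) << 2) | ((B >> I & 1) << 1) | (C >> I & 1);
        R |= (uint32_t)(Imm >> Idx & 1) << I;
      }
      return R;
    }
    // Imm = 0x96 computes A ^ B ^ C; the tests below use 33 (0x21) and
    // 114 (0x72).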
Index: lib/Target/X86/X86IntrinsicsInfo.h
===================================================================
--- lib/Target/X86/X86IntrinsicsInfo.h
+++ lib/Target/X86/X86IntrinsicsInfo.h
@@ -36,7 +36,7 @@
   COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM,
   TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
   EXPAND_FROM_MEM,
-  TERLOG_OP_MASK, TERLOG_OP_MASKZ, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
+  FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
   FIXUPIMMS_MASKZ, GATHER_AVX2, ROUNDP, ROUNDS
 };
@@ -949,18 +949,6 @@
   X86_INTRINSIC_DATA(avx512_mask_psubus_w_128, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
   X86_INTRINSIC_DATA(avx512_mask_psubus_w_256, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
   X86_INTRINSIC_DATA(avx512_mask_psubus_w_512, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pternlog_d_128, TERLOG_OP_MASK,
-                     X86ISD::VPTERNLOG, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pternlog_d_256, TERLOG_OP_MASK,
-                     X86ISD::VPTERNLOG, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pternlog_d_512, TERLOG_OP_MASK,
-                     X86ISD::VPTERNLOG, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pternlog_q_128, TERLOG_OP_MASK,
-                     X86ISD::VPTERNLOG, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pternlog_q_256, TERLOG_OP_MASK,
-                     X86ISD::VPTERNLOG, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pternlog_q_512, TERLOG_OP_MASK,
-                     X86ISD::VPTERNLOG, 0),
   X86_INTRINSIC_DATA(avx512_mask_range_pd_128, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0),
   X86_INTRINSIC_DATA(avx512_mask_range_pd_256, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0),
   X86_INTRINSIC_DATA(avx512_mask_range_pd_512, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, X86ISD::VRANGE_RND),
@@ -1281,18 +1269,6 @@
                      X86ISD::VFIXUPIMMS, 0),
   X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ss, FIXUPIMMS_MASKZ,
                      X86ISD::VFIXUPIMMS, 0),
-  X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_128, TERLOG_OP_MASKZ,
-                     X86ISD::VPTERNLOG, 0),
-  X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_256, TERLOG_OP_MASKZ,
-                     X86ISD::VPTERNLOG, 0),
-  X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_512, TERLOG_OP_MASKZ,
-                     X86ISD::VPTERNLOG, 0),
-  X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_128, TERLOG_OP_MASKZ,
-                     X86ISD::VPTERNLOG, 0),
-  X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_256, TERLOG_OP_MASKZ,
-                     X86ISD::VPTERNLOG, 0),
-  X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_512, TERLOG_OP_MASKZ,
-                     X86ISD::VPTERNLOG, 0),
   X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_128, FMA_OP_MASKZ, ISD::FMA, 0),
   X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_256, FMA_OP_MASKZ, ISD::FMA, 0),
   X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_512, FMA_OP_MASKZ, ISD::FMA,
@@ -1456,6 +1432,12 @@
   X86_INTRINSIC_DATA(avx512_psrlv_w_128, INTR_TYPE_2OP, ISD::SRL, 0),
   X86_INTRINSIC_DATA(avx512_psrlv_w_256, INTR_TYPE_2OP, ISD::SRL, 0),
   X86_INTRINSIC_DATA(avx512_psrlv_w_512, INTR_TYPE_2OP, ISD::SRL, 0),
+  X86_INTRINSIC_DATA(avx512_pternlog_d_128, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
+  X86_INTRINSIC_DATA(avx512_pternlog_d_256, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
+  X86_INTRINSIC_DATA(avx512_pternlog_d_512, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
+  X86_INTRINSIC_DATA(avx512_pternlog_q_128, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
+  X86_INTRINSIC_DATA(avx512_pternlog_q_256, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
+  X86_INTRINSIC_DATA(avx512_pternlog_q_512, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
   X86_INTRINSIC_DATA(avx512_rcp14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
   X86_INTRINSIC_DATA(avx512_rcp14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
   X86_INTRINSIC_DATA(avx512_rcp14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
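With the TERLOG_OP_MASK/TERLOG_OP_MASKZ plumbing gone, masking for the upgraded calls is recreated entirely in IR by the EmitX86Select call in the AutoUpgrade hunk above. Assuming only the vector-mask case matters here (a hedged sketch of the idea, not the helper's exact body — the real one in AutoUpgrade.cpp also short-circuits an all-ones constant mask):

    // Sketch: reinterpret the iN mask as <N x i1> and blend the unmasked
    // result Op0 with the passthrough Op1, lane by lane.
    static Value *emitSelectSketch(IRBuilder<> &Builder, Value *Mask,
                                   Value *Op0, Value *Op1) {
      unsigned NumElts = Op0->getType()->getVectorNumElements();
      Mask = Builder.CreateBitCast(
          Mask, VectorType::get(Builder.getInt1Ty(), NumElts));
      return Builder.CreateSelect(Mask, Op0, Op1);
    }

For the mask variants the passthrough is the first source operand; for maskz it is the zero vector, matching what the deleted X86ISelLowering.cpp case used to build in the DAG.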
Index: test/CodeGen/X86/avx512-intrinsics-upgrade.ll
===================================================================
--- test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -4231,3 +4231,71 @@
   %res4 = add <16 x i32> %res3, %res2
   ret <16 x i32> %res4
 }
+
+declare <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_d_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
+; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm3
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    vpaddd %zmm3, %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %x4)
+  %res1 = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1)
+  %res2 = add <16 x i32> %res, %res1
+  ret <16 x i32> %res2
+}
+
+declare <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32, i16)
+
+define <16 x i32>@test_int_x86_avx512_maskz_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_d_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
+; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm3
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vpaddd %zmm3, %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %x4)
+  %res1 = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1)
+  %res2 = add <16 x i32> %res, %res1
+  ret <16 x i32> %res2
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32, i8)
+
+define <8 x i64>@test_int_x86_avx512_mask_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_q_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
+; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm3
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    vpaddq %zmm3, %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 %x4)
+  %res1 = call <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 -1)
+  %res2 = add <8 x i64> %res, %res1
+  ret <8 x i64> %res2
+}
+
+declare <8 x i64> @llvm.x86.avx512.maskz.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32, i8)
+
+define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_q_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
+; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm3
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vpaddq %zmm3, %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %res = call <8 x i64> @llvm.x86.avx512.maskz.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 %x4)
+  %res1 = call <8 x i64> @llvm.x86.avx512.maskz.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 -1)
+  %res2 = add <8 x i64> %res, %res1
+  ret <8 x i64> %res2
+}

Index: test/CodeGen/X86/avx512-intrinsics.ll
===================================================================
--- test/CodeGen/X86/avx512-intrinsics.ll
+++ test/CodeGen/X86/avx512-intrinsics.ll
@@ -3206,71 +3206,75 @@
   ret <4 x float> %res2
 }
 
-declare <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32, i16)
+declare <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32)
 
 define <16 x i32>@test_int_x86_avx512_mask_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_d_512:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
 ; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm3
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    vpaddd %zmm3, %zmm0, %zmm0
 ; CHECK-NEXT:    retq
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %x4)
-  %res1 = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1)
-  %res2 = add <16 x i32> %res, %res1
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
+  %2 = bitcast i16 %x4 to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
+  %4 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
+  %res2 = add <16 x i32> %3, %4
   ret <16 x i32> %res2
 }
 
-declare <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32, i16)
-
 define <16 x i32>@test_int_x86_avx512_maskz_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_d_512:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
 ; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm3
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    vpaddd %zmm3, %zmm0, %zmm0
 ; CHECK-NEXT:    retq
-  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %x4)
-  %res1 = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1)
-  %res2 = add <16 x i32> %res, %res1
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
+  %2 = bitcast i16 %x4 to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
+  %4 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
+  %res2 = add <16 x i32> %3, %4
   ret <16 x i32> %res2
 }
 
-declare <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32, i8)
+declare <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32)
 
 define <8 x i64>@test_int_x86_avx512_mask_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_q_512:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
 ; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm3
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    vpaddq %zmm3, %zmm0, %zmm0
 ; CHECK-NEXT:    retq
-  %res = call <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 %x4)
-  %res1 = call <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 -1)
-  %res2 = add <8 x i64> %res, %res1
+  %1 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33)
+  %2 = bitcast i8 %x4 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x0
+  %4 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33)
+  %res2 = add <8 x i64> %3, %4
   ret <8 x i64> %res2
 }
 
-declare <8 x i64> @llvm.x86.avx512.maskz.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32, i8)
-
 define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_q_512:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
 ; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm3
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    vpaddq %zmm3, %zmm0, %zmm0
 ; CHECK-NEXT:    retq
-  %res = call <8 x i64> @llvm.x86.avx512.maskz.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 %x4)
-  %res1 = call <8 x i64> @llvm.x86.avx512.maskz.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 -1)
-  %res2 = add <8 x i64> %res, %res1
+  %1 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33)
+  %2 = bitcast i8 %x4 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
+  %4 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33)
+  %res2 = add <8 x i64> %3, %4
  ret <8 x i64> %res2
 }
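The commute tests in the next file rely on the fact that swapping two sources of a ternlog only permutes the truth-table index bits, so the operation is preserved by rewriting the immediate. A sketch of that remapping (illustrative C++, not part of the patch; the two example results match the CHECK lines that follow, e.g. immediate 114 becomes $78 for the 102 operand order and $116 for the 021 order):

    #include <cstdint>

    // Remap a VPTERNLOG immediate for a permutation of the sources.
    // Perm[i] names the old source (0..2) now sitting in position i.
    uint8_t commuteTernlogImm(uint8_t Imm, const int Perm[3]) {
      uint8_t New = 0;
      for (unsigned Idx = 0; Idx != 8; ++Idx) {
        unsigned Bit[3] = {Idx >> 2 & 1, Idx >> 1 & 1, Idx & 1};
        unsigned Old = (Bit[Perm[0]] << 2) | (Bit[Perm[1]] << 1) | Bit[Perm[2]];
        New |= (Imm >> Old & 1) << Idx;
      }
      return New;
    }
    // int Swap102[3] = {1, 0, 2};  commuteTernlogImm(114, Swap102) == 78
    // int Swap021[3] = {0, 2, 1};  commuteTernlogImm(114, Swap021) == 116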
Index: test/CodeGen/X86/avx512-vpternlog-commute.ll
===================================================================
--- test/CodeGen/X86/avx512-vpternlog-commute.ll
+++ test/CodeGen/X86/avx512-vpternlog-commute.ll
@@ -3,16 +3,15 @@
 ; These test cases demonstrate cases where vpternlog could benefit from being commuted.
 
-declare <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32, i16)
-declare <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32, i16)
+declare <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32)
 
 define <16 x i32> @vpternlog_v16i32_012(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
 ; CHECK-LABEL: vpternlog_v16i32_012:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    vpternlogd $114, %zmm2, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 -1)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114)
+  ret <16 x i32> %1
 }
 
 define <16 x i32> @vpternlog_v16i32_102(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
@@ -20,8 +19,8 @@
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    vpternlogd $78, %zmm2, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 -1)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114)
+  ret <16 x i32> %1
 }
 
 define <16 x i32> @vpternlog_v16i32_210(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
@@ -29,8 +28,8 @@
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    vpternlogd $92, %zmm1, %zmm2, %zmm0
 ; CHECK-NEXT:    retq
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 -1)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114)
+  ret <16 x i32> %1
 }
 
 define <16 x i32> @vpternlog_v16i32_012_load0(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2) {
@@ -39,8 +38,8 @@
 ; CHECK-NEXT:    vpternlogd $46, (%rdi), %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x0 = load <16 x i32>, <16 x i32>* %x0ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 -1)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114)
+  ret <16 x i32> %1
 }
 
 define <16 x i32> @vpternlog_v16i32_012_load1(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2) {
@@ -49,8 +48,8 @@
 ; CHECK-NEXT:    vpternlogd $116, (%rdi), %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x1 = load <16 x i32>, <16 x i32>* %x1ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 -1)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114)
+  ret <16 x i32> %1
 }
 
 define <16 x i32> @vpternlog_v16i32_012_load2(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr) {
@@ -59,8 +58,8 @@
 ; CHECK-NEXT:    vpternlogd $114, (%rdi), %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x2 = load <16 x i32>, <16 x i32>* %x2ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 -1)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114)
+  ret <16 x i32> %1
 }
 
 define <16 x i32> @vpternlog_v16i32_102_load0(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2) {
@@ -69,8 +68,8 @@
 ; CHECK-NEXT:    vpternlogd $116, (%rdi), %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x0 = load <16 x i32>, <16 x i32>* %x0ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 -1)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114)
+  ret <16 x i32> %1
 }
 
 define <16 x i32> @vpternlog_v16i32_102_load1(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2) {
@@ -79,8 +78,8 @@
 ; CHECK-NEXT:    vpternlogd $46, (%rdi), %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x1 = load <16 x i32>, <16 x i32>* %x1ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 -1)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114)
+  ret <16 x i32> %1
 }
 
 define <16 x i32> @vpternlog_v16i32_102_load2(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr) {
@@ -89,8 +88,8 @@
 ; CHECK-NEXT:    vpternlogd $78, (%rdi), %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x2 = load <16 x i32>, <16 x i32>* %x2ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 -1)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114)
+  ret <16 x i32> %1
 }
 
 define <16 x i32> @vpternlog_v16i32_210_load0(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2) {
@@ -99,8 +98,8 @@
 ; CHECK-NEXT:    vpternlogd $78, (%rdi), %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x0 = load <16 x i32>, <16 x i32>* %x0ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 -1)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114)
+  ret <16 x i32> %1
 }
 
 define <16 x i32> @vpternlog_v16i32_210_load1(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2) {
@@ -109,8 +108,8 @@
 ; CHECK-NEXT:    vpternlogd $92, (%rdi), %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x1 = load <16 x i32>, <16 x i32>* %x1ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 -1)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114)
+  ret <16 x i32> %1
 }
 
 define <16 x i32> @vpternlog_v16i32_210_load2(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr) {
@@ -119,8 +118,8 @@
 ; CHECK-NEXT:    vpternlogd $58, (%rdi), %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x2 = load <16 x i32>, <16 x i32>* %x2ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 -1)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114)
+  ret <16 x i32> %1
 }
 
 define <16 x i32> @vpternlog_v16i32_021_load0(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2) {
@@ -129,8 +128,8 @@
 ; CHECK-NEXT:    vpternlogd $58, (%rdi), %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x0 = load <16 x i32>, <16 x i32>* %x0ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114, i16 -1)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114)
+  ret <16 x i32> %1
 }
 
 define <16 x i32> @vpternlog_v16i32_021_load1(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2) {
@@ -139,8 +138,8 @@
 ; CHECK-NEXT:    vpternlogd $114, (%rdi), %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x1 = load <16 x i32>, <16 x i32>* %x1ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114, i16 -1)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114)
+  ret <16 x i32> %1
 }
 
 define <16 x i32> @vpternlog_v16i32_021_load2(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr) {
@@ -149,8 +148,8 @@
 ; CHECK-NEXT:    vpternlogd $116, (%rdi), %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x2 = load <16 x i32>, <16 x i32>* %x2ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114, i16 -1)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114)
+  ret <16 x i32> %1
 }
 
 define <16 x i32> @vpternlog_v16i32_012_mask(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
@@ -159,8 +158,10 @@
 ; CHECK-NEXT:    kmovd %edi, %k1
 ; CHECK-NEXT:    vpternlogd $114, %zmm2, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_102_mask(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
@@ -170,8 +171,10 @@
 ; CHECK-NEXT:    vpternlogd $114, %zmm2, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x1
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_210_mask(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
@@ -181,8 +184,10 @@
 ; CHECK-NEXT:    vpternlogd $114, %zmm0, %zmm1, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT:    retq
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x2
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_012_mask1(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
@@ -192,9 +197,9 @@
 ; CHECK-NEXT:    vpternlogd $78, %zmm2, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 -1)
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114)
   %mask.cast = bitcast i16 %mask to <16 x i1>
-  %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %x1
+  %res2 = select <16 x i1> %mask.cast, <16 x i32> %1, <16 x i32> %x1
   ret <16 x i32> %res2
 }
 
 define <16 x i32> @vpternlog_v16i32_012_mask2(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
@@ -205,23 +210,25 @@
 ; CHECK-NEXT:    vpternlogd $58, %zmm0, %zmm1, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT:    retq
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 -1)
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114)
   %mask.cast = bitcast i16 %mask to <16 x i1>
-  %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %x2
+  %res2 = select <16 x i1> %mask.cast, <16 x i32> %1, <16 x i32> %x2
   ret <16 x i32> %res2
 }
 
 define <16 x i32> @vpternlog_v16i32_012_load0_mask(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
 ; CHECK-LABEL: vpternlog_v16i32_012_load0_mask:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vpternlogd $114, %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %x0 = load <16 x i32>, <16 x i32>* %x0ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_012_load0_mask1(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
@@ -231,9 +238,9 @@
 ; CHECK-NEXT:    vpternlogd $65, (%rdi), %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %x0 = load <16 x i32>, <16 x i32>* %x0ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1)
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
   %mask.cast = bitcast i16 %mask to <16 x i1>
-  %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %x1
+  %res2 = select <16 x i1> %mask.cast, <16 x i32> %1, <16 x i32> %x1
   ret <16 x i32> %res2
 }
 
 define <16 x i32> @vpternlog_v16i32_012_load0_mask2(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
@@ -245,9 +252,9 @@
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x0 = load <16 x i32>, <16 x i32>* %x0ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1)
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
   %mask.cast = bitcast i16 %mask to <16 x i1>
-  %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %x2
+  %res2 = select <16 x i1> %mask.cast, <16 x i32> %1, <16 x i32> %x2
   ret <16 x i32> %res2
 }
 
 define <16 x i32> @vpternlog_v16i32_012_load1_mask(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2, i16 %mask) {
@@ -258,8 +265,10 @@
 ; CHECK-NEXT:    vpternlogd $116, (%rdi), %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %x1 = load <16 x i32>, <16 x i32>* %x1ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_012_load1_mask2(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2, i16 %mask) {
@@ -270,9 +279,9 @@
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x1 = load <16 x i32>, <16 x i32>* %x1ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1)
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
   %mask.cast = bitcast i16 %mask to <16 x i1>
-  %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %x2
+  %res2 = select <16 x i1> %mask.cast, <16 x i32> %1, <16 x i32> %x2
   ret <16 x i32> %res2
 }
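The masked groups that follow all use the same IR shape: the unmasked intrinsic, a bitcast of the i16 mask to <16 x i1>, and a select against the passthrough. As a scalar model of what the {%k1} (merge) and, further below, {%k1} {z} (zeroing) CHECK lines verify (illustrative C++, reusing the hypothetical ternlogLane sketch from earlier):

    // Merge masking keeps PassThru[I] where the mask bit is clear;
    // zero masking is the same call with PassThru pointing at zeros.
    void maskedTernlog(uint32_t R[16], const uint32_t A[16],
                       const uint32_t B[16], const uint32_t C[16],
                       const uint32_t PassThru[16], uint16_t Mask,
                       uint8_t Imm) {
      for (unsigned I = 0; I != 16; ++I)
        R[I] = (Mask >> I & 1) ? ternlogLane(A[I], B[I], C[I], Imm)
                               : PassThru[I];
    }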
 
 define <16 x i32> @vpternlog_v16i32_012_load2_mask(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr, i16 %mask) {
@@ -283,8 +292,10 @@
 ; CHECK-NEXT:    vpternlogd $114, (%rdi), %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %x2 = load <16 x i32>, <16 x i32>* %x2ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_012_load2_mask1(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr, i16 %mask) {
@@ -295,9 +306,9 @@
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x2 = load <16 x i32>, <16 x i32>* %x2ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1)
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
   %mask.cast = bitcast i16 %mask to <16 x i1>
-  %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %x1
+  %res2 = select <16 x i1> %mask.cast, <16 x i32> %1, <16 x i32> %x1
   ret <16 x i32> %res2
 }
 
 define <16 x i32> @vpternlog_v16i32_102_load0_mask(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
@@ -308,21 +319,25 @@
 ; CHECK-NEXT:    vpternlogd $116, (%rdi), %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %x0 = load <16 x i32>, <16 x i32>* %x0ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x1
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_102_load1_mask(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2, i16 %mask) {
 ; CHECK-LABEL: vpternlog_v16i32_102_load1_mask:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vpternlogd $114, %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %x1 = load <16 x i32>, <16 x i32>* %x1ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x1
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_102_load2_mask(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr, i16 %mask) {
@@ -333,8 +348,10 @@
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x2 = load <16 x i32>, <16 x i32>* %x2ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x1
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_210_load0_mask(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
@@ -345,8 +362,10 @@
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x0 = load <16 x i32>, <16 x i32>* %x0ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x2
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_210_load1_mask(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2, i16 %mask) {
@@ -357,34 +376,40 @@
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x1 = load <16 x i32>, <16 x i32>* %x1ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x2
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_210_load2_mask(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr, i16 %mask) {
 ; CHECK-LABEL: vpternlog_v16i32_210_load2_mask:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vpternlogd $114, %zmm0, %zmm1, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %x2 = load <16 x i32>, <16 x i32>* %x2ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x2
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_021_load0_mask(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
 ; CHECK-LABEL: vpternlog_v16i32_021_load0_mask:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vpternlogd $114, %zmm0, %zmm1, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %x0 = load <16 x i32>, <16 x i32>* %x0ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_021_load1_mask(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2, i16 %mask) {
@@ -394,8 +419,10 @@
 ; CHECK-NEXT:    vpternlogd $114, (%rdi), %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %x1 = load <16 x i32>, <16 x i32>* %x1ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_021_load2_mask(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr, i16 %mask) {
@@ -405,8 +432,10 @@
 ; CHECK-NEXT:    vpternlogd $116, (%rdi), %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %x2 = load <16 x i32>, <16 x i32>* %x2ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_012_maskz(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
@@ -415,8 +444,10 @@
 ; CHECK-NEXT:    kmovd %edi, %k1
 ; CHECK-NEXT:    vpternlogd $114, %zmm2, %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
-  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_102_maskz(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
@@ -425,8 +456,10 @@
 ; CHECK-NEXT:    kmovd %edi, %k1
 ; CHECK-NEXT:    vpternlogd $78, %zmm2, %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
-  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_210_maskz(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
@@ -435,8 +468,10 @@
 ; CHECK-NEXT:    kmovd %edi, %k1
 ; CHECK-NEXT:    vpternlogd $92, %zmm1, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
-  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_012_load0_maskz(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
@@ -446,8 +481,10 @@
 ; CHECK-NEXT:    vpternlogd $46, (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %x0 = load <16 x i32>, <16 x i32>* %x0ptr
-  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_012_load1_maskz(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2, i16 %mask) {
@@ -457,8 +494,10 @@
 ; CHECK-NEXT:    vpternlogd $116, (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %x1 = load <16 x i32>, <16 x i32>* %x1ptr
-  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_012_load2_maskz(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr, i16 %mask) {
@@ -468,8 +507,10 @@
 ; CHECK-NEXT:    vpternlogd $114, (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %x2 = load <16 x i32>, <16 x i32>* %x2ptr
-  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_102_load0_maskz(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
@@ -479,8 +520,10 @@
 ; CHECK-NEXT:    vpternlogd $116, (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %x0 = load <16 x i32>, <16 x i32>* %x0ptr
-  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_102_load1_maskz(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2, i16 %mask) {
@@ -490,8 +533,10 @@
 ; CHECK-NEXT:    vpternlogd $46, (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %x1 = load <16 x i32>, <16 x i32>* %x1ptr
-  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_102_load2_maskz(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr, i16 %mask) {
@@ -501,8 +546,10 @@
 ; CHECK-NEXT:    vpternlogd $78, (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %x2 = load <16 x i32>, <16 x i32>* %x2ptr
-  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_210_load0_maskz(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
@@ -512,8 +559,10 @@
 ; CHECK-NEXT:    vpternlogd $78, (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %x0 = load <16 x i32>, <16 x i32>* %x0ptr
-  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_210_load1_maskz(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2, i16 %mask) {
@@ -523,8 +572,10 @@
 ; CHECK-NEXT:    vpternlogd $92, (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %x1 = load <16 x i32>, <16 x i32>* %x1ptr
-  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_210_load2_maskz(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr, i16 %mask) {
@@ -534,8 +585,10 @@
 ; CHECK-NEXT:    vpternlogd $58, (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %x2 = load <16 x i32>, <16 x i32>* %x2ptr
-  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_021_load0_maskz(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
@@ -545,8 +598,10 @@
 ; CHECK-NEXT:    vpternlogd $58, (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %x0 = load <16 x i32>, <16 x i32>* %x0ptr
-  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_021_load1_maskz(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2, i16 %mask) {
@@ -556,8 +611,10 @@
 ; CHECK-NEXT:    vpternlogd $114, (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %x1 = load <16 x i32>, <16 x i32>* %x1ptr
-  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_021_load2_maskz(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr, i16 %mask) {
@@ -567,8 +624,10 @@
 ; CHECK-NEXT:    vpternlogd $116, (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %x2 = load <16 x i32>, <16 x i32>* %x2ptr
-  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_012_broadcast0(i32* %ptr_x0, <16 x i32> %x1, <16 x i32> %x2) {
@@ -579,8 +638,8 @@
   %x0_scalar = load i32, i32* %ptr_x0
   %vecinit.i = insertelement <16 x i32> undef, i32 %x0_scalar, i32 0
   %x0 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 -1)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114)
+  ret <16 x i32> %1
 }
 
 define <16 x i32> @vpternlog_v16i32_012_broadcast1(<16 x i32> %x0, i32* %ptr_x1, <16 x i32> %x2) {
@@ -591,8 +650,8 @@
   %x1_scalar = load i32, i32* %ptr_x1
   %vecinit.i = insertelement <16 x i32> undef, i32 %x1_scalar, i32 0
   %x1 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 -1)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114)
+  ret <16 x i32> %1
 }
 
 define <16 x i32> @vpternlog_v16i32_012_broadcast2(<16 x i32> %x0, <16 x i32> %x1, i32* %ptr_x2) {
@@ -603,8 +662,8 @@
   %x2_scalar = load i32, i32* %ptr_x2
   %vecinit.i = insertelement <16 x i32> undef, i32 %x2_scalar, i32 0
   %x2 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 -1)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114)
+  ret <16 x i32> %1
 }
 
 define <16 x i32> @vpternlog_v16i32_102_broadcast0(i32* %ptr_x0, <16 x i32> %x1, <16 x i32> %x2) {
@@ -615,8 +674,8 @@
   %x0_scalar = load i32, i32* %ptr_x0
   %vecinit.i = insertelement <16 x i32> undef, i32 %x0_scalar, i32 0
   %x0 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 -1)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114)
+  ret <16 x i32> %1
 }
 
 define <16 x i32> @vpternlog_v16i32_102_broadcast1(<16 x i32> %x0, i32* %ptr_x1, <16 x i32> %x2) {
@@ -627,8 +686,8 @@
   %x1_scalar = load i32, i32* %ptr_x1
   %vecinit.i = insertelement <16 x i32> undef, i32 %x1_scalar, i32 0
   %x1 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 -1)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114)
+  ret <16 x i32> %1
 }
 
 define <16 x i32> @vpternlog_v16i32_102_broadcast2(<16 x i32> %x0, <16 x i32> %x1, i32* %ptr_x2) {
@@ -639,8 +698,8 @@
   %x2_scalar = load i32, i32* %ptr_x2
   %vecinit.i = insertelement <16 x i32> undef, i32 %x2_scalar, i32 0
   %x2 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 -1)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114)
+  ret <16 x i32> %1
 }
 
 define <16 x i32> @vpternlog_v16i32_210_broadcast0(i32* %ptr_x0, <16 x i32> %x1, <16 x i32> %x2) {
@@ -651,8 +710,8 @@
   %x0_scalar = load i32, i32* %ptr_x0
   %vecinit.i = insertelement <16 x i32> undef, i32 %x0_scalar, i32 0
   %x0 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 -1)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114)
+  ret <16 x i32> %1
 }
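The remaining tests splat a scalar through the usual insertelement-plus-shufflevector idiom, which the backend can fold into an embedded broadcast memory operand; a separate vpbroadcastd only survives where the splatted value must also serve as the mask passthrough, as the CHECK lines below show. The idiom itself is just a splat (illustrative C++):

    // insertelement <16 x i32> undef, %s, 0 followed by a zero-mask
    // shufflevector replicates one scalar across all sixteen lanes.
    void splat16(uint32_t V[16], uint32_t S) {
      for (unsigned I = 0; I != 16; ++I)
        V[I] = S;
    }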
 
 define <16 x i32> @vpternlog_v16i32_210_broadcast1(<16 x i32> %x0, i32* %ptr_x1, <16 x i32> %x2) {
@@ -663,8 +722,8 @@
   %x1_scalar = load i32, i32* %ptr_x1
   %vecinit.i = insertelement <16 x i32> undef, i32 %x1_scalar, i32 0
   %x1 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 -1)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114)
+  ret <16 x i32> %1
 }
 
 define <16 x i32> @vpternlog_v16i32_210_broadcast2(<16 x i32> %x0, <16 x i32> %x1, i32* %ptr_x2) {
@@ -675,23 +734,25 @@
   %x2_scalar = load i32, i32* %ptr_x2
   %vecinit.i = insertelement <16 x i32> undef, i32 %x2_scalar, i32 0
   %x2 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 -1)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114)
+  ret <16 x i32> %1
 }
 
 define <16 x i32> @vpternlog_v16i32_012_broadcast0_mask(i32* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
 ; CHECK-LABEL: vpternlog_v16i32_012_broadcast0_mask:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vpbroadcastd (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vpternlogd $114, %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %x0scalar = load i32, i32* %x0ptr
   %vecinit.i = insertelement <16 x i32> undef, i32 %x0scalar, i32 0
   %x0 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_012_broadcast1_mask(<16 x i32> %x0, i32* %x1ptr, <16 x i32> %x2, i16 %mask) {
@@ -703,8 +764,10 @@
   %x1scalar = load i32, i32* %x1ptr
   %vecinit.i = insertelement <16 x i32> undef, i32 %x1scalar, i32 0
   %x1 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_012_broadcast2_mask(<16 x i32> %x0, <16 x i32> %x1, i32* %x2ptr, i16 %mask) {
@@ -716,8 +779,10 @@
   %x2scalar = load i32, i32* %x2ptr
   %vecinit.i = insertelement <16 x i32> undef, i32 %x2scalar, i32 0
   %x2 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_102_broadcast0_mask(i32* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
@@ -729,23 +794,27 @@
   %x0scalar = load i32, i32* %x0ptr
   %vecinit.i = insertelement <16 x i32> undef, i32 %x0scalar, i32 0
   %x0 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x1
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_102_broadcast1_mask(<16 x i32> %x0, i32* %x1ptr, <16 x i32> %x2, i16 %mask) {
 ; CHECK-LABEL: vpternlog_v16i32_102_broadcast1_mask:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vpbroadcastd (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vpternlogd $114, %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %x1scalar = load i32, i32* %x1ptr
   %vecinit.i = insertelement <16 x i32> undef, i32 %x1scalar, i32 0
   %x1 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x1
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_102_broadcast2_mask(<16 x i32> %x0, <16 x i32> %x1, i32* %x2ptr, i16 %mask) {
@@ -758,8 +827,10 @@
   %x2scalar = load i32, i32* %x2ptr
   %vecinit.i = insertelement <16 x i32> undef, i32 %x2scalar, i32 0
   %x2 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x1
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_210_broadcast0_mask(i32* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
@@ -772,8 +843,10 @@
   %x0scalar = load i32, i32* %x0ptr
   %vecinit.i = insertelement <16 x i32> undef, i32 %x0scalar, i32 0
   %x0 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x2
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_210_broadcast1_mask(<16 x i32> %x0, i32* %x1ptr, <16 x i32> %x2, i16 %mask) {
@@ -786,38 +859,44 @@
   %x1scalar = load i32, i32* %x1ptr
   %vecinit.i = insertelement <16 x i32> undef, i32 %x1scalar, i32 0
   %x1 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x2
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_210_broadcast2_mask(<16 x i32> %x0, <16 x i32> %x1, i32* %x2ptr, i16 %mask) {
 ; CHECK-LABEL: vpternlog_v16i32_210_broadcast2_mask:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vpbroadcastd (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vpternlogd $114, %zmm0, %zmm1, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %x2scalar = load i32, i32* %x2ptr
   %vecinit.i = insertelement <16 x i32> undef, i32 %x2scalar, i32 0
   %x2 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x2
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_021_broadcast0_mask(i32* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
 ; CHECK-LABEL: vpternlog_v16i32_021_broadcast0_mask:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vpbroadcastd (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vpternlogd $114, %zmm0, %zmm1, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %x0scalar = load i32, i32* %x0ptr
   %vecinit.i = insertelement <16 x i32> undef, i32 %x0scalar, i32 0
   %x0 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_021_broadcast1_mask(<16 x i32> %x0, i32* %x1ptr, <16 x i32> %x2, i16 %mask) {
@@ -829,8 +908,10 @@
   %x1scalar = load i32, i32* %x1ptr
   %vecinit.i = insertelement <16 x i32> undef, i32 %x1scalar, i32 0
   %x1 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
+  ret <16 x i32> %3
 }
 
 define <16 x i32> @vpternlog_v16i32_021_broadcast2_mask(<16 x i32> %x0, <16 x i32> %x1, i32* %x2ptr, i16 %mask) {
@@ -842,8 +923,10 @@
   %x2scalar = load i32, i32* %x2ptr
   %vecinit.i = insertelement <16 x i32> undef, i32 %x2scalar, i32 0
   %x2 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114, i16 %mask)
-  ret <16 x i32> %res
+  %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0,
<16 x i32> %x2, <16 x i32> %x1, i32 114) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0 + ret <16 x i32> %3 } define <16 x i32> @vpternlog_v16i32_012_broadcast0_maskz(i32* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) { @@ -855,8 +938,10 @@ %x0scalar = load i32, i32* %x0ptr %vecinit.i = insertelement <16 x i32> undef, i32 %x0scalar, i32 0 %x0 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer - %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 %mask) - ret <16 x i32> %res + %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer + ret <16 x i32> %3 } define <16 x i32> @vpternlog_v16i32_012_broadcast1_maskz(<16 x i32> %x0, i32* %x1ptr, <16 x i32> %x2, i16 %mask) { @@ -868,8 +953,10 @@ %x1scalar = load i32, i32* %x1ptr %vecinit.i = insertelement <16 x i32> undef, i32 %x1scalar, i32 0 %x1 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer - %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 %mask) - ret <16 x i32> %res + %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer + ret <16 x i32> %3 } define <16 x i32> @vpternlog_v16i32_012_broadcast2_maskz(<16 x i32> %x0, <16 x i32> %x1, i32* %x2ptr, i16 %mask) { @@ -881,8 +968,10 @@ %x2scalar = load i32, i32* %x2ptr %vecinit.i = insertelement <16 x i32> undef, i32 %x2scalar, i32 0 %x2 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer - %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 %mask) - ret <16 x i32> %res + %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer + ret <16 x i32> %3 } define <16 x i32> @vpternlog_v16i32_102_broadcast0_maskz(i32* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) { @@ -894,8 +983,10 @@ %x0scalar = load i32, i32* %x0ptr %vecinit.i = insertelement <16 x i32> undef, i32 %x0scalar, i32 0 %x0 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer - %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 %mask) - ret <16 x i32> %res + %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer + ret <16 x i32> %3 } define <16 x i32> @vpternlog_v16i32_102_broadcast1_maskz(<16 x i32> %x0, i32* %x1ptr, <16 x i32> %x2, i16 %mask) { @@ -907,8 +998,10 @@ %x1scalar = load i32, i32* %x1ptr %vecinit.i = insertelement <16 x i32> undef, i32 %x1scalar, i32 0 %x1 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer - %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 %mask) - ret <16 x i32> %res + %1 = call <16 x i32> 
@llvm.x86.avx512.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer + ret <16 x i32> %3 } define <16 x i32> @vpternlog_v16i32_102_broadcast2_maskz(<16 x i32> %x0, <16 x i32> %x1, i32* %x2ptr, i16 %mask) { @@ -920,8 +1013,10 @@ %x2scalar = load i32, i32* %x2ptr %vecinit.i = insertelement <16 x i32> undef, i32 %x2scalar, i32 0 %x2 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer - %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 %mask) - ret <16 x i32> %res + %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer + ret <16 x i32> %3 } define <16 x i32> @vpternlog_v16i32_210_broadcast0_maskz(i32* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) { @@ -933,8 +1028,10 @@ %x0scalar = load i32, i32* %x0ptr %vecinit.i = insertelement <16 x i32> undef, i32 %x0scalar, i32 0 %x0 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer - %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 %mask) - ret <16 x i32> %res + %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer + ret <16 x i32> %3 } define <16 x i32> @vpternlog_v16i32_210_broadcast1_maskz(<16 x i32> %x0, i32* %x1ptr, <16 x i32> %x2, i16 %mask) { @@ -946,8 +1043,10 @@ %x1scalar = load i32, i32* %x1ptr %vecinit.i = insertelement <16 x i32> undef, i32 %x1scalar, i32 0 %x1 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer - %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 %mask) - ret <16 x i32> %res + %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer + ret <16 x i32> %3 } define <16 x i32> @vpternlog_v16i32_210_broadcast2_maskz(<16 x i32> %x0, <16 x i32> %x1, i32* %x2ptr, i16 %mask) { @@ -959,8 +1058,10 @@ %x2scalar = load i32, i32* %x2ptr %vecinit.i = insertelement <16 x i32> undef, i32 %x2scalar, i32 0 %x2 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer - %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 %mask) - ret <16 x i32> %res + %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer + ret <16 x i32> %3 } define <16 x i32> @vpternlog_v16i32_021_broadcast0_maskz(i32* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) { @@ -972,8 +1073,10 @@ %x0scalar = load i32, i32* %x0ptr %vecinit.i = insertelement <16 x i32> undef, i32 %x0scalar, i32 0 %x0 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer - %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114, i16 %mask) 
- ret <16 x i32> %res + %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer + ret <16 x i32> %3 } define <16 x i32> @vpternlog_v16i32_021_broadcast1_maskz(<16 x i32> %x0, i32* %x1ptr, <16 x i32> %x2, i16 %mask) { @@ -985,8 +1088,10 @@ %x1scalar = load i32, i32* %x1ptr %vecinit.i = insertelement <16 x i32> undef, i32 %x1scalar, i32 0 %x1 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer - %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114, i16 %mask) - ret <16 x i32> %res + %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer + ret <16 x i32> %3 } define <16 x i32> @vpternlog_v16i32_021_broadcast2_maskz(<16 x i32> %x0, <16 x i32> %x1, i32* %x2ptr, i16 %mask) { @@ -998,8 +1103,10 @@ %x2scalar = load i32, i32* %x2ptr %vecinit.i = insertelement <16 x i32> undef, i32 %x2scalar, i32 0 %x2 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer - %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114, i16 %mask) - ret <16 x i32> %res + %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer + ret <16 x i32> %3 } define <16 x i32> @vpternlog_v16i32_012_broadcast0_mask1(i32* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) { @@ -1011,9 +1118,9 @@ %x0scalar = load i32, i32* %x0ptr %vecinit.i = insertelement <16 x i32> undef, i32 %x0scalar, i32 0 %x0 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer - %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 -1) + %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114) %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %x1 + %res2 = select <16 x i1> %mask.cast, <16 x i32> %1, <16 x i32> %x1 ret <16 x i32> %res2 } @@ -1027,9 +1134,9 @@ %x0scalar = load i32, i32* %x0ptr %vecinit.i = insertelement <16 x i32> undef, i32 %x0scalar, i32 0 %x0 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer - %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 -1) + %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114) %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %x2 + %res2 = select <16 x i1> %mask.cast, <16 x i32> %1, <16 x i32> %x2 ret <16 x i32> %res2 } @@ -1043,9 +1150,9 @@ %x1scalar = load i32, i32* %x1ptr %vecinit.i = insertelement <16 x i32> undef, i32 %x1scalar, i32 0 %x1 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer - %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 -1) + %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, 
<16 x i32> %x2, i32 114) %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %x2 + %res2 = select <16 x i1> %mask.cast, <16 x i32> %1, <16 x i32> %x2 ret <16 x i32> %res2 } @@ -1059,8 +1166,8 @@ %x2scalar = load i32, i32* %x2ptr %vecinit.i = insertelement <16 x i32> undef, i32 %x2scalar, i32 0 %x2 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer - %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 -1) + %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114) %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %x1 + %res2 = select <16 x i1> %mask.cast, <16 x i32> %1, <16 x i32> %x1 ret <16 x i32> %res2 } Index: test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll =================================================================== --- test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -6817,3 +6817,139 @@ %res4 = add <4 x i64> %res3, %res2 ret <4 x i64> %res4 } + +declare <4 x i32> @llvm.x86.avx512.mask.pternlog.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i32, i8) + +define <4 x i32>@test_int_x86_avx512_mask_pternlog_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_d_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovdqa %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] +; CHECK-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf3,0x75,0x08,0x25,0xda,0x21] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf3,0x75,0x09,0x25,0xc2,0x21] +; CHECK-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33, i8 %x4) + %res1 = call <4 x i32> @llvm.x86.avx512.mask.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33, i8 -1) + %res2 = add <4 x i32> %res, %res1 + ret <4 x i32> %res2 +} + +declare <4 x i32> @llvm.x86.avx512.maskz.pternlog.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i32, i8) + +define <4 x i32>@test_int_x86_avx512_maskz_pternlog_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_d_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovdqa %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] +; CHECK-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf3,0x75,0x08,0x25,0xda,0x21] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0x89,0x25,0xc2,0x21] +; CHECK-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.maskz.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33, i8 %x4) + %res1 = call <4 x i32> @llvm.x86.avx512.maskz.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33, i8 -1) + %res2 = add <4 x i32> %res, %res1 + ret <4 x i32> %res2 +} + +declare <8 x i32> @llvm.x86.avx512.mask.pternlog.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i32, i8) + +define <8 x 
i32>@test_int_x86_avx512_mask_pternlog_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_d_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovdqa %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] +; CHECK-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm3 ## encoding: [0x62,0xf3,0x75,0x28,0x25,0xda,0x21] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf3,0x75,0x29,0x25,0xc2,0x21] +; CHECK-NEXT: vpaddd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33, i8 %x4) + %res1 = call <8 x i32> @llvm.x86.avx512.mask.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33, i8 -1) + %res2 = add <8 x i32> %res, %res1 + ret <8 x i32> %res2 +} + +declare <8 x i32> @llvm.x86.avx512.maskz.pternlog.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i32, i8) + +define <8 x i32>@test_int_x86_avx512_maskz_pternlog_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_d_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovdqa %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] +; CHECK-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm3 ## encoding: [0x62,0xf3,0x75,0x28,0x25,0xda,0x21] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0xa9,0x25,0xc2,0x21] +; CHECK-NEXT: vpaddd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.maskz.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33, i8 %x4) + %res1 = call <8 x i32> @llvm.x86.avx512.maskz.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33, i8 -1) + %res2 = add <8 x i32> %res, %res1 + ret <8 x i32> %res2 +} + +declare <2 x i64> @llvm.x86.avx512.mask.pternlog.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i32, i8) + +define <2 x i64>@test_int_x86_avx512_mask_pternlog_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_q_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovdqa %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] +; CHECK-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf3,0xf5,0x08,0x25,0xda,0x21] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf3,0xf5,0x09,0x25,0xc2,0x21] +; CHECK-NEXT: vpaddq %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33, i8 %x4) + %res1 = call <2 x i64> @llvm.x86.avx512.mask.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33, i8 -1) + %res2 = add <2 x i64> %res, %res1 + ret <2 x i64> %res2 +} + +declare <2 x i64> @llvm.x86.avx512.maskz.pternlog.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i32, i8) + +define <2 x i64>@test_int_x86_avx512_maskz_pternlog_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_q_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovdqa %xmm0, %xmm3 ## EVEX 
TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] +; CHECK-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf3,0xf5,0x08,0x25,0xda,0x21] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0x89,0x25,0xc2,0x21] +; CHECK-NEXT: vpaddq %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.maskz.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33, i8 %x4) + %res1 = call <2 x i64> @llvm.x86.avx512.maskz.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33, i8 -1) + %res2 = add <2 x i64> %res, %res1 + ret <2 x i64> %res2 +} + +declare <4 x i64> @llvm.x86.avx512.mask.pternlog.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i32, i8) + +define <4 x i64>@test_int_x86_avx512_mask_pternlog_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_q_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovdqa %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] +; CHECK-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm3 ## encoding: [0x62,0xf3,0xf5,0x28,0x25,0xda,0x21] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf3,0xf5,0x29,0x25,0xc2,0x21] +; CHECK-NEXT: vpaddq %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33, i8 %x4) + %res1 = call <4 x i64> @llvm.x86.avx512.mask.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33, i8 -1) + %res2 = add <4 x i64> %res, %res1 + ret <4 x i64> %res2 +} + +declare <4 x i64> @llvm.x86.avx512.maskz.pternlog.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i32, i8) + +define <4 x i64>@test_int_x86_avx512_maskz_pternlog_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_q_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovdqa %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] +; CHECK-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm3 ## encoding: [0x62,0xf3,0xf5,0x28,0x25,0xda,0x21] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x25,0xc2,0x21] +; CHECK-NEXT: vpaddq %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.maskz.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33, i8 %x4) + %res1 = call <4 x i64> @llvm.x86.avx512.maskz.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33, i8 -1) + %res2 = add <4 x i64> %res, %res1 + ret <4 x i64> %res2 +} Index: test/CodeGen/X86/avx512vl-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512vl-intrinsics.ll +++ test/CodeGen/X86/avx512vl-intrinsics.ll @@ -2222,20 +2222,23 @@ ret <8 x float> %res2 } -declare <4 x i32> @llvm.x86.avx512.mask.pternlog.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i32, i8) +declare <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i32) define <4 x i32>@test_int_x86_avx512_mask_pternlog_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x4) { ; 
CHECK-LABEL: test_int_x86_avx512_mask_pternlog_d_128: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovdqa %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] ; CHECK-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf3,0x75,0x08,0x25,0xda,0x21] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf3,0x75,0x09,0x25,0xc2,0x21] ; CHECK-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3] ; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <4 x i32> @llvm.x86.avx512.mask.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33, i8 %x4) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33, i8 -1) - %res2 = add <4 x i32> %res, %res1 + %1 = call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33) + %2 = bitcast i8 %x4 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> + %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x0 + %4 = call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33) + %res2 = add <4 x i32> %3, %4 ret <4 x i32> %res2 } @@ -2244,32 +2247,37 @@ define <4 x i32>@test_int_x86_avx512_maskz_pternlog_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_d_128: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovdqa %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] ; CHECK-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf3,0x75,0x08,0x25,0xda,0x21] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0x89,0x25,0xc2,0x21] ; CHECK-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3] ; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <4 x i32> @llvm.x86.avx512.maskz.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33, i8 %x4) - %res1 = call <4 x i32> @llvm.x86.avx512.maskz.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33, i8 -1) - %res2 = add <4 x i32> %res, %res1 + %1 = call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33) + %2 = bitcast i8 %x4 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> + %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> zeroinitializer + %4 = call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33) + %res2 = add <4 x i32> %3, %4 ret <4 x i32> %res2 } -declare <8 x i32> @llvm.x86.avx512.mask.pternlog.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i32, i8) +declare <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i32) define <8 x i32>@test_int_x86_avx512_mask_pternlog_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_d_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovdqa %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] ; CHECK-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm3 ## encoding: [0x62,0xf3,0x75,0x28,0x25,0xda,0x21] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: 
[0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf3,0x75,0x29,0x25,0xc2,0x21] ; CHECK-NEXT: vpaddd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc3] ; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <8 x i32> @llvm.x86.avx512.mask.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33, i8 %x4) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33, i8 -1) - %res2 = add <8 x i32> %res, %res1 + %1 = call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33) + %2 = bitcast i8 %x4 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x0 + %4 = call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33) + %res2 = add <8 x i32> %3, %4 ret <8 x i32> %res2 } @@ -2278,83 +2286,93 @@ define <8 x i32>@test_int_x86_avx512_maskz_pternlog_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_d_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovdqa %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] ; CHECK-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm3 ## encoding: [0x62,0xf3,0x75,0x28,0x25,0xda,0x21] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0xa9,0x25,0xc2,0x21] ; CHECK-NEXT: vpaddd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc3] ; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <8 x i32> @llvm.x86.avx512.maskz.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33, i8 %x4) - %res1 = call <8 x i32> @llvm.x86.avx512.maskz.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33, i8 -1) - %res2 = add <8 x i32> %res, %res1 + %1 = call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33) + %2 = bitcast i8 %x4 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> zeroinitializer + %4 = call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33) + %res2 = add <8 x i32> %3, %4 ret <8 x i32> %res2 } -declare <2 x i64> @llvm.x86.avx512.mask.pternlog.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i32, i8) +declare <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i32) define <2 x i64>@test_int_x86_avx512_mask_pternlog_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_q_128: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovdqa %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] ; CHECK-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf3,0xf5,0x08,0x25,0xda,0x21] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf3,0xf5,0x09,0x25,0xc2,0x21] ; CHECK-NEXT: vpaddq %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc3] ; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <2 x i64> @llvm.x86.avx512.mask.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33, i8 %x4) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33, i8 -1) - %res2 = add <2 x i64> 
%res, %res1 + %1 = call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33) + %2 = bitcast i8 %x4 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> + %3 = select <2 x i1> %extract, <2 x i64> %1, <2 x i64> %x0 + %4 = call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33) + %res2 = add <2 x i64> %3, %4 ret <2 x i64> %res2 } -declare <2 x i64> @llvm.x86.avx512.maskz.pternlog.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i32, i8) - define <2 x i64>@test_int_x86_avx512_maskz_pternlog_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_q_128: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovdqa %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] ; CHECK-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf3,0xf5,0x08,0x25,0xda,0x21] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0x89,0x25,0xc2,0x21] ; CHECK-NEXT: vpaddq %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc3] ; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <2 x i64> @llvm.x86.avx512.maskz.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33, i8 %x4) - %res1 = call <2 x i64> @llvm.x86.avx512.maskz.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33, i8 -1) - %res2 = add <2 x i64> %res, %res1 + %1 = call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33) + %2 = bitcast i8 %x4 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> + %3 = select <2 x i1> %extract, <2 x i64> %1, <2 x i64> zeroinitializer + %4 = call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33) + %res2 = add <2 x i64> %3, %4 ret <2 x i64> %res2 } -declare <4 x i64> @llvm.x86.avx512.mask.pternlog.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i32, i8) +declare <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i32) define <4 x i64>@test_int_x86_avx512_mask_pternlog_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_q_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovdqa %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] ; CHECK-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm3 ## encoding: [0x62,0xf3,0xf5,0x28,0x25,0xda,0x21] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf3,0xf5,0x29,0x25,0xc2,0x21] ; CHECK-NEXT: vpaddq %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3] ; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <4 x i64> @llvm.x86.avx512.mask.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33, i8 %x4) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33, i8 -1) - %res2 = add <4 x i64> %res, %res1 + %1 = call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33) + %2 = bitcast i8 %x4 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> + %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %x0 + %4 = call <4 x i64> 
@llvm.x86.avx512.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33) + %res2 = add <4 x i64> %3, %4 ret <4 x i64> %res2 } -declare <4 x i64> @llvm.x86.avx512.maskz.pternlog.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i32, i8) - define <4 x i64>@test_int_x86_avx512_maskz_pternlog_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_q_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovdqa %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] ; CHECK-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm3 ## encoding: [0x62,0xf3,0xf5,0x28,0x25,0xda,0x21] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x25,0xc2,0x21] ; CHECK-NEXT: vpaddq %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3] ; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <4 x i64> @llvm.x86.avx512.maskz.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33, i8 %x4) - %res1 = call <4 x i64> @llvm.x86.avx512.maskz.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33, i8 -1) - %res2 = add <4 x i64> %res, %res1 + %1 = call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33) + %2 = bitcast i8 %x4 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> + %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> zeroinitializer + %4 = call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33) + %res2 = add <4 x i64> %3, %4 ret <4 x i64> %res2 } Index: test/CodeGen/X86/stack-folding-int-avx512.ll =================================================================== --- test/CodeGen/X86/stack-folding-int-avx512.ll +++ test/CodeGen/X86/stack-folding-int-avx512.ll @@ -1798,20 +1798,20 @@ ;CHECK-LABEL: stack_fold_ternlogd ;CHECK: vpternlogd $33, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1) - ret <16 x i32> %res + %2 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33) + ret <16 x i32> %2 } -declare <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32, i16) +declare <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32) define <8 x i64> @stack_fold_ternlogq(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) { ;CHECK-LABEL: stack_fold_ternlogq ;CHECK: vpternlogq $33, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %res = call <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 
x i64> %x2, i32 33, i8 -1) - ret <8 x i64> %res + %2 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33) + ret <8 x i64> %2 } -declare <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32, i8) +declare <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32) define <64 x i8> @stack_fold_punpckhbw_zmm(<64 x i8> %a0, <64 x i8> %a1) { ;CHECK-LABEL: stack_fold_punpckhbw_zmm