Index: include/llvm/IR/IntrinsicsX86.td =================================================================== --- include/llvm/IR/IntrinsicsX86.td +++ include/llvm/IR/IntrinsicsX86.td @@ -4026,6 +4026,12 @@ def int_x86_avx512_kunpck_bw : GCCBuiltin<"__builtin_ia32_kunpckhi">, Intrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty], [IntrNoMem]>; + def int_x86_avx512_kunpck_wd : GCCBuiltin<"__builtin_ia32_kunpcksi">, + Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem]>; + def int_x86_avx512_kunpck_dq : GCCBuiltin<"__builtin_ia32_kunpckdi">, + Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], + [IntrNoMem]>; def int_x86_avx512_kortestz_w : GCCBuiltin<"__builtin_ia32_kortestzhi">, Intrinsic<[llvm_i32_ty], [llvm_i16_ty, llvm_i16_ty], [IntrNoMem]>; Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -15998,19 +15998,24 @@ } if (Mask.getSimpleValueType() == MVT::i64 && Subtarget->is32Bit()) { - assert(MaskVT == MVT::v64i1 && "Unexpected mask VT!"); - assert(Subtarget->hasBWI() && "Expected AVX512BW target!"); - // In case 32bit mode, bitcast i64 is illegal, extend/split it. - SDValue Lo, Hi; - Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, - DAG.getConstant(0, dl, MVT::i32)); - Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, - DAG.getConstant(1, dl, MVT::i32)); + if (MaskVT == MVT::v64i1) { + assert(Subtarget->hasBWI() && "Expected AVX512BW target!"); + // In case 32bit mode, bitcast i64 is illegal, extend/split it. + SDValue Lo, Hi; + Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, + DAG.getConstant(0, dl, MVT::i32)); + Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, + DAG.getConstant(1, dl, MVT::i32)); - Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v32i1, Lo); - Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v32i1, Hi); + Lo = DAG.getBitcast(MVT::v32i1, Lo); + Hi = DAG.getBitcast(MVT::v32i1, Hi); - return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Hi, Lo); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi); + } else { + MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits()); + return DAG.getBitcast(MaskVT, + DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask)); + } } else { MVT BitcastVT = MVT::getVectorVT(MVT::i1, @@ -16600,6 +16605,18 @@ return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1), Op.getOperand(2)); } + case KUNPCK: { + MVT VT = Op.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2); + + SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl); + SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl); + // Arguments should be swapped. + SDValue Res = DAG.getNode(IntrData->Opc0, dl, + MVT::getVectorVT(MVT::i1, VT.getSizeInBits()), + Src2, Src1); + return DAG.getBitcast(VT, Res); + } default: break; } @@ -20001,8 +20018,9 @@ } } case ISD::INTRINSIC_WO_CHAIN: { - Results.push_back(LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG)); - return; + if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG)) + Results.push_back(V); + return; } case ISD::READCYCLECOUNTER: { return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, Index: lib/Target/X86/X86InstrAVX512.td =================================================================== --- lib/Target/X86/X86InstrAVX512.td +++ lib/Target/X86/X86InstrAVX512.td @@ -2396,16 +2396,6 @@ defm KUNPCKWD : avx512_mask_unpck<"wd", VK32, v32i1, VK16, HasBWI>, PS; defm KUNPCKDQ : avx512_mask_unpck<"dq", VK64, v64i1, VK32, HasBWI>, PS, VEX_W; -multiclass avx512_mask_unpck_int { - let Predicates = [HasAVX512] in - def : Pat<(!cast("int_x86_avx512_"##IntName##"_bw") - (i16 GR16:$src1), (i16 GR16:$src2)), - (COPY_TO_REGCLASS (!cast(InstName##"BWrr") - (v16i1 (COPY_TO_REGCLASS GR16:$src1, VK16)), - (v16i1 (COPY_TO_REGCLASS GR16:$src2, VK16))), GR16)>; -} -defm : avx512_mask_unpck_int<"kunpck", "KUNPCK">; - // Mask bit testing multiclass avx512_mask_testop opc, string OpcodeStr, RegisterClass KRC, SDNode OpNode, Predicate prd> { @@ -2496,6 +2486,9 @@ def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))), (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>; +def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 0))), + (v16i1 (COPY_TO_REGCLASS VK32:$src, VK16))>; + def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 0))), (v32i1 (COPY_TO_REGCLASS VK64:$src, VK32))>; Index: lib/Target/X86/X86IntrinsicsInfo.h =================================================================== --- lib/Target/X86/X86IntrinsicsInfo.h +++ lib/Target/X86/X86IntrinsicsInfo.h @@ -30,7 +30,7 @@ COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32, EXPAND_FROM_MEM, BLEND, INSERT_SUBVEC, - TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM + TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK }; struct IntrinsicData { @@ -341,7 +341,9 @@ X86_INTRINSIC_DATA(avx512_cvtusi642ss, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0), X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), - + X86_INTRINSIC_DATA(avx512_kunpck_bw, KUNPCK, ISD::CONCAT_VECTORS, 0), + X86_INTRINSIC_DATA(avx512_kunpck_dq, KUNPCK, ISD::CONCAT_VECTORS, 0), + X86_INTRINSIC_DATA(avx512_kunpck_wd, KUNPCK, ISD::CONCAT_VECTORS, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_128, FMA_OP_MASK3, X86ISD::FMADD, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_256, FMA_OP_MASK3, X86ISD::FMADD, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_512, FMA_OP_MASK3, X86ISD::FMADD, @@ -1827,7 +1829,7 @@ "Intrinsic data tables should have unique entries"); } -// X86 specific compare constants. +// X86 specific compare constants. // They must be kept in synch with avxintrin.h #define _X86_CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */ #define _X86_CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */ Index: test/CodeGen/X86/avx512-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512-intrinsics.ll +++ test/CodeGen/X86/avx512-intrinsics.ll @@ -65,9 +65,9 @@ define i16 @unpckbw_test(i16 %a0, i16 %a1) { ; CHECK-LABEL: unpckbw_test: ; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %esi, %k0 -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: kunpckbw %k0, %k1, %k0 +; CHECK-NEXT: kmovw %edi, %k0 +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: kunpckbw %k1, %k0, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: retq %res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1) @@ -6160,76 +6160,103 @@ } define i32 @test_x86_avx512_comi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) { -; CHECK-LABEL: test_x86_avx512_comi_sd_eq_sae -; CHECK: vcomisd {sae}, %xmm1, %xmm0 -; CHECK-NEXT: sete %al - %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 8) +; CHECK-LABEL: test_x86_avx512_comi_sd_eq_sae: +; CHECK: ## BB#0: +; CHECK-NEXT: vcomisd {sae}, %xmm1, %xmm0 +; CHECK-NEXT: sete %al +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: retq + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 8) ret i32 %res } define i32 @test_x86_avx512_ucomi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) { -; CHECK-LABEL: test_x86_avx512_ucomi_sd_eq_sae -; CHECK: vucomisd {sae}, %xmm1, %xmm0 -; CHECK-NEXT: sete %al - %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 8) +; CHECK-LABEL: test_x86_avx512_ucomi_sd_eq_sae: +; CHECK: ## BB#0: +; CHECK-NEXT: vucomisd {sae}, %xmm1, %xmm0 +; CHECK-NEXT: sete %al +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: retq + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 8) ret i32 %res } define i32 @test_x86_avx512_comi_sd_eq(<2 x double> %a0, <2 x double> %a1) { -; CHECK-LABEL: test_x86_avx512_comi_sd_eq -; CHECK: vcomisd %xmm1, %xmm0 -; CHECK-NEXT: sete %al - %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 4) +; CHECK-LABEL: test_x86_avx512_comi_sd_eq: +; CHECK: ## BB#0: +; CHECK-NEXT: vcomisd %xmm1, %xmm0 +; CHECK-NEXT: sete %al +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: retq + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 4) ret i32 %res } define i32 @test_x86_avx512_ucomi_sd_eq(<2 x double> %a0, <2 x double> %a1) { -; CHECK-LABEL: test_x86_avx512_ucomi_sd_eq -; CHECK: vucomisd %xmm1, %xmm0 -; CHECK-NEXT: sete %al - %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 4) +; CHECK-LABEL: test_x86_avx512_ucomi_sd_eq: +; CHECK: ## BB#0: +; CHECK-NEXT: vucomisd %xmm1, %xmm0 +; CHECK-NEXT: sete %al +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: retq + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 4) ret i32 %res } define i32 @test_x86_avx512_comi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) { -; CHECK-LABEL: test_x86_avx512_comi_sd_lt_sae -; CHECK: vcomisd {sae}, %xmm1, %xmm0 -; CHECK-NEXT: sbbl %eax, %eax - %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 8) +; CHECK-LABEL: test_x86_avx512_comi_sd_lt_sae: +; CHECK: ## BB#0: +; CHECK-NEXT: vcomisd {sae}, %xmm1, %xmm0 +; CHECK-NEXT: sbbl %eax, %eax +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: retq + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 8) ret i32 %res } define i32 @test_x86_avx512_ucomi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) { -; CHECK-LABEL: test_x86_avx512_ucomi_sd_lt_sae -; CHECK: vucomisd {sae}, %xmm1, %xmm0 -; CHECK-NEXT: sbbl %eax, %eax - %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 8) +; CHECK-LABEL: test_x86_avx512_ucomi_sd_lt_sae: +; CHECK: ## BB#0: +; CHECK-NEXT: vucomisd {sae}, %xmm1, %xmm0 +; CHECK-NEXT: sbbl %eax, %eax +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: retq + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 8) ret i32 %res } define i32 @test_x86_avx512_comi_sd_lt(<2 x double> %a0, <2 x double> %a1) { -; CHECK-LABEL: test_x86_avx512_comi_sd_lt -; CHECK: vcomisd %xmm1, %xmm0 -; CHECK-NEXT: sbbl %eax, %eax - %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 4) +; CHECK-LABEL: test_x86_avx512_comi_sd_lt: +; CHECK: ## BB#0: +; CHECK-NEXT: vcomisd %xmm1, %xmm0 +; CHECK-NEXT: sbbl %eax, %eax +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: retq + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 4) ret i32 %res } define i32 @test_x86_avx512_ucomi_sd_lt(<2 x double> %a0, <2 x double> %a1) { -; CHECK-LABEL: test_x86_avx512_ucomi_sd_lt -; CHECK: vucomisd %xmm1, %xmm0 -; CHECK-NEXT: sbbl %eax, %eax - %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 4) +; CHECK-LABEL: test_x86_avx512_ucomi_sd_lt: +; CHECK: ## BB#0: +; CHECK-NEXT: vucomisd %xmm1, %xmm0 +; CHECK-NEXT: sbbl %eax, %eax +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: retq + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 4) ret i32 %res } -declare i32 @llvm.x86.avx512.vcomi.sd(<2 x double>, <2 x double>, i32, i32) +declare i32 @llvm.x86.avx512.vcomi.sd(<2 x double>, <2 x double>, i32, i32) define i32 @test_x86_avx512_ucomi_ss_lt(<4 x float> %a0, <4 x float> %a1) { -; CHECK-LABEL: test_x86_avx512_ucomi_ss_lt -; CHECK: vucomiss %xmm1, %xmm0 -; CHECK-NEXT: sbbl %eax, %eax - %res = call i32 @llvm.x86.avx512.vcomi.ss(<4 x float> %a0, <4 x float> %a1, i32 9, i32 4) +; CHECK-LABEL: test_x86_avx512_ucomi_ss_lt: +; CHECK: ## BB#0: +; CHECK-NEXT: vucomiss %xmm1, %xmm0 +; CHECK-NEXT: sbbl %eax, %eax +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: retq + %res = call i32 @llvm.x86.avx512.vcomi.ss(<4 x float> %a0, <4 x float> %a1, i32 9, i32 4) ret i32 %res } @@ -6238,21 +6265,32 @@ define <4 x float>@test_int_x86_avx512_mask_move_ss_rrk(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_move_ss_rrk: -; CHECK: vmovss %xmm1, %xmm0, %xmm2 {%k1} +; CHECK: ## BB#0: +; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) ret <4 x float> %res } define <4 x float>@test_int_x86_avx512_mask_move_ss_rrkz(<4 x float> %x0, <4 x float> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_move_ss_rrkz: -; CHECK: vmovss %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK: ## BB#0: +; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> zeroinitializer, i8 %x2) ret <4 x float> %res } define <4 x float>@test_int_x86_avx512_mask_move_ss_rr(<4 x float> %x0, <4 x float> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_move_ss_rr: -; CHECK: vmovss %xmm1, %xmm0, %xmm0 +; CHECK: ## BB#0: +; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> zeroinitializer, i8 -1) ret <4 x float> %res } @@ -6260,21 +6298,32 @@ declare <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double>, <2 x double>, <2 x double>, i8) define <2 x double>@test_int_x86_avx512_mask_move_sd_rr(<2 x double> %x0, <2 x double> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_move_sd_rr: -; CHECK: vmovsd %xmm1, %xmm0, %xmm0 +; CHECK: ## BB#0: +; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> zeroinitializer, i8 -1) ret <2 x double> %res } define <2 x double>@test_int_x86_avx512_mask_move_sd_rrkz(<2 x double> %x0, <2 x double> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_move_sd_rrkz: -; CHECK: vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK: ## BB#0: +; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> zeroinitializer, i8 %x2) ret <2 x double> %res } define <2 x double>@test_int_x86_avx512_mask_move_sd_rrk(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_move_sd_rrk: -; CHECK: vmovsd %xmm1, %xmm0, %xmm2 {%k1} +; CHECK: ## BB#0: +; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) ret <2 x double> %res } Index: test/CodeGen/X86/avx512bw-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512bw-intrinsics.ll +++ test/CodeGen/X86/avx512bw-intrinsics.ll @@ -2736,3 +2736,53 @@ %res2 = add <8 x i64> %res, %res1 ret <8 x i64> %res2 } + +declare i32 @llvm.x86.avx512.kunpck.wd(i32, i32) + +define i32@test_int_x86_avx512_kunpck_wd(i32 %x0, i32 %x1) { +; AVX512BW-LABEL: test_int_x86_avx512_kunpck_wd: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %edi, %k0 +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: kunpckwd %k1, %k0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_wd: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: kunpckwd %k1, %k0, %k0 +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: retl + %res = call i32 @llvm.x86.avx512.kunpck.wd(i32 %x0, i32 %x1) + ret i32 %res +} + +declare i64 @llvm.x86.avx512.kunpck.dq(i64, i64) + +define i64@test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) { +; AVX512BW-LABEL: test_int_x86_avx512_kunpck_qd: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovq %rdi, %k0 +; AVX512BW-NEXT: kmovq %rsi, %k1 +; AVX512BW-NEXT: kunpckdq %k1, %k0, %k0 +; AVX512BW-NEXT: kmovq %k0, %rax +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_qd: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: subl $12, %esp +; AVX512F-32-NEXT: .Ltmp8: +; AVX512F-32-NEXT: .cfi_def_cfa_offset 16 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k0 +; AVX512F-32-NEXT: kmovq %k0, (%esp) +; AVX512F-32-NEXT: movl (%esp), %eax +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: addl $12, %esp +; AVX512F-32-NEXT: retl + %res = call i64 @llvm.x86.avx512.kunpck.dq(i64 %x0, i64 %x1) + ret i64 %res +}