Index: include/llvm/IR/IntrinsicsX86.td =================================================================== --- include/llvm/IR/IntrinsicsX86.td +++ include/llvm/IR/IntrinsicsX86.td @@ -3738,15 +3738,6 @@ def int_x86_avx512_kxnor_w : GCCBuiltin<"__builtin_ia32_kxnorhi">, Intrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty], [IntrNoMem]>; - def int_x86_avx512_kunpck_bw : GCCBuiltin<"__builtin_ia32_kunpckhi">, - Intrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty], - [IntrNoMem]>; - def int_x86_avx512_kunpck_wd : GCCBuiltin<"__builtin_ia32_kunpcksi">, - Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem]>; - def int_x86_avx512_kunpck_dq : GCCBuiltin<"__builtin_ia32_kunpckdi">, - Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], - [IntrNoMem]>; def int_x86_avx512_kortestz_w : GCCBuiltin<"__builtin_ia32_kortestzhi">, Intrinsic<[llvm_i32_ty], [llvm_i16_ty, llvm_i16_ty], [IntrNoMem]>; Index: lib/IR/AutoUpgrade.cpp =================================================================== --- lib/IR/AutoUpgrade.cpp +++ lib/IR/AutoUpgrade.cpp @@ -78,6 +78,7 @@ Name=="ssse3.pabs.d.128" || // Added in 6.0 Name.startswith("avx512.mask.shuf.i") || // Added in 6.0 Name.startswith("avx512.mask.shuf.f") || // Added in 6.0 + Name.startswith("avx512.kunpck") || // Added in 6.0 Name.startswith("avx2.pabs.") || // Added in 6.0 Name.startswith("avx512.mask.pabs.") || // Added in 6.0 Name.startswith("avx512.broadcastm") || // Added in 6.0 @@ -1065,6 +1066,12 @@ Rep = Builder.CreateVectorSplat(NumElts, CI->getArgOperand(0)); Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); + } else if (IsX86 && (Name.startswith("avx512.kunpck"))) { + uint64_t Shift = CI->getType()->getScalarSizeInBits() / 2; + // Use 1ULL: Shift is 32 for kunpck.dq, and (1 << 32) on a 32-bit int is UB + // (it folded to 0 and dropped the low half of arg0). + uint64_t And = (1ULL << Shift) - 1; + Value *LowBits = Builder.CreateAnd(CI->getArgOperand(0), And); + Value *HighBits = Builder.CreateShl(CI->getArgOperand(1), Shift); + Rep = Builder.CreateOr(LowBits, HighBits); } else if (IsX86 && (Name == "sse.add.ss" || Name
== "sse2.add.sd")) { Type *I32Ty = Type::getInt32Ty(C); Value *Elt0 = Builder.CreateExtractElement(CI->getArgOperand(0), Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -30007,6 +30007,53 @@ SDValue N0 = BitCast.getOperand(0); EVT VecVT = N0->getValueType(0); + if (VT.isVector() && VecVT.isScalarInteger() && Subtarget.hasAVX512() && + N0->getOpcode() == ISD::OR) { + SDValue Op0 = N0->getOperand(0); + SDValue Op1 = N0->getOperand(1); + MVT TruncVT; + MVT BitcastVT; + switch (VT.getSimpleVT().SimpleTy) { + default: + return SDValue(); + case MVT::v16i1: + TruncVT = MVT::i8; + BitcastVT = MVT::v8i1; + break; + case MVT::v32i1: + TruncVT = MVT::i16; + BitcastVT = MVT::v16i1; + break; + case MVT::v64i1: + TruncVT = MVT::i32; + BitcastVT = MVT::v32i1; + break; + } + bool isArg0UndefRight = Op0->getOpcode() == ISD::SHL; + bool isArg0UndefLeft = + Op0->getOpcode() == ISD::ZERO_EXTEND || Op0->getOpcode() == ISD::AND; + bool isArg1UndefRight = Op1->getOpcode() == ISD::SHL; + bool isArg1UndefLeft = + Op1->getOpcode() == ISD::ZERO_EXTEND || Op1->getOpcode() == ISD::AND; + SDValue OpLeft; + SDValue OpRight; + if (isArg0UndefRight && isArg1UndefLeft) { + OpLeft = Op0; + OpRight = Op1; + } else if (isArg1UndefRight && isArg0UndefLeft) { + OpLeft = Op1; + OpRight = Op0; + } else + return SDValue(); + SDLoc DL(BitCast); + SDValue Shr = OpLeft->getOperand(0); + SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Shr); + SDValue Bitcast1 = DAG.getBitcast(BitcastVT, Trunc1); + SDValue Trunc2 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, OpRight); + SDValue Bitcast2 = DAG.getBitcast(BitcastVT, Trunc2); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Bitcast1, Bitcast2); + } + if (!VT.isScalarInteger() || !VecVT.isSimple()) return SDValue(); Index: lib/Target/X86/X86IntrinsicsInfo.h
=================================================================== --- lib/Target/X86/X86IntrinsicsInfo.h +++ lib/Target/X86/X86IntrinsicsInfo.h @@ -479,9 +479,6 @@ X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), X86_INTRINSIC_DATA(avx512_kand_w, MASK_BINOP, ISD::AND, 0), X86_INTRINSIC_DATA(avx512_kor_w, MASK_BINOP, ISD::OR, 0), - X86_INTRINSIC_DATA(avx512_kunpck_bw, KUNPCK, ISD::CONCAT_VECTORS, 0), - X86_INTRINSIC_DATA(avx512_kunpck_dq, KUNPCK, ISD::CONCAT_VECTORS, 0), - X86_INTRINSIC_DATA(avx512_kunpck_wd, KUNPCK, ISD::CONCAT_VECTORS, 0), X86_INTRINSIC_DATA(avx512_kxor_w, MASK_BINOP, ISD::XOR, 0), X86_INTRINSIC_DATA(avx512_mask_add_pd_512, INTR_TYPE_2OP_MASK, ISD::FADD, X86ISD::FADD_RND), Index: test/CodeGen/X86/avx512-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -5,6 +5,59 @@ ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512f-builtins.c +define zeroext i16 @test_mm512_kunpackb(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) local_unnamed_addr #0 { +; X32-LABEL: test_mm512_kunpackb: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %ebp +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: .cfi_offset %ebp, -8 +; X32-NEXT: movl %esp, %ebp +; X32-NEXT: .cfi_def_cfa_register %ebp +; X32-NEXT: andl $-64, %esp +; X32-NEXT: subl $64, %esp +; X32-NEXT: vmovdqa64 136(%ebp), %zmm3 +; X32-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; X32-NEXT: vpcmpneqd 8(%ebp), %zmm2, %k1 +; X32-NEXT: kunpckbw %k0, %k1, %k1 +; X32-NEXT: vpcmpneqd 72(%ebp), %zmm3, %k0 {%k1} +; X32-NEXT: kmovw %k0, %eax +; X32-NEXT: movzwl %ax, %eax +; X32-NEXT: movl %ebp, %esp +; X32-NEXT: popl %ebp +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: test_mm512_kunpackb: +; X64: # %bb.0: # %entry +; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; X64-NEXT: 
vpcmpneqd %zmm3, %zmm2, %k1 +; X64-NEXT: kunpckbw %k0, %k1, %k1 +; X64-NEXT: vpcmpneqd %zmm5, %zmm4, %k0 {%k1} +; X64-NEXT: kmovw %k0, %eax +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__A to <16 x i32> + %1 = bitcast <8 x i64> %__B to <16 x i32> + %2 = icmp ne <16 x i32> %0, %1 + %3 = bitcast <16 x i1> %2 to i16 + %4 = bitcast <8 x i64> %__C to <16 x i32> + %5 = bitcast <8 x i64> %__D to <16 x i32> + %6 = icmp ne <16 x i32> %4, %5 + %7 = bitcast <16 x i1> %6 to i16 + %8 = and i16 %7, 255 + %shl.i = shl i16 %3, 8 + %or.i = or i16 %8, %shl.i + %9 = bitcast <8 x i64> %__E to <16 x i32> + %10 = bitcast <8 x i64> %__F to <16 x i32> + %11 = icmp ne <16 x i32> %9, %10 + %12 = bitcast i16 %or.i to <16 x i1> + %13 = and <16 x i1> %11, %12 + %14 = bitcast <16 x i1> %13 to i16 + ret i16 %14 +} + define <16 x float> @test_mm512_shuffle_f32x4(<16 x float> %__A, <16 x float> %__B) { ; X32-LABEL: test_mm512_shuffle_f32x4: ; X32: # %bb.0: # %entry Index: test/CodeGen/X86/avx512-intrinsics-upgrade.ll =================================================================== --- test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -1,7 +1,21 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s - define <16 x i32>@test_int_x86_avx512_mask_pbroadcastd_gpr_512(i32 %x0, <16 x i32> %x1, i16 %mask) { +declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone + +define i16 @unpckbw_test(i16 %a0, i16 %a1) { +; CHECK-LABEL: unpckbw_test: +; CHECK: ## %bb.0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: shll $8, %esi +; CHECK-NEXT: orl %esi, %eax +; CHECK-NEXT: ## kill: %ax %ax %eax +; CHECK-NEXT: retq + %res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1) + ret i16 %res +} + +define <16 x i32>@test_int_x86_avx512_mask_pbroadcastd_gpr_512(i32 %x0, <16 x i32> %x1, i16 %mask) { ; 
CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastd_gpr_512: ; CHECK: ## %bb.0: ; CHECK-NEXT: vpbroadcastd %edi, %zmm1 Index: test/CodeGen/X86/avx512-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512-intrinsics.ll +++ test/CodeGen/X86/avx512-intrinsics.ll @@ -96,21 +96,6 @@ ret i16 %t2 } -declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone - -define i16 @unpckbw_test(i16 %a0, i16 %a1) { -; CHECK-LABEL: unpckbw_test: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovw %edi, %k0 -; CHECK-NEXT: kmovw %esi, %k1 -; CHECK-NEXT: kunpckbw %k1, %k0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: ## kill: %ax %ax %eax -; CHECK-NEXT: retq - %res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1) - ret i16 %res -} - declare i16 @llvm.x86.avx512.kxnor.w(i16, i16) nounwind readnone ; TODO: the two kxnor instructions here a no op and should be elimintaed, ; probably by FoldConstantArithmetic in SelectionDAG. Index: test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll +++ test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll @@ -4,6 +4,117 @@ ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512bw-builtins.c +define i64 @test_mm512_kunpackd(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) { +; X32-LABEL: test_mm512_kunpackd: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %ebp +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: .cfi_offset %ebp, -8 +; X32-NEXT: movl %esp, %ebp +; X32-NEXT: .cfi_def_cfa_register %ebp +; X32-NEXT: andl $-64, %esp +; X32-NEXT: subl $64, %esp +; X32-NEXT: vmovdqa64 136(%ebp), %zmm3 +; X32-NEXT: vmovdqa64 72(%ebp), %zmm4 +; X32-NEXT: vmovdqa64 8(%ebp), %zmm5 +; X32-NEXT: vpcmpneqb %zmm0, %zmm1, %k0 +; X32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; X32-NEXT: vpcmpneqb %zmm5, %zmm2, %k0 +; X32-NEXT: kmovq 
%k0, {{[0-9]+}}(%esp) +; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 +; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; X32-NEXT: kunpckdq %k0, %k1, %k1 +; X32-NEXT: vpcmpneqb %zmm3, %zmm4, %k0 {%k1} +; X32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movl %ebp, %esp +; X32-NEXT: popl %ebp +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: test_mm512_kunpackd: +; X64: # %bb.0: # %entry +; X64-NEXT: vpcmpneqb %zmm0, %zmm1, %k0 +; X64-NEXT: vpcmpneqb %zmm3, %zmm2, %k1 +; X64-NEXT: kunpckdq %k0, %k1, %k1 +; X64-NEXT: vpcmpneqb %zmm5, %zmm4, %k0 {%k1} +; X64-NEXT: kmovq %k0, %rax +; X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__B to <64 x i8> + %1 = bitcast <8 x i64> %__A to <64 x i8> + %2 = icmp ne <64 x i8> %0, %1 + %3 = bitcast <64 x i1> %2 to i64 + %4 = bitcast <8 x i64> %__C to <64 x i8> + %5 = bitcast <8 x i64> %__D to <64 x i8> + %6 = icmp ne <64 x i8> %4, %5 + %7 = bitcast <64 x i1> %6 to i64 + %and.i = and i64 %7, 4294967295 + %shl.i = shl i64 %3, 32 + %or.i = or i64 %and.i, %shl.i + %8 = bitcast <8 x i64> %__E to <64 x i8> + %9 = bitcast <8 x i64> %__F to <64 x i8> + %10 = icmp ne <64 x i8> %8, %9 + %11 = bitcast i64 %or.i to <64 x i1> + %12 = and <64 x i1> %10, %11 + %13 = bitcast <64 x i1> %12 to i64 + ret i64 %13 +} + +define i32 @test_mm512_kunpackw(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) { +; X32-LABEL: test_mm512_kunpackw: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %ebp +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: .cfi_offset %ebp, -8 +; X32-NEXT: movl %esp, %ebp +; X32-NEXT: .cfi_def_cfa_register %ebp +; X32-NEXT: andl $-64, %esp +; X32-NEXT: subl $64, %esp +; X32-NEXT: vmovdqa64 136(%ebp), %zmm3 +; X32-NEXT: vpcmpneqw %zmm0, %zmm1, %k0 +; X32-NEXT: vpcmpneqw 8(%ebp), %zmm2, %k1 +; X32-NEXT: kunpckwd %k0, %k1, %k1 +; X32-NEXT: vpcmpneqw 72(%ebp), %zmm3, %k0 {%k1} +; X32-NEXT: kmovd %k0, 
%eax +; X32-NEXT: movl %ebp, %esp +; X32-NEXT: popl %ebp +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: test_mm512_kunpackw: +; X64: # %bb.0: # %entry +; X64-NEXT: vpcmpneqw %zmm0, %zmm1, %k0 +; X64-NEXT: vpcmpneqw %zmm3, %zmm2, %k1 +; X64-NEXT: kunpckwd %k0, %k1, %k1 +; X64-NEXT: vpcmpneqw %zmm5, %zmm4, %k0 {%k1} +; X64-NEXT: kmovd %k0, %eax +; X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__B to <32 x i16> + %1 = bitcast <8 x i64> %__A to <32 x i16> + %2 = icmp ne <32 x i16> %0, %1 + %3 = bitcast <32 x i1> %2 to i32 + %4 = bitcast <8 x i64> %__C to <32 x i16> + %5 = bitcast <8 x i64> %__D to <32 x i16> + %6 = icmp ne <32 x i16> %4, %5 + %7 = bitcast <32 x i1> %6 to i32 + %and.i = and i32 %7, 65535 + %shl.i = shl i32 %3, 16 + %or.i = or i32 %and.i, %shl.i + %8 = bitcast <8 x i64> %__E to <32 x i16> + %9 = bitcast <8 x i64> %__F to <32 x i16> + %10 = icmp ne <32 x i16> %8, %9 + %11 = bitcast i32 %or.i to <32 x i1> + %12 = and <32 x i1> %10, %11 + %13 = bitcast <32 x i1> %12 to i32 + ret i32 %13 +} + + define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext %__A) { ; X32-LABEL: test_mm512_mask_set1_epi8: ; X32: # %bb.0: # %entry @@ -694,13 +805,13 @@ ; X32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; X32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] ; X32-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 -; X32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; X32-NEXT: vpmovb2m %zmm0, %k0 -; X32-NEXT: vpmovm2b %k0, %zmm0 -; X32-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; X32-NEXT: movl %eax, %ecx ; X32-NEXT: shrl $30, %ecx ; X32-NEXT: kmovd %ecx, %k0 +; X32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; X32-NEXT: vpmovm2b %k0, %zmm2 ; X32-NEXT: vpbroadcastw %xmm2, %xmm2 ; X32-NEXT: vinserti128 $1, 
%xmm2, %ymm0, %ymm2 @@ -1422,13 +1533,13 @@ ; X32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; X32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] ; X32-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 -; X32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; X32-NEXT: vpmovb2m %zmm0, %k0 -; X32-NEXT: vpmovm2b %k0, %zmm0 -; X32-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; X32-NEXT: movl %eax, %ecx ; X32-NEXT: shrl $30, %ecx ; X32-NEXT: kmovd %ecx, %k0 +; X32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; X32-NEXT: vpmovb2m %zmm0, %k1 +; X32-NEXT: vpmovm2b %k1, %zmm0 +; X32-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; X32-NEXT: vpmovm2b %k0, %zmm2 ; X32-NEXT: vpbroadcastw %xmm2, %xmm2 ; X32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 Index: test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll =================================================================== --- test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -2,6 +2,45 @@ ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW ; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32 +declare i32 @llvm.x86.avx512.kunpck.wd(i32, i32) + +define i32@test_int_x86_avx512_kunpck_wd(i32 %x0, i32 %x1) { +; AVX512BW-LABEL: test_int_x86_avx512_kunpck_wd: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: movzwl %di, %eax +; AVX512BW-NEXT: shll $16, %esi +; AVX512BW-NEXT: orl %esi, %eax +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_wd: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: shll $16, %eax +; AVX512F-32-NEXT: orl %ecx, %eax +; AVX512F-32-NEXT: retl + %res = call i32 @llvm.x86.avx512.kunpck.wd(i32 %x0, i32 %x1) + ret i32 %res +} + 
+declare i64 @llvm.x86.avx512.kunpck.dq(i64, i64) + +define i64@test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) { +; AVX512BW-LABEL: test_int_x86_avx512_kunpck_qd: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: movl %edi, %eax +; AVX512BW-NEXT: shlq $32, %rsi +; AVX512BW-NEXT: orq %rsi, %rax +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_qd: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: retl + %res = call i64 @llvm.x86.avx512.kunpck.dq(i64 %x0, i64 %x1) + ret i64 %res +} + declare <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8, <64 x i8>, i64) define <64 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_512(i8 %x0, <64 x i8> %x1, i64 %mask) { Index: test/CodeGen/X86/avx512bw-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512bw-intrinsics.ll +++ test/CodeGen/X86/avx512bw-intrinsics.ll @@ -1455,55 +1455,6 @@ ret <8 x i64> %res2 } -declare i32 @llvm.x86.avx512.kunpck.wd(i32, i32) - -define i32@test_int_x86_avx512_kunpck_wd(i32 %x0, i32 %x1) { -; AVX512BW-LABEL: test_int_x86_avx512_kunpck_wd: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %edi, %k0 -; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: kunpckwd %k1, %k0, %k0 -; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_wd: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: kmovw {{[0-9]+}}(%esp), %k0 -; AVX512F-32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: kunpckwd %k0, %k1, %k0 -; AVX512F-32-NEXT: kmovd %k0, %eax -; AVX512F-32-NEXT: retl - %res = call i32 @llvm.x86.avx512.kunpck.wd(i32 %x0, i32 %x1) - ret i32 %res -} - -declare i64 @llvm.x86.avx512.kunpck.dq(i64, i64) - -define i64@test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) { -; AVX512BW-LABEL: test_int_x86_avx512_kunpck_qd: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovq %rdi, %k0 -; AVX512BW-NEXT: kmovq %rsi, %k1 -; AVX512BW-NEXT: kunpckdq %k1, %k0, %k0 -;
AVX512BW-NEXT: kmovq %k0, %rax -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_qd: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: subl $12, %esp -; AVX512F-32-NEXT: .cfi_def_cfa_offset 16 -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k0 -; AVX512F-32-NEXT: kmovq %k0, (%esp) -; AVX512F-32-NEXT: movl (%esp), %eax -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: addl $12, %esp -; AVX512F-32-NEXT: retl - %res = call i64 @llvm.x86.avx512.kunpck.dq(i64 %x0, i64 %x1) - ret i64 %res -} - declare i64 @llvm.x86.avx512.cvtb2mask.512(<64 x i8>) define i64@test_int_x86_avx512_cvtb2mask_512(<64 x i8> %x0) {