Index: lib/IR/AutoUpgrade.cpp
===================================================================
--- lib/IR/AutoUpgrade.cpp
+++ lib/IR/AutoUpgrade.cpp
@@ -1070,11 +1070,23 @@
       Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
                           CI->getArgOperand(1));
     } else if (IsX86 && (Name.startswith("avx512.kunpck"))) {
-      uint64_t Shift = CI->getType()->getScalarSizeInBits() / 2;
-      uint64_t And = (1ULL << Shift) - 1;
-      Value* LowBits = Builder.CreateAnd(CI->getArgOperand(0), And);
-      Value* HighBits = Builder.CreateShl(CI->getArgOperand(1), Shift);
-      Rep = Builder.CreateOr(LowBits, HighBits);
+      unsigned NumElts = CI->getType()->getScalarSizeInBits();
+      Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), NumElts);
+      Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), NumElts);
+      uint32_t Indices[64];
+      for (unsigned i = 0; i != NumElts; ++i)
+        Indices[i] = i;
+
+      // First extract half of each vector. This gives better codegen than
+      // doing it in a single shuffle.
+      LHS = Builder.CreateShuffleVector(LHS, LHS,
+                                        makeArrayRef(Indices, NumElts / 2));
+      RHS = Builder.CreateShuffleVector(RHS, RHS,
+                                        makeArrayRef(Indices, NumElts / 2));
+      // Concat the vectors.
+      Rep = Builder.CreateShuffleVector(LHS, RHS,
+                                        makeArrayRef(Indices, NumElts));
+      Rep = Builder.CreateBitCast(Rep, CI->getType());
     } else if (IsX86 && (Name == "sse.add.ss" || Name == "sse2.add.sd")) {
       Type *I32Ty = Type::getInt32Ty(C);
       Value *Elt0 = Builder.CreateExtractElement(CI->getArgOperand(0),
Index: test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
===================================================================
--- test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -38,24 +38,21 @@
 ; X64-NEXT: vzeroupper
 ; X64-NEXT: retq
 entry:
-  %0 = bitcast <8 x i64> %__A to <16 x i32>
-  %1 = bitcast <8 x i64> %__B to <16 x i32>
-  %2 = icmp ne <16 x i32> %0, %1
-  %3 = bitcast <16 x i1> %2 to i16
-  %4 = bitcast <8 x i64> %__C to <16 x i32>
-  %5 = bitcast <8 x i64> %__D to <16 x i32>
-  %6 = icmp ne <16 x i32> %4, %5
-  %7 = bitcast <16 x i1> %6 to i16
-  %8 = and i16 %7, 255
-  %shl.i = shl i16 %3, 8
-  %or.i = or i16 %8, %shl.i
-  %9 = bitcast <8 x i64> %__E to <16 x i32>
-  %10 = bitcast <8 x i64> %__F to <16 x i32>
-  %11 = icmp ne <16 x i32> %9, %10
-  %12 = bitcast i16 %or.i to <16 x i1>
-  %13 = and <16 x i1> %11, %12
-  %14 = bitcast <16 x i1> %13 to i16
-  ret i16 %14
+  %0 = bitcast <8 x i64> %__E to <16 x i32>
+  %1 = bitcast <8 x i64> %__F to <16 x i32>
+  %2 = bitcast <8 x i64> %__A to <16 x i32>
+  %3 = bitcast <8 x i64> %__B to <16 x i32>
+  %4 = icmp ne <16 x i32> %2, %3
+  %5 = bitcast <8 x i64> %__C to <16 x i32>
+  %6 = bitcast <8 x i64> %__D to <16 x i32>
+  %7 = icmp ne <16 x i32> %5, %6
+  %8 = shufflevector <16 x i1> %4, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %9 = shufflevector <16 x i1> %7, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %10 = shufflevector <8 x i1> %8, <8 x i1> %9, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %11 = icmp ne <16 x i32> %0, %1
+  %12 = and <16 x i1> %11, %10
+  %13 = bitcast <16 x i1> %12 to i16
+  ret i16 %13
 }
 
 define <16 x float> @test_mm512_shuffle_f32x4(<16 x float> %__A, <16 x float> %__B) {
Index: test/CodeGen/X86/avx512-intrinsics-upgrade.ll
===================================================================
--- test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -6,9 +6,10 @@
 define i16 @unpckbw_test(i16 %a0, i16 %a1) {
 ; CHECK-LABEL: unpckbw_test:
 ; CHECK: ## %bb.0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: shll $8, %esi
-; CHECK-NEXT: orl %esi, %eax
+; CHECK-NEXT: kmovw %edi, %k0
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: kunpckbw %k0, %k1, %k0
+; CHECK-NEXT: kmovw %k0, %eax
 ; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
 ; CHECK-NEXT: retq
   %res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1)
Index: test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll
===================================================================
--- test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll
+++ test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll
@@ -15,16 +15,10 @@
 ; X32-NEXT: andl $-64, %esp
 ; X32-NEXT: subl $64, %esp
 ; X32-NEXT: vmovdqa64 136(%ebp), %zmm3
-; X32-NEXT: vmovdqa64 72(%ebp), %zmm4
-; X32-NEXT: vmovdqa64 8(%ebp), %zmm5
 ; X32-NEXT: vpcmpneqb %zmm0, %zmm1, %k0
-; X32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
-; X32-NEXT: vpcmpneqb %zmm5, %zmm2, %k0
-; X32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
-; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
-; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; X32-NEXT: vpcmpneqb 8(%ebp), %zmm2, %k1
 ; X32-NEXT: kunpckdq %k0, %k1, %k1
-; X32-NEXT: vpcmpneqb %zmm3, %zmm4, %k0 {%k1}
+; X32-NEXT: vpcmpneqb 72(%ebp), %zmm3, %k0 {%k1}
 ; X32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -43,22 +37,19 @@
 ; X64-NEXT: vzeroupper
 ; X64-NEXT: retq
 entry:
-  %0 = bitcast <8 x i64> %__B to <64 x i8>
-  %1 = bitcast <8 x i64> %__A to <64 x i8>
-  %2 = icmp ne <64 x i8> %0, %1
-  %3 = bitcast <64 x i1> %2 to i64
-  %4 = bitcast <8 x i64> %__C to <64 x i8>
-  %5 = bitcast <8 x i64> %__D to <64 x i8>
-  %6 = icmp ne <64 x i8> %4, %5
-  %7 = bitcast <64 x i1> %6 to i64
-  %and.i = and i64 %7, 4294967295
-  %shl.i = shl i64 %3, 32
-  %or.i = or i64 %and.i, %shl.i
-  %8 = bitcast <8 x i64> %__E to <64 x i8>
-  %9 = bitcast <8 x i64> %__F to <64 x i8>
-  %10 = icmp ne <64 x i8> %8, %9
-  %11 = bitcast i64 %or.i to <64 x i1>
-  %12 = and <64 x i1> %10, %11
+  %0 = bitcast <8 x i64> %__E to <64 x i8>
+  %1 = bitcast <8 x i64> %__F to <64 x i8>
+  %2 = bitcast <8 x i64> %__B to <64 x i8>
+  %3 = bitcast <8 x i64> %__A to <64 x i8>
+  %4 = icmp ne <64 x i8> %2, %3
+  %5 = bitcast <8 x i64> %__C to <64 x i8>
+  %6 = bitcast <8 x i64> %__D to <64 x i8>
+  %7 = icmp ne <64 x i8> %5, %6
+  %8 = shufflevector <64 x i1> %4, <64 x i1> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %9 = shufflevector <64 x i1> %7, <64 x i1> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %10 = shufflevector <32 x i1> %8, <32 x i1> %9, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+  %11 = icmp ne <64 x i8> %0, %1
+  %12 = and <64 x i1> %11, %10
   %13 = bitcast <64 x i1> %12 to i64
   ret i64 %13
 }
@@ -94,22 +85,19 @@
 ; X64-NEXT: vzeroupper
 ; X64-NEXT: retq
 entry:
-  %0 = bitcast <8 x i64> %__B to <32 x i16>
-  %1 = bitcast <8 x i64> %__A to <32 x i16>
-  %2 = icmp ne <32 x i16> %0, %1
-  %3 = bitcast <32 x i1> %2 to i32
-  %4 = bitcast <8 x i64> %__C to <32 x i16>
-  %5 = bitcast <8 x i64> %__D to <32 x i16>
-  %6 = icmp ne <32 x i16> %4, %5
-  %7 = bitcast <32 x i1> %6 to i32
-  %and.i = and i32 %7, 65535
-  %shl.i = shl i32 %3, 16
-  %or.i = or i32 %and.i, %shl.i
-  %8 = bitcast <8 x i64> %__E to <32 x i16>
-  %9 = bitcast <8 x i64> %__F to <32 x i16>
-  %10 = icmp ne <32 x i16> %8, %9
-  %11 = bitcast i32 %or.i to <32 x i1>
-  %12 = and <32 x i1> %10, %11
+  %0 = bitcast <8 x i64> %__E to <32 x i16>
+  %1 = bitcast <8 x i64> %__F to <32 x i16>
+  %2 = bitcast <8 x i64> %__B to <32 x i16>
+  %3 = bitcast <8 x i64> %__A to <32 x i16>
+  %4 = icmp ne <32 x i16> %2, %3
+  %5 = bitcast <8 x i64> %__C to <32 x i16>
+  %6 = bitcast <8 x i64> %__D to <32 x i16>
+  %7 = icmp ne <32 x i16> %5, %6
+  %8 = shufflevector <32 x i1> %4, <32 x i1> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %9 = shufflevector <32 x i1> %7, <32 x i1> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %10 = shufflevector <16 x i1> %8, <16 x i1> %9, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %11 = icmp ne <32 x i16> %0, %1
+  %12 = and <32 x i1> %11, %10
   %13 = bitcast <32 x i1> %12 to i32
   ret i32 %13
 }
Index: test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
===================================================================
--- test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
+++ test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
@@ -7,17 +7,18 @@
 define i32@test_int_x86_avx512_kunpck_wd(i32 %x0, i32 %x1) {
 ; AVX512BW-LABEL: test_int_x86_avx512_kunpck_wd:
 ; AVX512BW: ## %bb.0:
-; AVX512BW-NEXT: movzwl %di, %eax
-; AVX512BW-NEXT: shll $16, %esi
-; AVX512BW-NEXT: orl %esi, %eax
+; AVX512BW-NEXT: kmovd %edi, %k0
+; AVX512BW-NEXT: kmovd %esi, %k1
+; AVX512BW-NEXT: kunpckwd %k0, %k1, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_wd:
 ; AVX512F-32: # %bb.0:
-; AVX512F-32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
-; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: shll $16, %eax
-; AVX512F-32-NEXT: orl %ecx, %eax
+; AVX512F-32-NEXT: kmovw {{[0-9]+}}(%esp), %k0
+; AVX512F-32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: kunpckwd %k0, %k1, %k0
+; AVX512F-32-NEXT: kmovd %k0, %eax
 ; AVX512F-32-NEXT: retl
   %res = call i32 @llvm.x86.avx512.kunpck.wd(i32 %x0, i32 %x1)
   ret i32 %res
@@ -28,15 +29,23 @@
 define i64@test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) {
 ; AVX512BW-LABEL: test_int_x86_avx512_kunpck_qd:
 ; AVX512BW: ## %bb.0:
-; AVX512BW-NEXT: movl %edi, %eax
-; AVX512BW-NEXT: shlq $32, %rsi
-; AVX512BW-NEXT: orq %rsi, %rax
+; AVX512BW-NEXT: kmovq %rdi, %k0
+; AVX512BW-NEXT: kmovq %rsi, %k1
+; AVX512BW-NEXT: kunpckdq %k0, %k1, %k0
+; AVX512BW-NEXT: kmovq %k0, %rax
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_qd:
 ; AVX512F-32: # %bb.0:
-; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: subl $12, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k0
+; AVX512F-32-NEXT: kmovq %k0, (%esp)
+; AVX512F-32-NEXT: movl (%esp), %eax
 ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: addl $12, %esp
 ; AVX512F-32-NEXT: retl
   %res = call i64 @llvm.x86.avx512.kunpck.dq(i64 %x0, i64 %x1)
   ret i64 %res
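
Note: with this patch, a call to one of the kunpck intrinsics is auto-upgraded to mask-vector shuffles instead of scalar shift/and/or arithmetic. As a minimal sketch of the upgraded form (the function and value names here are illustrative only, not part of the patch), i16 @llvm.x86.avx512.kunpck.bw(i16 %a, i16 %b) becomes IR equivalent to:

  define i16 @kunpck_bw_upgraded(i16 %a, i16 %b) {
    ; Reinterpret each scalar mask as a vector of 16 bits.
    %av = bitcast i16 %a to <16 x i1>
    %bv = bitcast i16 %b to <16 x i1>
    ; Extract the low half of each mask vector.
    %alo = shufflevector <16 x i1> %av, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
    %blo = shufflevector <16 x i1> %bv, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
    ; Concatenate the halves: %a's low byte lands in bits 0-7, %b's in bits 8-15,
    ; matching the old (arg0 & 0xff) | (arg1 << 8) scalar upgrade.
    %cat = shufflevector <8 x i1> %alo, <8 x i1> %blo, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    %res = bitcast <16 x i1> %cat to i16
    ret i16 %res
  }

Keeping the masks in vector form is what lets isel select kunpckbw/kunpckwd/kunpckdq in the updated CHECK lines above, rather than the old movz/shl/or sequences.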