Index: include/llvm/IR/IntrinsicsX86.td
===================================================================
--- include/llvm/IR/IntrinsicsX86.td
+++ include/llvm/IR/IntrinsicsX86.td
@@ -1986,57 +1986,6 @@
 // Vector load with broadcast
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_avx512_mask_pbroadcast_b_gpr_128 :
-         GCCBuiltin<"__builtin_ia32_pbroadcastb128_gpr_mask">,
-         Intrinsic<[llvm_v16i8_ty],
-         [llvm_i8_ty, llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_pbroadcast_b_gpr_256 :
-         GCCBuiltin<"__builtin_ia32_pbroadcastb256_gpr_mask">,
-         Intrinsic<[llvm_v32i8_ty],
-         [llvm_i8_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_pbroadcast_b_gpr_512 :
-         GCCBuiltin<"__builtin_ia32_pbroadcastb512_gpr_mask">,
-         Intrinsic<[llvm_v64i8_ty],
-         [llvm_i8_ty, llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>;
-
-  def int_x86_avx512_mask_pbroadcast_w_gpr_128 :
-         GCCBuiltin<"__builtin_ia32_pbroadcastw128_gpr_mask">,
-         Intrinsic<[llvm_v8i16_ty],
-         [llvm_i16_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_pbroadcast_w_gpr_256 :
-         GCCBuiltin<"__builtin_ia32_pbroadcastw256_gpr_mask">,
-         Intrinsic<[llvm_v16i16_ty],
-         [llvm_i16_ty, llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_pbroadcast_w_gpr_512 :
-         GCCBuiltin<"__builtin_ia32_pbroadcastw512_gpr_mask">,
-         Intrinsic<[llvm_v32i16_ty],
-         [llvm_i16_ty, llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
-
-  def int_x86_avx512_mask_pbroadcast_d_gpr_128 :
-         GCCBuiltin<"__builtin_ia32_pbroadcastd128_gpr_mask">,
-         Intrinsic<[llvm_v4i32_ty],
-         [llvm_i32_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_pbroadcast_d_gpr_256 :
-         GCCBuiltin<"__builtin_ia32_pbroadcastd256_gpr_mask">,
-         Intrinsic<[llvm_v8i32_ty],
-         [llvm_i32_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_pbroadcast_d_gpr_512 :
-         GCCBuiltin<"__builtin_ia32_pbroadcastd512_gpr_mask">,
-         Intrinsic<[llvm_v16i32_ty],
-         [llvm_i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
-
-  def int_x86_avx512_mask_pbroadcast_q_gpr_128 :
-         GCCBuiltin<"__builtin_ia32_pbroadcastq128_gpr_mask">,
-         Intrinsic<[llvm_v2i64_ty],
-         [llvm_i64_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_pbroadcast_q_gpr_256 :
-         GCCBuiltin<"__builtin_ia32_pbroadcastq256_gpr_mask">,
-         Intrinsic<[llvm_v4i64_ty],
-         [llvm_i64_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_pbroadcast_q_gpr_512 :
-         GCCBuiltin<"__builtin_ia32_pbroadcastq512_gpr_mask">,
-         Intrinsic<[llvm_v8i64_ty],
-         [llvm_i64_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_pbroadcast_q_mem_512 :
          GCCBuiltin<"__builtin_ia32_pbroadcastq512_mem_mask">,
Index: lib/IR/AutoUpgrade.cpp
===================================================================
--- lib/IR/AutoUpgrade.cpp
+++ lib/IR/AutoUpgrade.cpp
@@ -72,7 +72,11 @@
   // like to use this information to remove upgrade code for some older
   // intrinsics. It is currently undecided how we will determine that future
   // point.
-  if (Name.startswith("sse2.pcmpeq.") || // Added in 3.1
+  if (Name.startswith("avx512.mask.pbroadcast.b.gpr") || // Added in 5.0
+      Name.startswith("avx512.mask.pbroadcast.w.gpr") || // Added in 5.0
+      Name.startswith("avx512.mask.pbroadcast.d.gpr") || // Added in 5.0
+      Name.startswith("avx512.mask.pbroadcast.q.gpr") || // Added in 5.0
+      Name.startswith("sse2.pcmpeq.") || // Added in 3.1
       Name.startswith("sse2.pcmpgt.") || // Added in 3.1
       Name.startswith("avx2.pcmpeq.") || // Added in 3.1
       Name.startswith("avx2.pcmpgt.") || // Added in 3.1
@@ -1007,6 +1011,17 @@
     Rep = Builder.CreateICmp(CmpEq ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_SGT,
                              CI->getArgOperand(0), CI->getArgOperand(1));
     Rep = Builder.CreateSExt(Rep, CI->getType(), "");
+  } else if (IsX86 && (Name.startswith("avx512.mask.pbroadcast.b.gpr") ||
+                       Name.startswith("avx512.mask.pbroadcast.w.gpr") ||
+                       Name.startswith("avx512.mask.pbroadcast.d.gpr") ||
+                       Name.startswith("avx512.mask.pbroadcast.q.gpr"))) {
+    // Replace the broadcast-from-GPR intrinsic with a plain vector splat of
+    // the scalar operand, then blend with the passthru operand via the mask.
+    unsigned NumElts =
+        CI->getArgOperand(1)->getType()->getVectorNumElements();
+    Rep = Builder.CreateVectorSplat(NumElts, CI->getArgOperand(0));
+    Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
+                        CI->getArgOperand(1));
   } else if (IsX86 && (Name == "sse.add.ss" || Name == "sse2.add.sd")) {
     Type *I32Ty = Type::getInt32Ty(C);
     Value *Elt0 = Builder.CreateExtractElement(CI->getArgOperand(0),
Index: lib/Target/X86/X86IntrinsicsInfo.h
===================================================================
--- lib/Target/X86/X86IntrinsicsInfo.h
+++ lib/Target/X86/X86IntrinsicsInfo.h
@@ -824,30 +824,6 @@
   X86_INTRINSIC_DATA(avx512_mask_pavg_w_128, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
   X86_INTRINSIC_DATA(avx512_mask_pavg_w_256, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
   X86_INTRINSIC_DATA(avx512_mask_pavg_w_512, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_b_gpr_128, INTR_TYPE_1OP_MASK,
-                     X86ISD::VBROADCAST, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_b_gpr_256, INTR_TYPE_1OP_MASK,
-                     X86ISD::VBROADCAST, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_b_gpr_512, INTR_TYPE_1OP_MASK,
-                     X86ISD::VBROADCAST, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_d_gpr_128, INTR_TYPE_1OP_MASK,
-                     X86ISD::VBROADCAST, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_d_gpr_256, INTR_TYPE_1OP_MASK,
-                     X86ISD::VBROADCAST, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_d_gpr_512, INTR_TYPE_1OP_MASK,
-                     X86ISD::VBROADCAST, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_q_gpr_128, INTR_TYPE_1OP_MASK,
-                     X86ISD::VBROADCAST, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_q_gpr_256, INTR_TYPE_1OP_MASK,
-                     X86ISD::VBROADCAST, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_q_gpr_512, INTR_TYPE_1OP_MASK,
-                     X86ISD::VBROADCAST, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_w_gpr_128, INTR_TYPE_1OP_MASK,
-                     X86ISD::VBROADCAST, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_w_gpr_256, INTR_TYPE_1OP_MASK,
-                     X86ISD::VBROADCAST, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_w_gpr_512, INTR_TYPE_1OP_MASK,
-                     X86ISD::VBROADCAST, 0),
   X86_INTRINSIC_DATA(avx512_mask_permvar_df_256, VPERM_2OP_MASK,
                      X86ISD::VPERMV, 0),
   X86_INTRINSIC_DATA(avx512_mask_permvar_df_512,
Index: test/CodeGen/X86/avx512-intrinsics-upgrade.ll
===================================================================
--- test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -1,6 +1,46 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+define <16 x i32>@test_int_x86_avx512_mask_pbroadcastd_gpr_512(i32 %x0, <16 x i32> %x1, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastd_gpr_512:
+; CHECK: ## BB#0:
+; CHECK: vpbroadcastd %edi, {{%zmm[0-9]}}
+; CHECK: kmovw %esi, %k1
+; CHECK: vpbroadcastd %edi, %zmm0 {%k1}
+; CHECK: vpaddd %zmm0, {{%zmm[0-9]}}, %zmm0
+; CHECK: vpbroadcastd %edi, %zmm1 {%k1} {z}
+; CHECK: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK: retq
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 -1)
+  %res1 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 %mask)
+  %res2 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> zeroinitializer, i16 %mask)
+  %res3 = add <16 x i32> %res, %res1
+  %res4 = add <16 x i32> %res2, %res3
+  ret <16 x i32> %res4
+}
+declare <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32, <16 x i32>, i16)
+
+
+define <8 x i64>@test_int_x86_avx512_mask_pbroadcastq_gpr_512(i64 %x0, <8 x i64> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastq_gpr_512:
+; CHECK: ## BB#0:
+; CHECK: vpbroadcastq %rdi, {{%zmm[0-9]}}
+; CHECK: kmovw %esi, %k1
+; CHECK: vpbroadcastq %rdi, %zmm0 {%k1}
+; CHECK: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK: vpbroadcastq %rdi, %zmm1 {%k1} {z}
+; CHECK: vpaddq %zmm0, {{%zmm[0-9]}}, %zmm0
+; CHECK: retq
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1, i8 -1)
+  %res1 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1, i8 %mask)
+  %res2 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> zeroinitializer, i8 %mask)
+  %res3 = add <8 x i64> %res, %res1
+  %res4 = add <8 x i64> %res2, %res3
+  ret <8 x i64> %res4
+}
+declare <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64, <8 x i64>, i8)
+
 
 declare <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float>, <16 x float>, i16) nounwind readonly
 
 define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float> %a1, i16 %mask ) {
Index: test/CodeGen/X86/avx512-intrinsics.ll
===================================================================
--- test/CodeGen/X86/avx512-intrinsics.ll
+++ test/CodeGen/X86/avx512-intrinsics.ll
@@ -4203,44 +4203,8 @@
   ret i8 %res2
 }
 
-define <16 x i32>@test_int_x86_avx512_mask_pbroadcastd_gpr_512(i32 %x0, <16 x i32> %x1, i16 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastd_gpr_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vpbroadcastd %edi, %zmm1 {%k1} {z}
-; CHECK-NEXT: vpbroadcastd %edi, %zmm0 {%k1}
-; CHECK-NEXT: vpbroadcastd %edi, %zmm2
-; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 -1)
-  %res1 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 %mask)
-  %res2 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> zeroinitializer, i16 %mask)
-  %res3 = add <16 x i32> %res, %res1
-  %res4 = add <16 x i32> %res2, %res3
-  ret <16 x i32> %res4
-}
-declare <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32, <16 x i32>, i16)
-
-define <8 x i64>@test_int_x86_avx512_mask_pbroadcastq_gpr_512(i64 %x0, <8 x i64> %x1, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastq_gpr_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vpbroadcastq %rdi, %zmm1 {%k1} {z}
-; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 {%k1}
-; CHECK-NEXT: vpbroadcastq %rdi, %zmm2
-; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
-  %res = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 -1)
-  %res1 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 %mask)
-  %res2 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> zeroinitializer,i8 %mask)
-  %res3 = add <8 x i64> %res, %res1
-  %res4 = add <8 x i64> %res2, %res3
-  ret <8 x i64> %res4
-}
-declare <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64, <8 x i64>, i8)
 
 
 declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
 
Index: test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
===================================================================
--- test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
+++ test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
@@ -2,6 +2,68 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
 ; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32
+
+declare <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8, <64 x i8>, i64)
+
+define <64 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_512(i8 %x0, <64 x i8> %x1, i64 %mask) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512:
+; AVX512BW: ## BB#0:
+; AVX512BW: vpbroadcastb %edi, {{%zmm[0-9]}}
+; AVX512BW: kmovq %rsi, %k1
+; AVX512BW: vpbroadcastb %edi, %zmm0 {%k1}
+; AVX512BW: vpaddb %zmm0, {{%zmm[0-9]}}, %zmm0
+; AVX512BW: vpbroadcastb %edi, %zmm1 {%k1} {z}
+; AVX512BW: vpaddb %zmm0, %zmm1, %zmm0
+; AVX512BW: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32: movb {{[0-9]+}}(%esp), %al
+; AVX512F-32: vpbroadcastb %eax, {{%zmm[0-9]}}
+; AVX512F-32: kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32: vpbroadcastb %eax, %zmm0 {%k1}
+; AVX512F-32: vpaddb %zmm0, {{%zmm[0-9]}}, %zmm0
+; AVX512F-32: vpbroadcastb %eax, %zmm1 {%k1} {z}
+; AVX512F-32: vpaddb %zmm0, %zmm1, %zmm0
+; AVX512F-32: retl
+  %res = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> %x1, i64 -1)
+  %res1 = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> %x1, i64 %mask)
+  %res2 = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> zeroinitializer, i64 %mask)
+  %res3 = add <64 x i8> %res, %res1
+  %res4 = add <64 x i8> %res2, %res3
+  ret <64 x i8> %res4
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16, <32 x i16>, i32)
+define <32 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_512(i16 %x0, <32 x i16> %x1, i32 %mask) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_512:
+; AVX512BW: ## BB#0:
+; AVX512BW: vpbroadcastw %edi, {{%zmm[0-9]}}
+; AVX512BW: kmovd %esi, %k1
+; AVX512BW: vpbroadcastw %edi, %zmm0 {%k1}
+; AVX512BW: vpaddw %zmm0, {{%zmm[0-9]}}, %zmm0
+; AVX512BW: vpbroadcastw %edi, %zmm1 {%k1} {z}
+; AVX512BW: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32: movw {{[0-9]+}}(%esp), %ax
+; AVX512F-32: vpbroadcastw %eax, {{%zmm[0-9]}}
+; AVX512F-32: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32: vpbroadcastw %eax, %zmm0 {%k1}
+; AVX512F-32: vpaddw %zmm0, {{%zmm[0-9]}}, %zmm0
+; AVX512F-32: vpbroadcastw %eax, %zmm1 {%k1} {z}
+; AVX512F-32: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32: retl
+  %res = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 -1)
+  %res1 = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 %mask)
+  %res2 = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> zeroinitializer, i32 %mask)
+  %res3 = add <32 x i16> %res, %res1
+  %res4 = add <32 x i16> %res2, %res3
+  ret <32 x i16> %res4
+}
+
 declare void @llvm.x86.avx512.mask.storeu.b.512(i8*, <64 x i8>, i64)
 
 define void@test_int_x86_avx512_mask_storeu_b_512(i8* %ptr1, i8* %ptr2, <64 x i8> %x1, i64 %x2) {
Index: test/CodeGen/X86/avx512bw-intrinsics.ll
===================================================================
--- test/CodeGen/X86/avx512bw-intrinsics.ll
+++ test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -1915,69 +1915,6 @@
   ret i32 %res2
 }
 
-declare <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8, <64 x i8>, i64)
-
-define <64 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_512(i8 %x0, <64 x i8> %x1, i64 %mask) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovq %rsi, %k1
-; AVX512BW-NEXT: vpbroadcastb %edi, %zmm1 {%k1} {z}
-; AVX512BW-NEXT: vpbroadcastb %edi, %zmm0 {%k1}
-; AVX512BW-NEXT: vpbroadcastb %edi, %zmm2
-; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0
-; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: movb {{[0-9]+}}(%esp), %al
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1
-; AVX512F-32-NEXT: vpbroadcastb %eax, %zmm1 {%k1} {z}
-; AVX512F-32-NEXT: vpbroadcastb %eax, %zmm0 {%k1}
-; AVX512F-32-NEXT: vpbroadcastb %eax, %zmm2
-; AVX512F-32-NEXT: vpaddb %zmm0, %zmm2, %zmm0
-; AVX512F-32-NEXT: vpaddb %zmm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: retl
-  %res = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> %x1, i64 -1)
-  %res1 = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> %x1, i64 %mask)
-  %res2 = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> zeroinitializer, i64 %mask)
-  %res3 = add <64 x i8> %res, %res1
-  %res4 = add <64 x i8> %res2, %res3
-  ret <64 x i8> %res4
-}
-
-declare <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16, <32 x i16>, i32)
-
-define <32 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_512(i16 %x0, <32 x i16> %x1, i32 %mask) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_512:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovd %esi, %k1
-; AVX512BW-NEXT: vpbroadcastw %edi, %zmm1 {%k1} {z}
-; AVX512BW-NEXT: vpbroadcastw %edi, %zmm0 {%k1}
-; AVX512BW-NEXT: vpbroadcastw %edi, %zmm2
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_512:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: movw {{[0-9]+}}(%esp), %ax
-; AVX512F-32-NEXT: vpbroadcastw %eax, %zmm1 {%k1} {z}
-; AVX512F-32-NEXT: vpbroadcastw %eax, %zmm0 {%k1}
-; AVX512F-32-NEXT: vpbroadcastw %eax, %zmm2
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: retl
-  %res = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 -1)
-  %res1 = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 %mask)
-  %res2 = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> zeroinitializer, i32 %mask)
-  %res3 = add <32 x i16> %res, %res1
-  %res4 = add <32 x i16> %res2, %res3
-  ret <32 x i16> %res4
-}
 
 
 define <32 x i16> @test_x86_avx512_psll_w_512(<32 x i16> %a0, <8 x i16> %a1) {
Index: test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
===================================================================
--- test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
+++ test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
@@ -1,6 +1,90 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding| FileCheck %s
+declare <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_128(i8 %x0, <16 x i8> %x1, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpbroadcastb %edi, %xmm1 ## encoding: [0x62,0xf2,0x7d,0x08,0x7a,0xcf]
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpbroadcastb %edi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7a,0xc7]
+; CHECK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0]
+; CHECK-NEXT: vpbroadcastb %edi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7a,0xcf]
+; CHECK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+  %res = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> %x1, i16 -1)
+  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> %x1, i16 %mask)
+  %res2 = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> zeroinitializer, i16 %mask)
+  %res3 = add <16 x i8> %res, %res1
+  %res4 = add <16 x i8> %res2, %res3
+  ret <16 x i8> %res4
+}
+
+
+declare <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_128(i16 %x0, <8 x i16> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpbroadcastw %edi, %xmm1 ## encoding: [0x62,0xf2,0x7d,0x08,0x7b,0xcf]
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpbroadcastw %edi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7b,0xc7]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
+; CHECK-NEXT: vpbroadcastw %edi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7b,0xcf]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+  %res = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> %x1, i8 -1)
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> %x1, i8 %mask)
+  %res2 = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> zeroinitializer, i8 %mask)
+  %res3 = add <8 x i16> %res, %res1
+  %res4 = add <8 x i16> %res2, %res3
+  ret <8 x i16> %res4
+}
+
+
+declare <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8, <32 x i8>, i32)
+
+define <32 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_256(i8 %x0, <32 x i8> %x1, i32 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpbroadcastb %edi, %ymm1 ## encoding: [0x62,0xf2,0x7d,0x28,0x7a,0xcf]
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpbroadcastb %edi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7a,0xc7]
+; CHECK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0]
+; CHECK-NEXT: vpbroadcastb %edi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7a,0xcf]
+; CHECK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+  %res = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> %x1, i32 -1)
+  %res1 = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> %x1, i32 %mask)
+  %res2 = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> zeroinitializer, i32 %mask)
+  %res3 = add <32 x i8> %res, %res1
+  %res4 = add <32 x i8> %res2, %res3
+  ret <32 x i8> %res4
+}
+
+
+
+declare <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_256(i16 %x0, <16 x i16> %x1, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpbroadcastw %edi, %ymm1 ## encoding: [0x62,0xf2,0x7d,0x28,0x7b,0xcf]
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpbroadcastw %edi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7b,0xc7]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
+; CHECK-NEXT: vpbroadcastw %edi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7b,0xcf]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+  %res = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> %x1, i16 -1)
+  %res1 = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> %x1, i16 %mask)
+  %res2 = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> zeroinitializer, i16 %mask)
+  %res3 = add <16 x i16> %res, %res1
+  %res4 = add <16 x i16> %res2, %res3
+  ret <16 x i16> %res4
+}
+
 declare <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8>, <32 x i8>, i32)
 
 define <32 x i8>@test_int_x86_avx512_pbroadcastb_256(<16 x i8> %x0, <32 x i8> %x1, i32 %mask) {
Index: test/CodeGen/X86/avx512bwvl-intrinsics.ll
===================================================================
--- test/CodeGen/X86/avx512bwvl-intrinsics.ll
+++ test/CodeGen/X86/avx512bwvl-intrinsics.ll
@@ -2793,82 +2793,4 @@
   ret i16 %res2
 }
 
-declare <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8, <32 x i8>, i32)
-
-define <32 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_256(i8 %x0, <32 x i8> %x1, i32 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT: vpbroadcastb %edi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7a,0xcf]
-; CHECK-NEXT: vpbroadcastb %edi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7a,0xc7]
-; CHECK-NEXT: vpbroadcastb %edi, %ymm2 ## encoding: [0x62,0xf2,0x7d,0x28,0x7a,0xd7]
-; CHECK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0]
-; CHECK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
-  %res = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> %x1, i32 -1)
-  %res1 = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> %x1, i32 %mask)
-  %res2 = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> zeroinitializer, i32 %mask)
-  %res3 = add <32 x i8> %res, %res1
-  %res4 = add <32 x i8> %res2, %res3
-  ret <32 x i8> %res4
-}
-
-declare <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8, <16 x i8>, i16)
-
-define <16 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_128(i8 %x0, <16 x i8> %x1, i16 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT: vpbroadcastb %edi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7a,0xcf]
-; CHECK-NEXT: vpbroadcastb %edi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7a,0xc7]
-; CHECK-NEXT: vpbroadcastb %edi, %xmm2 ## encoding: [0x62,0xf2,0x7d,0x08,0x7a,0xd7]
-; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0]
-; CHECK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> %x1, i16 -1)
-  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> %x1, i16 %mask)
-  %res2 = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> zeroinitializer, i16 %mask)
-  %res3 = add <16 x i8> %res, %res1
-  %res4 = add <16 x i8> %res2, %res3
-  ret <16 x i8> %res4
-}
-
-declare <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16, <16 x i16>, i16)
-
-define <16 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_256(i16 %x0, <16 x i16> %x1, i16 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT: vpbroadcastw %edi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7b,0xcf]
-; CHECK-NEXT: vpbroadcastw %edi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7b,0xc7]
-; CHECK-NEXT: vpbroadcastw %edi, %ymm2 ## encoding: [0x62,0xf2,0x7d,0x28,0x7b,0xd7]
-; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
-  %res = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> %x1, i16 -1)
-  %res1 = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> %x1, i16 %mask)
-  %res2 = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> zeroinitializer, i16 %mask)
-  %res3 = add <16 x i16> %res, %res1
-  %res4 = add <16 x i16> %res2, %res3
-  ret <16 x i16> %res4
-}
-
-declare <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16, <8 x i16>, i8)
-
-define <8 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_128(i16 %x0, <8 x i16> %x1, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT: vpbroadcastw %edi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7b,0xcf]
-; CHECK-NEXT: vpbroadcastw %edi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7b,0xc7]
-; CHECK-NEXT: vpbroadcastw %edi, %xmm2 ## encoding: [0x62,0xf2,0x7d,0x08,0x7b,0xd7]
-; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> %x1, i8 -1)
-  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> %x1, i8 %mask)
-  %res2 = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> zeroinitializer, i8 %mask)
-  %res3 = add <8 x i16> %res, %res1
-  %res4 = add <8 x i16> %res2, %res3
-  ret <8 x i16> %res4
-}
 
Index: test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
===================================================================
--- test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
+++ test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
@@ -1,6 +1,90 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl --show-mc-encoding| FileCheck %s
+declare <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_pbroadcast_d_gpr_128(i32 %x0, <4 x i32> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_d_gpr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpbroadcastd %edi, %xmm1 ## encoding: [0x62,0xf2,0x7d,0x08,0x7c,0xcf]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpbroadcastd %edi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7c,0xc7]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
+; CHECK-NEXT: vpbroadcastd %edi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7c,0xcf]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+  %res = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> %x1, i8 -1)
+  %res1 = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> %x1, i8 %mask)
+  %res2 = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> zeroinitializer, i8 %mask)
+  %res3 = add <4 x i32> %res, %res1
+  %res4 = add <4 x i32> %res2, %res3
+  ret <4 x i32> %res4
+}
+
+
+declare <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_pbroadcast_q_gpr_128(i64 %x0, <2 x i64> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_q_gpr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpbroadcastq %rdi, %xmm1 ## encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xcf]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpbroadcastq %rdi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x7c,0xc7]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
+; CHECK-NEXT: vpbroadcastq %rdi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x7c,0xcf]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+  %res = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> %x1, i8 -1)
+  %res1 = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> %x1, i8 %mask)
+  %res2 = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> zeroinitializer, i8 %mask)
+  %res3 = add <2 x i64> %res, %res1
+  %res4 = add <2 x i64> %res2, %res3
+  ret <2 x i64> %res4
+}
+
+
+declare <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_pbroadcast_d_gpr_256(i32 %x0, <8 x i32> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_d_gpr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpbroadcastd %edi, %ymm1 ## encoding: [0x62,0xf2,0x7d,0x28,0x7c,0xcf]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpbroadcastd %edi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7c,0xc7]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0]
+; CHECK-NEXT: vpbroadcastd %edi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7c,0xcf]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+  %res = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> %x1, i8 -1)
+  %res1 = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> %x1, i8 %mask)
+  %res2 = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> zeroinitializer, i8 %mask)
+  %res3 = add <8 x i32> %res, %res1
+  %res4 = add <8 x i32> %res2, %res3
+  ret <8 x i32> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_pbroadcast_q_gpr_256(i64 %x0, <4 x i64> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_q_gpr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpbroadcastq %rdi, %ymm1 ## encoding: [0x62,0xf2,0xfd,0x28,0x7c,0xcf]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpbroadcastq %rdi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x7c,0xc7]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
+; CHECK-NEXT: vpbroadcastq %rdi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x7c,0xcf]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+  %res = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> %x1, i8 -1)
+  %res1 = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> %x1, i8 %mask)
+  %res2 = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> zeroinitializer, i8 %mask)
+  %res3 = add <4 x i64> %res, %res1
+  %res4 = add <4 x i64> %res2, %res3
+  ret <4 x i64> %res4
+}
+
+
+
 declare <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32>, <8 x i32>, i8)
 
 define <8 x i32>@test_int_x86_avx512_pbroadcastd_256(<4 x i32> %x0, <8 x i32> %x1, i8 %mask, i32 * %y_ptr) {
@@ -3925,9 +4009,9 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vmovdqa {{.*}}(%rip), %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
 ; CHECK-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; CHECK-NEXT: ## fixup A - offset: 4, value: LCPI276_0-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: ## fixup A - offset: 4, value: LCPI280_0-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
-; CHECK-NEXT: ## fixup A - offset: 5, value: LCPI276_1-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: ## fixup A - offset: 5, value: LCPI280_1-4, kind: reloc_riprel_4byte
value: LCPI276_1-4, kind: reloc_riprel_4byte +; CHECK-NEXT: ## fixup A - offset: 5, value: LCPI280_1-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.psrav8.si(<8 x i32> , <8 x i32> , <8 x i32> zeroinitializer, i8 -1) ret <8 x i32> %res @@ -4508,9 +4592,9 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: vmovdqa {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression xmm0 = [2,18446744073709551607] ; CHECK-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; CHECK-NEXT: ## fixup A - offset: 4, value: LCPI304_0-4, kind: reloc_riprel_4byte +; CHECK-NEXT: ## fixup A - offset: 4, value: LCPI308_0-4, kind: reloc_riprel_4byte ; CHECK-NEXT: vpsravq {{.*}}(%rip), %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x46,0x05,A,A,A,A] -; CHECK-NEXT: ## fixup A - offset: 6, value: LCPI304_1-4, kind: reloc_riprel_4byte +; CHECK-NEXT: ## fixup A - offset: 6, value: LCPI308_1-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.psrav.q.128(<2 x i64> , <2 x i64> , <2 x i64> zeroinitializer, i8 -1) ret <2 x i64> %res Index: test/CodeGen/X86/avx512vl-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512vl-intrinsics.ll +++ test/CodeGen/X86/avx512vl-intrinsics.ll @@ -4157,85 +4157,6 @@ ret i8 %res2 } -declare <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32, <8 x i32>, i8) - -define <8 x i32>@test_int_x86_avx512_mask_pbroadcast_d_gpr_256(i32 %x0, <8 x i32> %x1, i8 %mask) { -; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_d_gpr_256: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpbroadcastd %edi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7c,0xcf] -; CHECK-NEXT: vpbroadcastd %edi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7c,0xc7] -; CHECK-NEXT: vpbroadcastd %edi, %ymm2 ## encoding: [0x62,0xf2,0x7d,0x28,0x7c,0xd7] -; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] -; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> %x1, i8 -1) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> %x1, i8 %mask) - %res2 = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> zeroinitializer, i8 %mask) - %res3 = add <8 x i32> %res, %res1 - %res4 = add <8 x i32> %res2, %res3 - ret <8 x i32> %res4 -} - -declare <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32, <4 x i32>, i8) - -define <4 x i32>@test_int_x86_avx512_mask_pbroadcast_d_gpr_128(i32 %x0, <4 x i32> %x1, i8 %mask) { -; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_d_gpr_128: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpbroadcastd %edi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7c,0xcf] -; CHECK-NEXT: vpbroadcastd %edi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7c,0xc7] -; CHECK-NEXT: vpbroadcastd %edi, %xmm2 ## encoding: [0x62,0xf2,0x7d,0x08,0x7c,0xd7] -; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> %x1, i8 -1) - 
-  %res2 = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> zeroinitializer, i8 %mask)
-  %res3 = add <4 x i32> %res, %res1
-  %res4 = add <4 x i32> %res2, %res3
-  ret <4 x i32> %res4
-}
-
-declare <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64, <4 x i64>, i8)
-
-define <4 x i64>@test_int_x86_avx512_mask_pbroadcast_q_gpr_256(i64 %x0, <4 x i64> %x1, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_q_gpr_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
-; CHECK-NEXT: vpbroadcastq %rdi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x7c,0xcf]
-; CHECK-NEXT: vpbroadcastq %rdi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x7c,0xc7]
-; CHECK-NEXT: vpbroadcastq %rdi, %ymm2 ## encoding: [0x62,0xf2,0xfd,0x28,0x7c,0xd7]
-; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0]
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
-  %res = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> %x1,i8 -1)
-  %res1 = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> %x1,i8 %mask)
-  %res2 = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> zeroinitializer,i8 %mask)
-  %res3 = add <4 x i64> %res, %res1
-  %res4 = add <4 x i64> %res2, %res3
-  ret <4 x i64> %res4
-}
-
-declare <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64, <2 x i64>, i8)
-
-define <2 x i64>@test_int_x86_avx512_mask_pbroadcast_q_gpr_128(i64 %x0, <2 x i64> %x1, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_q_gpr_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
-; CHECK-NEXT: vpbroadcastq %rdi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x7c,0xcf]
-; CHECK-NEXT: vpbroadcastq %rdi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x7c,0xc7]
-; CHECK-NEXT: vpbroadcastq %rdi, %xmm2 ## encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xd7]
-; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0]
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
-  %res = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> %x1,i8 -1)
-  %res1 = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> %x1,i8 %mask)
-  %res2 = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> zeroinitializer,i8 %mask)
-  %res3 = add <2 x i64> %res, %res1
-  %res4 = add <2 x i64> %res2, %res3
-  ret <2 x i64> %res4
-}
 
 
 define <2 x i64> @test_x86_avx512_psra_q_128(<2 x i64> %a0, <2 x i64> %a1) {
Index: test/CodeGen/X86/vselect-packss.ll
===================================================================
--- test/CodeGen/X86/vselect-packss.ll
+++ test/CodeGen/X86/vselect-packss.ll
@@ -4,7 +4,394 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512VL
+
+;
+; General cases - packing of vector comparison to legal vector result types
+;
+
+define <16 x i8> @vselect_packss_v16i16(<16 x i16> %a0, <16 x i16> %a1, <16 x i8> %a2, <16 x i8> %a3) {
+; SSE-LABEL: vselect_packss_v16i16:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpeqw %xmm3, %xmm1
+; SSE-NEXT: pcmpeqw %xmm2, %xmm0
+; SSE-NEXT: packsswb %xmm1, %xmm0
+; SSE-NEXT: pand %xmm0, %xmm4
+; SSE-NEXT: pandn %xmm5, %xmm0
+; SSE-NEXT: por %xmm4, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: vselect_packss_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpcmpeqw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm1
+; AVX1-NEXT: vpandn %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vselect_packss_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm1
+; AVX2-NEXT: vpandn %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: vselect_packss_v16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm1
+; AVX512-NEXT: vpandn %xmm3, %xmm0, %xmm0
+; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+  %1 = icmp eq <16 x i16> %a0, %a1
+  %2 = sext <16 x i1> %1 to <16 x i8>
+  %3 = and <16 x i8> %2, %a2
+  %4 = xor <16 x i8> %2, 
+  %5 = and <16 x i8> %4, %a3
+  %6 = or <16 x i8> %3, %5
+  ret <16 x i8> %6
+}
+
+define <16 x i8> @vselect_packss_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i8> %a2, <16 x i8> %a3) {
+; SSE-LABEL: vselect_packss_v16i32:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpeqd %xmm7, %xmm3
+; SSE-NEXT: pcmpeqd %xmm6, %xmm2
+; SSE-NEXT: packsswb %xmm3, %xmm2
+; SSE-NEXT: pcmpeqd %xmm5, %xmm1
+; SSE-NEXT: pcmpeqd %xmm4, %xmm0
+; SSE-NEXT: packsswb %xmm1, %xmm0
+; SSE-NEXT: packsswb %xmm2, %xmm0
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT: pand %xmm0, %xmm1
+; SSE-NEXT: pandn {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: vselect_packss_v16i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
+; AVX1-NEXT: vpcmpeqd %xmm6, %xmm7, %xmm6
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpacksswb %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm1
+; AVX1-NEXT: vpandn %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vselect_packss_v16i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpeqd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm4, %xmm0, %xmm1
+; AVX2-NEXT: vpandn %xmm5, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: vselect_packss_v16i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm1
+; AVX512-NEXT: vpandn %xmm3, %xmm0, %xmm0
+; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+  %1 = icmp eq <16 x i32> %a0, %a1
+  %2 = sext <16 x i1> %1 to <16 x i8>
+  %3 = and <16 x i8> %2, %a2
+  %4 = xor <16 x i8> %2, 
+  %5 = and <16 x i8> %4, %a3
+  %6 = or <16 x i8> %3, %5
+  ret <16 x i8> %6
+}
+
+define <16 x i8> @vselect_packss_v16i64(<16 x i64> %a0, <16 x i64> %a1, <16 x i8> %a2, <16 x i8> %a3) {
+; SSE2-LABEL: vselect_packss_v16i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,0,3,2]
+; SSE2-NEXT: pand %xmm7, %xmm8
+; SSE2-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,0,3,2]
+; SSE2-NEXT: pand %xmm6, %xmm7
+; SSE2-NEXT: packsswb %xmm8, %xmm7
+; SSE2-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,0,3,2]
+; SSE2-NEXT: pand %xmm5, %xmm6
+; SSE2-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,0,3,2]
+; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: packsswb %xmm6, %xmm5
+; SSE2-NEXT: packsswb %xmm7, %xmm5
+; SSE2-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,0,3,2]
+; SSE2-NEXT: pand %xmm3, %xmm4
+; SSE2-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: packsswb %xmm4, %xmm3
+; SSE2-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2]
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: packsswb %xmm2, %xmm1
+; SSE2-NEXT: packsswb %xmm3, %xmm1
+; SSE2-NEXT: packsswb %xmm5, %xmm1
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm1
+; SSE2-NEXT: por %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: vselect_packss_v16i64:
+; SSE42: # BB#0:
+; SSE42-NEXT: pcmpeqq {{[0-9]+}}(%rsp), %xmm7
+; SSE42-NEXT: pcmpeqq {{[0-9]+}}(%rsp), %xmm6
+; SSE42-NEXT: packsswb %xmm7, %xmm6
+; SSE42-NEXT: pcmpeqq {{[0-9]+}}(%rsp), %xmm5
+; SSE42-NEXT: pcmpeqq {{[0-9]+}}(%rsp), %xmm4
+; SSE42-NEXT: packsswb %xmm5, %xmm4
+; SSE42-NEXT: packsswb %xmm6, %xmm4
+; SSE42-NEXT: pcmpeqq {{[0-9]+}}(%rsp), %xmm3
+; SSE42-NEXT: pcmpeqq {{[0-9]+}}(%rsp), %xmm2
+; SSE42-NEXT: packsswb %xmm3, %xmm2
+; SSE42-NEXT: pcmpeqq {{[0-9]+}}(%rsp), %xmm1
+; SSE42-NEXT: pcmpeqq {{[0-9]+}}(%rsp), %xmm0
+; SSE42-NEXT: packsswb %xmm1, %xmm0
+; SSE42-NEXT: packsswb %xmm2, %xmm0
+; SSE42-NEXT: packsswb %xmm4, %xmm0
+; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1
+; SSE42-NEXT: pand %xmm0, %xmm1
+; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm0
+; SSE42-NEXT: por %xmm1, %xmm0
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: vselect_packss_v16i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm9
+; AVX1-NEXT: vpcmpeqq %xmm8, %xmm9, %xmm8
+; AVX1-NEXT: vpcmpeqq %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpacksswb %xmm8, %xmm3, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vpcmpeqq %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqq %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpacksswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpacksswb %xmm8, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT: vpcmpeqq %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vpcmpeqq %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpacksswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpcmpeqq %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
+; AVX1-NEXT: vpandn {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vselect_packss_v16i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpeqq %ymm7, %ymm3, %ymm3
+; AVX2-NEXT: vpcmpeqq %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vpacksswb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
+; AVX2-NEXT: vpcmpeqq %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqq %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vpacksswb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
+; AVX2-NEXT: vpandn {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: vselect_packss_v16i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vextracti32x4 $3, %zmm2, %xmm6
+; AVX512-NEXT: vpextrq $1, %xmm6, %rcx
+; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm7
+; AVX512-NEXT: vpextrq $1, %xmm7, %rdx
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: cmpq %rcx, %rdx
+; AVX512-NEXT: movq $-1, %rcx
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoveq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm8
+; AVX512-NEXT: vmovq %xmm6, %rdx
+; AVX512-NEXT: vmovq %xmm7, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoveq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm6
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm6[0],xmm8[0]
+; AVX512-NEXT: vextracti32x4 $2, %zmm2, %xmm7
+; AVX512-NEXT: vpextrq $1, %xmm7, %rdx
+; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm6
+; AVX512-NEXT: vpextrq $1, %xmm6, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoveq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm9
+; AVX512-NEXT: vmovq %xmm7, %rdx
+; AVX512-NEXT: vmovq %xmm6, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoveq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm6
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm9[0]
+; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm6, %ymm8
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm7
+; AVX512-NEXT: vpextrq $1, %xmm7, %rdx
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm6
+; AVX512-NEXT: vpextrq $1, %xmm6, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoveq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm9
+; AVX512-NEXT: vmovq %xmm7, %rdx
+; AVX512-NEXT: vmovq %xmm6, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoveq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm6
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm9[0]
+; AVX512-NEXT: vpextrq $1, %xmm2, %rdx
+; AVX512-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoveq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm7
+; AVX512-NEXT: vmovq %xmm2, %rdx
+; AVX512-NEXT: vmovq %xmm0, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoveq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0]
+; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm8
+; AVX512-NEXT: vextracti32x4 $3, %zmm3, %xmm2
+; AVX512-NEXT: vpextrq $1, %xmm2, %rdx
+; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm6
+; AVX512-NEXT: vpextrq $1, %xmm6, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoveq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm7
+; AVX512-NEXT: vmovq %xmm2, %rdx
+; AVX512-NEXT: vmovq %xmm6, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoveq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm2
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm7[0]
+; AVX512-NEXT: vextracti32x4 $2, %zmm3, %xmm6
+; AVX512-NEXT: vpextrq $1, %xmm6, %rdx
+; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm7
+; AVX512-NEXT: vpextrq $1, %xmm7, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoveq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm0
+; AVX512-NEXT: vmovq %xmm6, %rdx
+; AVX512-NEXT: vmovq %xmm7, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoveq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm6
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0]
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm0
+; AVX512-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm6
+; AVX512-NEXT: vpextrq $1, %xmm6, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoveq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm7
+; AVX512-NEXT: vmovq %xmm0, %rdx
+; AVX512-NEXT: vmovq %xmm6, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoveq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0]
+; AVX512-NEXT: vpextrq $1, %xmm3, %rdx
+; AVX512-NEXT: vpextrq $1, %xmm1, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoveq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm6
+; AVX512-NEXT: vmovq %xmm3, %rdx
+; AVX512-NEXT: vmovq %xmm1, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: cmoveq %rcx, %rax
+; AVX512-NEXT: vmovq %rax, %xmm1
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm6[0]
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vpand %xmm4, %xmm0, %xmm1
+; AVX512-NEXT: vpandn %xmm5, %xmm0, %xmm0
+; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+  %1 = icmp eq <16 x i64> %a0, %a1
+  %2 = sext <16 x i1> %1 to <16 x i8>
+  %3 = and <16 x i8> %2, %a2
+  %4 = xor <16 x i8> %2, 
+  %5 = and <16 x i8> %4, %a3
+  %6 = or <16 x i8> %3, %5
+  ret <16 x i8> %6
+}
+
+;
+; PACKSS case
+;
 
 define <16 x i8> @vselect_packss(<16 x i16> %a0, <16 x i16> %a1, <16 x i8> %a2, <16 x i8> %a3) {
 ; SSE-LABEL: vselect_packss:
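
For reference, a rough sketch of the IR the new AutoUpgrade path produces for one of the removed intrinsics. The value names and the choice of the 512-bit dword variant are illustrative assumptions, not part of the patch; CreateVectorSplat expands to the usual insertelement/shufflevector idiom and EmitX86Select turns the integer mask operand into a vector select:

; Old call being upgraded:
;   %res = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %passthru, i16 %mask)
; becomes, modulo value names:
;   %ins = insertelement <16 x i32> undef, i32 %x0, i32 0
;   %splat = shufflevector <16 x i32> %ins, <16 x i32> undef, <16 x i32> zeroinitializer
;   %mask.vec = bitcast i16 %mask to <16 x i1>
;   %res = select <16 x i1> %mask.vec, <16 x i32> %splat, <16 x i32> %passthru
; For a constant all-ones mask (i16 -1) EmitX86Select returns the splat
; directly, which is why the unmasked calls in the tests above still lower
; to a bare vpbroadcastb/vpbroadcastw/vpbroadcastd/vpbroadcastq from a GPR.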