Index: llvm/trunk/include/llvm/IR/IntrinsicsX86.td =================================================================== --- llvm/trunk/include/llvm/IR/IntrinsicsX86.td +++ llvm/trunk/include/llvm/IR/IntrinsicsX86.td @@ -4440,66 +4440,6 @@ Intrinsic<[llvm_v16i32_ty], [llvm_v4i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>; - def int_x86_avx512_mask_broadcastf32x4_256 : - GCCBuiltin<"__builtin_ia32_broadcastf32x4_256_mask">, - Intrinsic<[llvm_v8f32_ty], - [llvm_v4f32_ty, llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>; - - def int_x86_avx512_mask_broadcastf32x4_512 : - GCCBuiltin<"__builtin_ia32_broadcastf32x4_512">, - Intrinsic<[llvm_v16f32_ty], - [llvm_v4f32_ty, llvm_v16f32_ty, llvm_i16_ty], [IntrNoMem]>; - - def int_x86_avx512_mask_broadcastf32x8_512 : - GCCBuiltin<"__builtin_ia32_broadcastf32x8_512_mask">, - Intrinsic<[llvm_v16f32_ty], - [llvm_v8f32_ty, llvm_v16f32_ty, llvm_i16_ty], [IntrNoMem]>; - - def int_x86_avx512_mask_broadcastf64x2_256 : - GCCBuiltin<"__builtin_ia32_broadcastf64x2_256_mask">, - Intrinsic<[llvm_v4f64_ty], - [llvm_v2f64_ty, llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>; - - def int_x86_avx512_mask_broadcastf64x2_512 : - GCCBuiltin<"__builtin_ia32_broadcastf64x2_512_mask">, - Intrinsic<[llvm_v8f64_ty], - [llvm_v2f64_ty, llvm_v8f64_ty, llvm_i8_ty], [IntrNoMem]>; - - def int_x86_avx512_mask_broadcastf64x4_512 : - GCCBuiltin<"__builtin_ia32_broadcastf64x4_512">, - Intrinsic<[llvm_v8f64_ty], - [llvm_v4f64_ty, llvm_v8f64_ty, llvm_i8_ty], [IntrNoMem]>; - - def int_x86_avx512_mask_broadcasti32x4_256 : - GCCBuiltin<"__builtin_ia32_broadcasti32x4_256_mask">, - Intrinsic<[llvm_v8i32_ty], - [llvm_v4i32_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>; - - def int_x86_avx512_mask_broadcasti32x4_512 : - GCCBuiltin<"__builtin_ia32_broadcasti32x4_512">, - Intrinsic<[llvm_v16i32_ty], - [llvm_v4i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>; - - def int_x86_avx512_mask_broadcasti32x8_512 : - GCCBuiltin<"__builtin_ia32_broadcasti32x8_512_mask">, - Intrinsic<[llvm_v16i32_ty], - [llvm_v8i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>; - - def int_x86_avx512_mask_broadcasti64x2_256 : - GCCBuiltin<"__builtin_ia32_broadcasti64x2_256_mask">, - Intrinsic<[llvm_v4i64_ty], - [llvm_v2i64_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>; - - def int_x86_avx512_mask_broadcasti64x2_512 : - GCCBuiltin<"__builtin_ia32_broadcasti64x2_512_mask">, - Intrinsic<[llvm_v8i64_ty], - [llvm_v2i64_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>; - - def int_x86_avx512_mask_broadcasti64x4_512 : - GCCBuiltin<"__builtin_ia32_broadcasti64x4_512">, - Intrinsic<[llvm_v8i64_ty], - [llvm_v4i64_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_broadcastmw_512 : GCCBuiltin<"__builtin_ia32_broadcastmw512">, Intrinsic<[llvm_v16i32_ty], [llvm_i16_ty], [IntrNoMem]>; Index: llvm/trunk/lib/IR/AutoUpgrade.cpp =================================================================== --- llvm/trunk/lib/IR/AutoUpgrade.cpp +++ llvm/trunk/lib/IR/AutoUpgrade.cpp @@ -239,6 +239,14 @@ Name.startswith("avx2.pblendd.") || // Added in 3.7 Name.startswith("avx.vbroadcastf128") || // Added in 4.0 Name == "avx2.vbroadcasti128" || // Added in 3.7 + Name.startswith("avx512.mask.broadcastf32x4.") || // Added in 6.0 + Name.startswith("avx512.mask.broadcastf64x2.") || // Added in 6.0 + Name.startswith("avx512.mask.broadcasti32x4.") || // Added in 6.0 + Name.startswith("avx512.mask.broadcasti64x2.") || // Added in 6.0 + Name == "avx512.mask.broadcastf32x8.512" || // Added in 6.0 + Name == "avx512.mask.broadcasti32x8.512" || // Added in 6.0 + Name == "avx512.mask.broadcastf64x4.512" || // Added in 6.0 + Name == "avx512.mask.broadcasti64x4.512" || // Added in 6.0 Name == "xop.vpcmov" || // Added in 3.8 Name == "xop.vpcmov.256" || // Added in 5.0 Name.startswith("avx512.mask.move.s") || // Added in 4.0 @@ -1221,6 +1229,21 @@ else Rep = Builder.CreateShuffleVector(Load, UndefValue::get(Load->getType()), { 0, 1, 2, 3, 0, 1, 2, 3 }); + } else if (IsX86 && (Name.startswith("avx512.mask.broadcastf") || + Name.startswith("avx512.mask.broadcasti"))) { + unsigned NumSrcElts = + CI->getArgOperand(0)->getType()->getVectorNumElements(); + unsigned NumDstElts = CI->getType()->getVectorNumElements(); + + SmallVector ShuffleMask(NumDstElts); + for (unsigned i = 0; i != NumDstElts; ++i) + ShuffleMask[i] = i % NumSrcElts; + + Rep = Builder.CreateShuffleVector(CI->getArgOperand(0), + CI->getArgOperand(0), + ShuffleMask); + Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep, + CI->getArgOperand(1)); } else if (IsX86 && (Name.startswith("avx2.pbroadcast") || Name.startswith("avx2.vbroadcast") || Name.startswith("avx512.pbroadcast") || Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp @@ -19908,23 +19908,6 @@ DAG.getIntPtrConstant(0, dl)); return DAG.getBitcast(Op.getValueType(), Res); } - case BRCST_SUBVEC_TO_VEC: { - SDValue Src = Op.getOperand(1); - SDValue Passthru = Op.getOperand(2); - SDValue Mask = Op.getOperand(3); - EVT resVT = Passthru.getValueType(); - SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT, - DAG.getUNDEF(resVT), Src, - DAG.getIntPtrConstant(0, dl)); - SDValue immVal; - if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector()) - immVal = DAG.getConstant(0x44, dl, MVT::i8); - else - immVal = DAG.getConstant(0, dl, MVT::i8); - return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, - subVec, subVec, immVal), - Mask, Passthru, Subtarget, DAG); - } case BRCST32x2_TO_VEC: { SDValue Src = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); Index: llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h =================================================================== --- llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h +++ llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h @@ -32,7 +32,7 @@ FMA_OP_SCALAR_MASK, FMA_OP_SCALAR_MASKZ, FMA_OP_SCALAR_MASK3, VPERM_2OP_MASK, VPERM_3OP_MASK, VPERM_3OP_MASKZ, INTR_TYPE_SCALAR_MASK, INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK_RM, - COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, BRCST_SUBVEC_TO_VEC, BRCST32x2_TO_VEC, + COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, BRCST32x2_TO_VEC, TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32, EXPAND_FROM_MEM, TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS, @@ -482,36 +482,12 @@ X86ISD::VBROADCAST, 0), X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_512, BRCST32x2_TO_VEC, X86ISD::VBROADCAST, 0), - X86_INTRINSIC_DATA(avx512_mask_broadcastf32x4_256, BRCST_SUBVEC_TO_VEC, - X86ISD::SHUF128, 0), - X86_INTRINSIC_DATA(avx512_mask_broadcastf32x4_512, BRCST_SUBVEC_TO_VEC, - X86ISD::SHUF128, 0), - X86_INTRINSIC_DATA(avx512_mask_broadcastf32x8_512, BRCST_SUBVEC_TO_VEC, - X86ISD::SHUF128, 0), - X86_INTRINSIC_DATA(avx512_mask_broadcastf64x2_256, BRCST_SUBVEC_TO_VEC, - X86ISD::SHUF128, 0), - X86_INTRINSIC_DATA(avx512_mask_broadcastf64x2_512, BRCST_SUBVEC_TO_VEC, - X86ISD::SHUF128, 0), - X86_INTRINSIC_DATA(avx512_mask_broadcastf64x4_512, BRCST_SUBVEC_TO_VEC, - X86ISD::SHUF128, 0), X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_128, BRCST32x2_TO_VEC, X86ISD::VBROADCAST, 0), X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_256, BRCST32x2_TO_VEC, X86ISD::VBROADCAST, 0), X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_512, BRCST32x2_TO_VEC, X86ISD::VBROADCAST, 0), - X86_INTRINSIC_DATA(avx512_mask_broadcasti32x4_256, BRCST_SUBVEC_TO_VEC, - X86ISD::SHUF128, 0), - X86_INTRINSIC_DATA(avx512_mask_broadcasti32x4_512, BRCST_SUBVEC_TO_VEC, - X86ISD::SHUF128, 0), - X86_INTRINSIC_DATA(avx512_mask_broadcasti32x8_512, BRCST_SUBVEC_TO_VEC, - X86ISD::SHUF128, 0), - X86_INTRINSIC_DATA(avx512_mask_broadcasti64x2_256, BRCST_SUBVEC_TO_VEC, - X86ISD::SHUF128, 0), - X86_INTRINSIC_DATA(avx512_mask_broadcasti64x2_512, BRCST_SUBVEC_TO_VEC, - X86ISD::SHUF128, 0), - X86_INTRINSIC_DATA(avx512_mask_broadcasti64x4_512, BRCST_SUBVEC_TO_VEC, - X86ISD::SHUF128, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_pd_256, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPM, Index: llvm/trunk/test/CodeGen/X86/avx512-intrinsics-upgrade.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ llvm/trunk/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -3444,3 +3444,140 @@ } declare i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone + +declare <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float>, <16 x float>, i16) + +define <16 x float>@test_int_x86_avx512_mask_broadcastf32x4_512(<4 x float> %x0, <16 x float> %x2, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x4_512: +; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %XMM0 %XMM0 %YMM0 +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm0, %zmm2 {%k1} {z} +; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: retq + + %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 -1) + %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 %mask) + %res3 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> zeroinitializer, i16 %mask) + %res4 = fadd <16 x float> %res1, %res2 + %res5 = fadd <16 x float> %res3, %res4 + ret <16 x float> %res5 +} + +define <16 x float>@test_int_x86_avx512_mask_broadcastf32x4_512_load(<4 x float>* %x0ptr, <16 x float> %x2, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x4_512_load: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; CHECK-NEXT: retq + %x0 = load <4 x float>, <4 x float>* %x0ptr + %res = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 %mask) + ret <16 x float> %res +} + +declare <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double>, <8 x double>, i8) + +define <8 x double>@test_int_x86_avx512_mask_broadcastf64x4_512(<4 x double> %x0, <8 x double> %x2, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x4_512: +; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm2 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vaddpd %zmm1, %zmm2, %zmm1 +; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + + %res1 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 -1) + %res2 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 %mask) + %res3 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> zeroinitializer, i8 %mask) + %res4 = fadd <8 x double> %res1, %res2 + %res5 = fadd <8 x double> %res3, %res4 + ret <8 x double> %res5 +} + +define <8 x double>@test_int_x86_avx512_mask_broadcastf64x4_512_load(<4 x double>* %x0ptr, <8 x double> %x2, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x4_512_load: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3] +; CHECK-NEXT: retq + + %x0 = load <4 x double>, <4 x double>* %x0ptr + %res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 %mask) + ret <8 x double> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32>, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x4_512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x4_512: +; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %XMM0 %XMM0 %YMM0 +; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} {z} +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: retq + + %res1 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 -1) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask) + %res3 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask) + %res4 = add <16 x i32> %res1, %res2 + %res5 = add <16 x i32> %res3, %res4 + ret <16 x i32> %res5 +} + +define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x4_512_load(<4 x i32>* %x0ptr, <16 x i32> %x2, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x4_512_load: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; CHECK-NEXT: retq + + %x0 = load <4 x i32>, <4 x i32>* %x0ptr + %res = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask) + ret <16 x i32> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64>, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x4_512: +; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vpaddq %zmm1, %zmm2, %zmm1 +; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + + %res1 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 -1) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask) + %res3 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> zeroinitializer, i8 %mask) + %res4 = add <8 x i64> %res1, %res2 + %res5 = add <8 x i64> %res3, %res4 + ret <8 x i64> %res5 +} + +define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512_load(<4 x i64>* %x0ptr, <8 x i64> %x2, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x4_512_load: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3] +; CHECK-NEXT: retq + + %x0 = load <4 x i64>, <4 x i64>* %x0ptr + %res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask) + ret <8 x i64> %res +} Index: llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll +++ llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll @@ -3822,145 +3822,6 @@ declare i32 @llvm.x86.avx512.vcomi.ss(<4 x float>, <4 x float>, i32, i32) -declare <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float>, <16 x float>, i16) - -define <16 x float>@test_int_x86_avx512_mask_broadcastf32x4_512(<4 x float> %x0, <16 x float> %x2, i16 %mask) { -; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x4_512: -; CHECK: ## BB#0: -; CHECK-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0 -; CHECK-NEXT: retq - - %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 -1) - %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 %mask) - %res3 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> zeroinitializer, i16 %mask) - %res4 = fadd <16 x float> %res1, %res2 - %res5 = fadd <16 x float> %res3, %res4 - ret <16 x float> %res5 -} - -define <16 x float>@test_int_x86_avx512_mask_broadcastf32x4_512_load(<4 x float>* %x0ptr, <16 x float> %x2, i16 %mask) { -; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x4_512_load: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %esi, %k1 -; CHECK-NEXT: vmovaps (%rdi), %xmm1 -; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; CHECK-NEXT: retq - %x0 = load <4 x float>, <4 x float>* %x0ptr - %res = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 %mask) - ret <16 x float> %res -} - -declare <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double>, <8 x double>, i8) - -define <8 x double>@test_int_x86_avx512_mask_broadcastf64x4_512(<4 x double> %x0, <8 x double> %x2, i8 %mask) { -; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x4_512: -; CHECK: ## BB#0: -; CHECK-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3] -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3] -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3] -; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0 -; CHECK-NEXT: retq - - %res1 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 -1) - %res2 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 %mask) - %res3 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> zeroinitializer, i8 %mask) - %res4 = fadd <8 x double> %res1, %res2 - %res5 = fadd <8 x double> %res3, %res4 - ret <8 x double> %res5 -} - -define <8 x double>@test_int_x86_avx512_mask_broadcastf64x4_512_load(<4 x double>* %x0ptr, <8 x double> %x2, i8 %mask) { -; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x4_512_load: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %esi, %k1 -; CHECK-NEXT: vmovapd (%rdi), %ymm1 -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,0,1,2,3] -; CHECK-NEXT: retq - - %x0 = load <4 x double>, <4 x double>* %x0ptr - %res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 %mask) - ret <8 x double> %res -} - -declare <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32>, <16 x i32>, i16) - -define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x4_512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask) { -; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x4_512: -; CHECK: ## BB#0: -; CHECK-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 -; CHECK-NEXT: retq - - %res1 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 -1) - %res2 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask) - %res3 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask) - %res4 = add <16 x i32> %res1, %res2 - %res5 = add <16 x i32> %res3, %res4 - ret <16 x i32> %res5 -} - -define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x4_512_load(<4 x i32>* %x0ptr, <16 x i32> %x2, i16 %mask) { -; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x4_512_load: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %esi, %k1 -; CHECK-NEXT: vmovdqa (%rdi), %xmm1 -; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; CHECK-NEXT: retq - - %x0 = load <4 x i32>, <4 x i32>* %x0ptr - %res = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask) - ret <16 x i32> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64>, <8 x i64>, i8) - -define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask) { -; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x4_512: -; CHECK: ## BB#0: -; CHECK-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3] -; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3] -; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3] -; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0 -; CHECK-NEXT: retq - - %res1 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 -1) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask) - %res3 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> zeroinitializer, i8 %mask) - %res4 = add <8 x i64> %res1, %res2 - %res5 = add <8 x i64> %res3, %res4 - ret <8 x i64> %res5 -} - -define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512_load(<4 x i64>* %x0ptr, <8 x i64> %x2, i8 %mask) { -; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x4_512_load: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %esi, %k1 -; CHECK-NEXT: vmovdqa (%rdi), %ymm1 -; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,0,1,2,3] -; CHECK-NEXT: retq - - %x0 = load <4 x i64>, <4 x i64>* %x0ptr - %res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask) - ret <8 x i64> %res -} - declare <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) define <16 x i32>@test_int_x86_avx512_mask_prorv_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) { Index: llvm/trunk/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll +++ llvm/trunk/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll @@ -155,3 +155,141 @@ %res = call <8 x i64> @llvm.x86.avx512.cvtmask2q.512(i8 %x0) ret <8 x i64> %res } + +declare <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float>, <16 x float>, i16) + +define <16 x float>@test_int_x86_avx512_mask_broadcastf32x8_512(<8 x float> %x0, <16 x float> %x2, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x8_512: +; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; CHECK-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm2 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1 +; CHECK-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + + %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float> %x0, <16 x float> %x2, i16 -1) + %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float> %x0, <16 x float> %x2, i16 %mask) + %res3 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float> %x0, <16 x float> zeroinitializer, i16 %mask) + %res4 = fadd <16 x float> %res1, %res2 + %res5 = fadd <16 x float> %res3, %res4 + ret <16 x float> %res5 +} + +define <16 x float>@test_int_x86_avx512_mask_broadcastf32x8_512_load(<8 x float>* %x0ptr, <16 x float> %x2, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x8_512_load: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vbroadcastf32x8 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; CHECK-NEXT: retq + + %x0 = load <8 x float>, <8 x float>* %x0ptr + %res = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float> %x0, <16 x float> %x2, i16 %mask) + ret <16 x float> %res +} + +declare <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double>, <8 x double>, i8) + +define <8 x double>@test_int_x86_avx512_mask_broadcastf64x2_512(<2 x double> %x0, <8 x double> %x2, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x2_512: +; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %XMM0 %XMM0 %YMM0 +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm2 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vaddpd %zmm1, %zmm2, %zmm1 +; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + + %res1 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double> %x0, <8 x double> %x2, i8 -1) + %res2 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double> %x0, <8 x double> %x2, i8 %mask) + %res3 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double> %x0, <8 x double> zeroinitializer, i8 %mask) + %res4 = fadd <8 x double> %res1, %res2 + %res5 = fadd <8 x double> %res3, %res4 + ret <8 x double> %res5 +} + +define <8 x double>@test_int_x86_avx512_mask_broadcastf64x2_512_load(<2 x double>* %x0ptr, <8 x double> %x2, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x2_512_load: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vbroadcastf64x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1] +; CHECK-NEXT: retq + + %x0 = load <2 x double>, <2 x double>* %x0ptr + %res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double> %x0, <8 x double> %x2, i8 %mask) + ret <8 x double> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32>, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x8_512(<8 x i32> %x0, <16 x i32> %x2, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x8_512: +; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; CHECK-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm2 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm1 +; CHECK-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + + %res1 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32> %x0, <16 x i32> %x2, i16 -1) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32> %x0, <16 x i32> %x2, i16 %mask) + %res3 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask) + %res4 = add <16 x i32> %res1, %res2 + %res5 = add <16 x i32> %res3, %res4 + ret <16 x i32> %res5 +} + +define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x8_512_load(<8 x i32>* %x0ptr, <16 x i32> %x2, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x8_512_load: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vbroadcasti32x8 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; CHECK-NEXT: retq + + %x0 = load <8 x i32>, <8 x i32>* %x0ptr + %res = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32> %x0, <16 x i32> %x2, i16 %mask) + ret <16 x i32> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64>, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x2_512(<2 x i64> %x0, <8 x i64> %x2, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x2_512: +; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %XMM0 %XMM0 %YMM0 +; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vpaddq %zmm1, %zmm2, %zmm1 +; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + + %res1 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64> %x0, <8 x i64> %x2, i8 -1) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64> %x0, <8 x i64> %x2, i8 %mask) + %res3 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64> %x0, <8 x i64> zeroinitializer, i8 %mask) + %res4 = add <8 x i64> %res1, %res2 + %res5 = add <8 x i64> %res3, %res4 + ret <8 x i64> %res5 +} + +define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x2_512_load(<2 x i64>* %x0ptr, <8 x i64> %x2, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x2_512_load: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1] +; CHECK-NEXT: retq + + %x0 = load <2 x i64>, <2 x i64>* %x0ptr + %res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64> %x0, <8 x i64> %x2, i8 %mask) + ret <8 x i64> %res +} Index: llvm/trunk/test/CodeGen/X86/avx512dq-intrinsics.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512dq-intrinsics.ll +++ llvm/trunk/test/CodeGen/X86/avx512dq-intrinsics.ll @@ -461,143 +461,3 @@ %res = call i8 @llvm.x86.avx512.cvtq2mask.512(<8 x i64> %x0) ret i8 %res } - -declare <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float>, <16 x float>, i16) - -define <16 x float>@test_int_x86_avx512_mask_broadcastf32x8_512(<8 x float> %x0, <16 x float> %x2, i16 %mask) { -; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x8_512: -; CHECK: ## BB#0: -; CHECK-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0 -; CHECK-NEXT: retq - - %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float> %x0, <16 x float> %x2, i16 -1) - %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float> %x0, <16 x float> %x2, i16 %mask) - %res3 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float> %x0, <16 x float> zeroinitializer, i16 %mask) - %res4 = fadd <16 x float> %res1, %res2 - %res5 = fadd <16 x float> %res3, %res4 - ret <16 x float> %res5 -} - -define <16 x float>@test_int_x86_avx512_mask_broadcastf32x8_512_load(<8 x float>* %x0ptr, <16 x float> %x2, i16 %mask) { -; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x8_512_load: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %esi, %k1 -; CHECK-NEXT: vmovaps (%rdi), %ymm1 -; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; CHECK-NEXT: retq - - %x0 = load <8 x float>, <8 x float>* %x0ptr - %res = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float> %x0, <16 x float> %x2, i16 %mask) - ret <16 x float> %res -} - -declare <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double>, <8 x double>, i8) - -define <8 x double>@test_int_x86_avx512_mask_broadcastf64x2_512(<2 x double> %x0, <8 x double> %x2, i8 %mask) { -; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x2_512: -; CHECK: ## BB#0: -; CHECK-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,0,1,0,1,0,1] -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,0,1,0,1,0,1] -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] -; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0 -; CHECK-NEXT: retq - - %res1 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double> %x0, <8 x double> %x2, i8 -1) - %res2 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double> %x0, <8 x double> %x2, i8 %mask) - %res3 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double> %x0, <8 x double> zeroinitializer, i8 %mask) - %res4 = fadd <8 x double> %res1, %res2 - %res5 = fadd <8 x double> %res3, %res4 - ret <8 x double> %res5 -} - -define <8 x double>@test_int_x86_avx512_mask_broadcastf64x2_512_load(<2 x double>* %x0ptr, <8 x double> %x2, i8 %mask) { -; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x2_512_load: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %esi, %k1 -; CHECK-NEXT: vmovapd (%rdi), %xmm1 -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,0,1,0,1,0,1] -; CHECK-NEXT: retq - - %x0 = load <2 x double>, <2 x double>* %x0ptr - %res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double> %x0, <8 x double> %x2, i8 %mask) - ret <8 x double> %res -} - -declare <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32>, <16 x i32>, i16) - -define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x8_512(<8 x i32> %x0, <16 x i32> %x2, i16 %mask) { -; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x8_512: -; CHECK: ## BB#0: -; CHECK-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 -; CHECK-NEXT: retq - - %res1 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32> %x0, <16 x i32> %x2, i16 -1) - %res2 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32> %x0, <16 x i32> %x2, i16 %mask) - %res3 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask) - %res4 = add <16 x i32> %res1, %res2 - %res5 = add <16 x i32> %res3, %res4 - ret <16 x i32> %res5 -} - -define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x8_512_load(<8 x i32>* %x0ptr, <16 x i32> %x2, i16 %mask) { -; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x8_512_load: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %esi, %k1 -; CHECK-NEXT: vmovdqa (%rdi), %ymm1 -; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; CHECK-NEXT: retq - - %x0 = load <8 x i32>, <8 x i32>* %x0ptr - %res = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32> %x0, <16 x i32> %x2, i16 %mask) - ret <16 x i32> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64>, <8 x i64>, i8) - -define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x2_512(<2 x i64> %x0, <8 x i64> %x2, i8 %mask) { -; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x2_512: -; CHECK: ## BB#0: -; CHECK-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,0,1,0,1,0,1] -; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,0,1,0,1,0,1] -; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] -; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0 -; CHECK-NEXT: retq - - %res1 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64> %x0, <8 x i64> %x2, i8 -1) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64> %x0, <8 x i64> %x2, i8 %mask) - %res3 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64> %x0, <8 x i64> zeroinitializer, i8 %mask) - %res4 = add <8 x i64> %res1, %res2 - %res5 = add <8 x i64> %res3, %res4 - ret <8 x i64> %res5 -} - -define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x2_512_load(<2 x i64>* %x0ptr, <8 x i64> %x2, i8 %mask) { -; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x2_512_load: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %esi, %k1 -; CHECK-NEXT: vmovdqa (%rdi), %xmm1 -; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,0,1,0,1,0,1] -; CHECK-NEXT: retq - - %x0 = load <2 x i64>, <2 x i64>* %x0ptr - %res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64> %x0, <8 x i64> %x2, i8 %mask) - ret <8 x i64> %res -} Index: llvm/trunk/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll +++ llvm/trunk/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll @@ -1667,3 +1667,73 @@ %res = call <4 x i64> @llvm.x86.avx512.cvtmask2q.256(i8 %x0) ret <4 x i64> %res } + +declare <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double>, <4 x double>, i8) + +define <4 x double>@test_int_x86_avx512_mask_broadcastf64x2_256(<2 x double> %x0, <4 x double> %x2, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x2_256: +; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %XMM0 %XMM0 %YMM0 +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd0,0x01] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vinsertf64x2 $1, %xmm0, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x18,0xc8,0x01] +; CHECK-NEXT: vaddpd %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc9] +; CHECK-NEXT: vinsertf64x2 $1, %xmm0, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xc0,0x01] +; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + + %res1 = call <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double> %x0, <4 x double> %x2, i8 -1) + %res2 = call <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double> %x0, <4 x double> %x2, i8 %mask) + %res3 = call <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double> %x0, <4 x double> zeroinitializer, i8 %mask) + %res4 = fadd <4 x double> %res1, %res2 + %res5 = fadd <4 x double> %res3, %res4 + ret <4 x double> %res5 +} + +define <4 x double>@test_int_x86_avx512_mask_broadcastf64x2_256_load(<2 x double>* %x0ptr, <4 x double> %x2, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x2_256_load: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] +; CHECK-NEXT: vbroadcastf64x2 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x1a,0x07] +; CHECK-NEXT: ## ymm0 {%k1} = mem[0,1,0,1] +; CHECK-NEXT: retq ## encoding: [0xc3] + + %x0 = load <2 x double>, <2 x double>* %x0ptr + %res = call <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double> %x0, <4 x double> %x2, i8 %mask) + ret <4 x double> %res +} + +declare <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64>, <4 x i64>, i8) + +define <4 x i64>@test_int_x86_avx512_mask_broadcasti64x2_256(<2 x i64> %x0, <4 x i64> %x2, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x2_256: +; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %XMM0 %XMM0 %YMM0 +; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd0,0x01] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vinserti64x2 $1, %xmm0, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x38,0xc8,0x01] +; CHECK-NEXT: vpaddq %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc9] +; CHECK-NEXT: vinserti64x2 $1, %xmm0, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xc0,0x01] +; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + + %res1 = call <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64> %x0, <4 x i64> %x2, i8 -1) + %res2 = call <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64> %x0, <4 x i64> %x2, i8 %mask) + %res3 = call <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64> %x0, <4 x i64> zeroinitializer, i8 %mask) + %res4 = add <4 x i64> %res1, %res2 + %res5 = add <4 x i64> %res3, %res4 + ret <4 x i64> %res5 +} + +define <4 x i64>@test_int_x86_avx512_mask_broadcasti64x2_256_load(<2 x i64>* %x0ptr, <4 x i64> %x2, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x2_256_load: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] +; CHECK-NEXT: vbroadcasti64x2 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x5a,0x07] +; CHECK-NEXT: ## ymm0 {%k1} = mem[0,1,0,1] +; CHECK-NEXT: retq ## encoding: [0xc3] + + %x0 = load <2 x i64>, <2 x i64>* %x0ptr + %res = call <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64> %x0, <4 x i64> %x2, i8 %mask) + ret <4 x i64> %res +} Index: llvm/trunk/test/CodeGen/X86/avx512dqvl-intrinsics.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512dqvl-intrinsics.ll +++ llvm/trunk/test/CodeGen/X86/avx512dqvl-intrinsics.ll @@ -743,81 +743,3 @@ %res = call i8 @llvm.x86.avx512.cvtq2mask.256(<4 x i64> %x0) ret i8 %res } - -declare <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double>, <4 x double>, i8) - -define <4 x double>@test_int_x86_avx512_mask_broadcastf64x2_256(<2 x double> %x0, <4 x double> %x2, i8 %mask) { -; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x2_256: -; CHECK: ## BB#0: -; CHECK-NEXT: ## kill: %XMM0 %XMM0 %YMM0 -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vshuff64x2 $0, %ymm0, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x23,0xd0,0x00] -; CHECK-NEXT: ## ymm2 {%k1} {z} = ymm0[0,1,0,1] -; CHECK-NEXT: vshuff64x2 $0, %ymm0, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x23,0xc8,0x00] -; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,1,0,1] -; CHECK-NEXT: vshuff64x2 $0, %ymm0, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x23,0xc0,0x00] -; CHECK-NEXT: ## ymm0 = ymm0[0,1,0,1] -; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1] -; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] - - %res1 = call <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double> %x0, <4 x double> %x2, i8 -1) - %res2 = call <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double> %x0, <4 x double> %x2, i8 %mask) - %res3 = call <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double> %x0, <4 x double> zeroinitializer, i8 %mask) - %res4 = fadd <4 x double> %res1, %res2 - %res5 = fadd <4 x double> %res3, %res4 - ret <4 x double> %res5 -} - -define <4 x double>@test_int_x86_avx512_mask_broadcastf64x2_256_load(<2 x double>* %x0ptr, <4 x double> %x2, i8 %mask) { -; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x2_256_load: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vmovapd (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0x0f] -; CHECK-NEXT: vshuff64x2 $0, %ymm1, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf3,0xf5,0x29,0x23,0xc1,0x00] -; CHECK-NEXT: ## ymm0 {%k1} = ymm1[0,1,0,1] -; CHECK-NEXT: retq ## encoding: [0xc3] - - %x0 = load <2 x double>, <2 x double>* %x0ptr - %res = call <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double> %x0, <4 x double> %x2, i8 %mask) - ret <4 x double> %res -} - -declare <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64>, <4 x i64>, i8) - -define <4 x i64>@test_int_x86_avx512_mask_broadcasti64x2_256(<2 x i64> %x0, <4 x i64> %x2, i8 %mask) { -; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x2_256: -; CHECK: ## BB#0: -; CHECK-NEXT: ## kill: %XMM0 %XMM0 %YMM0 -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vshufi64x2 $0, %ymm0, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x43,0xd0,0x00] -; CHECK-NEXT: ## ymm2 {%k1} {z} = ymm0[0,1,0,1] -; CHECK-NEXT: vshufi64x2 $0, %ymm0, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x43,0xc8,0x00] -; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,1,0,1] -; CHECK-NEXT: vshufi64x2 $0, %ymm0, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x43,0xc0,0x00] -; CHECK-NEXT: ## ymm0 = ymm0[0,1,0,1] -; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc1] -; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] - - %res1 = call <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64> %x0, <4 x i64> %x2, i8 -1) - %res2 = call <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64> %x0, <4 x i64> %x2, i8 %mask) - %res3 = call <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64> %x0, <4 x i64> zeroinitializer, i8 %mask) - %res4 = add <4 x i64> %res1, %res2 - %res5 = add <4 x i64> %res3, %res4 - ret <4 x i64> %res5 -} - -define <4 x i64>@test_int_x86_avx512_mask_broadcasti64x2_256_load(<2 x i64>* %x0ptr, <4 x i64> %x2, i8 %mask) { -; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x2_256_load: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vmovdqa (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x0f] -; CHECK-NEXT: vshufi64x2 $0, %ymm1, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf3,0xf5,0x29,0x43,0xc1,0x00] -; CHECK-NEXT: ## ymm0 {%k1} = ymm1[0,1,0,1] -; CHECK-NEXT: retq ## encoding: [0xc3] - - %x0 = load <2 x i64>, <2 x i64>* %x0ptr - %res = call <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64> %x0, <4 x i64> %x2, i8 %mask) - ret <4 x i64> %res -} Index: llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -5701,3 +5701,69 @@ } declare i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64>, <2 x i64>, i32, i8) nounwind readnone + +declare <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float>, <8 x float>, i8) + +define <8 x float>@test_int_x86_avx512_mask_broadcastf32x4_256(<4 x float> %x0, <8 x float> %x2, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x4_256: +; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %XMM0 %XMM0 %YMM0 +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd0,0x01] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x18,0xc8,0x01] +; CHECK-NEXT: vaddps %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc9] +; CHECK-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xc0,0x01] +; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res1 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> %x2, i8 -1) + %res2 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> %x2, i8 %mask) + %res3 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> zeroinitializer, i8 %mask) + %res4 = fadd <8 x float> %res1, %res2 + %res5 = fadd <8 x float> %res3, %res4 + ret <8 x float> %res5 +} + +define <8 x float>@test_int_x86_avx512_mask_broadcastf32x4_256_load(<4 x float>* %x0ptr, <8 x float> %x2, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x4_256_load: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] +; CHECK-NEXT: vbroadcastf32x4 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x1a,0x07] +; CHECK-NEXT: ## ymm0 {%k1} = mem[0,1,2,3,0,1,2,3] +; CHECK-NEXT: retq ## encoding: [0xc3] + %x0 = load <4 x float>, <4 x float>* %x0ptr + %res = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> %x2, i8 %mask) + ret <8 x float> %res +} + +declare <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32>, <8 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_mask_broadcasti32x4_256(<4 x i32> %x0, <8 x i32> %x2, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x4_256: +; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %XMM0 %XMM0 %YMM0 +; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd0,0x01] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x38,0xc8,0x01] +; CHECK-NEXT: vpaddd %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc9] +; CHECK-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xc0,0x01] +; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res1 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> %x2, i8 -1) + %res2 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> %x2, i8 %mask) + %res3 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> zeroinitializer, i8 %mask) + %res4 = add <8 x i32> %res1, %res2 + %res5 = add <8 x i32> %res3, %res4 + ret <8 x i32> %res5 +} + +define <8 x i32>@test_int_x86_avx512_mask_broadcasti32x4_256_load(<4 x i32>* %x0ptr, <8 x i32> %x2, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x4_256_load: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] +; CHECK-NEXT: vbroadcasti32x4 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x5a,0x07] +; CHECK-NEXT: ## ymm0 {%k1} = mem[0,1,2,3,0,1,2,3] +; CHECK-NEXT: retq ## encoding: [0xc3] + %x0 = load <4 x i32>, <4 x i32>* %x0ptr + %res = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> %x2, i8 %mask) + ret <8 x i32> %res +} Index: llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll +++ llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -3426,80 +3426,6 @@ declare <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone declare <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double>, <2 x double>, i8) nounwind readnone -declare <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float>, <8 x float>, i8) - -define <8 x float>@test_int_x86_avx512_mask_broadcastf32x4_256(<4 x float> %x0, <8 x float> %x2, i8 %mask) { -; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x4_256: -; CHECK: ## BB#0: -; CHECK-NEXT: ## kill: %XMM0 %XMM0 %YMM0 -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vshuff32x4 $0, %ymm0, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x23,0xd0,0x00] -; CHECK-NEXT: ## ymm2 {%k1} {z} = ymm0[0,1,2,3,0,1,2,3] -; CHECK-NEXT: vshuff32x4 $0, %ymm0, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x23,0xc8,0x00] -; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,1,2,3,0,1,2,3] -; CHECK-NEXT: vshuff32x4 $0, %ymm0, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x23,0xc0,0x00] -; CHECK-NEXT: ## ymm0 = ymm0[0,1,2,3,0,1,2,3] -; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] -; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res1 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> %x2, i8 -1) - %res2 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> %x2, i8 %mask) - %res3 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> zeroinitializer, i8 %mask) - %res4 = fadd <8 x float> %res1, %res2 - %res5 = fadd <8 x float> %res3, %res4 - ret <8 x float> %res5 -} - -define <8 x float>@test_int_x86_avx512_mask_broadcastf32x4_256_load(<4 x float>* %x0ptr, <8 x float> %x2, i8 %mask) { -; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x4_256_load: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f] -; CHECK-NEXT: vshuff32x4 $0, %ymm1, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf3,0x75,0x29,0x23,0xc1,0x00] -; CHECK-NEXT: ## ymm0 {%k1} = ymm1[0,1,2,3,0,1,2,3] -; CHECK-NEXT: retq ## encoding: [0xc3] - %x0 = load <4 x float>, <4 x float>* %x0ptr - %res = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> %x2, i8 %mask) - ret <8 x float> %res -} - -declare <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32>, <8 x i32>, i8) - -define <8 x i32>@test_int_x86_avx512_mask_broadcasti32x4_256(<4 x i32> %x0, <8 x i32> %x2, i8 %mask) { -; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x4_256: -; CHECK: ## BB#0: -; CHECK-NEXT: ## kill: %XMM0 %XMM0 %YMM0 -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vshufi32x4 $0, %ymm0, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x43,0xd0,0x00] -; CHECK-NEXT: ## ymm2 {%k1} {z} = ymm0[0,1,2,3,0,1,2,3] -; CHECK-NEXT: vshufi32x4 $0, %ymm0, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x43,0xc8,0x00] -; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,1,2,3,0,1,2,3] -; CHECK-NEXT: vshufi32x4 $0, %ymm0, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x43,0xc0,0x00] -; CHECK-NEXT: ## ymm0 = ymm0[0,1,2,3,0,1,2,3] -; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] -; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res1 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> %x2, i8 -1) - %res2 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> %x2, i8 %mask) - %res3 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> zeroinitializer, i8 %mask) - %res4 = add <8 x i32> %res1, %res2 - %res5 = add <8 x i32> %res3, %res4 - ret <8 x i32> %res5 -} - -define <8 x i32>@test_int_x86_avx512_mask_broadcasti32x4_256_load(<4 x i32>* %x0ptr, <8 x i32> %x2, i8 %mask) { -; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x4_256_load: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vmovdqa (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x0f] -; CHECK-NEXT: vshufi32x4 $0, %ymm1, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf3,0x75,0x29,0x43,0xc1,0x00] -; CHECK-NEXT: ## ymm0 {%k1} = ymm1[0,1,2,3,0,1,2,3] -; CHECK-NEXT: retq ## encoding: [0xc3] - %x0 = load <4 x i32>, <4 x i32>* %x0ptr - %res = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> %x2, i8 %mask) - ret <8 x i32> %res -} - declare <4 x i32> @llvm.x86.avx512.mask.prorv.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) define <4 x i32>@test_int_x86_avx512_mask_prorv_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {