Skip to content

Commit 561092f

Browse files
committed Aug 11, 2017
[AVX512] Remove and autoupgrade many of the broadcast intrinsics
Summary: This autoupgrades most of the broadcast intrinsics. They've been unused in clang for some time. This leaves the 32x2 intrinsics because they are still used in clang.

Reviewers: RKSimon, zvi, igorb

Reviewed By: RKSimon

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D36606

llvm-svn: 310725
1 parent 0f30fe9 commit 561092f

12 files changed

+435
-533
lines changed
 

‎llvm/include/llvm/IR/IntrinsicsX86.td

-60
Original file line numberDiff line numberDiff line change
@@ -4440,66 +4440,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
44404440
Intrinsic<[llvm_v16i32_ty],
44414441
[llvm_v4i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
44424442

4443-
def int_x86_avx512_mask_broadcastf32x4_256 :
4444-
GCCBuiltin<"__builtin_ia32_broadcastf32x4_256_mask">,
4445-
Intrinsic<[llvm_v8f32_ty],
4446-
[llvm_v4f32_ty, llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>;
4447-
4448-
def int_x86_avx512_mask_broadcastf32x4_512 :
4449-
GCCBuiltin<"__builtin_ia32_broadcastf32x4_512">,
4450-
Intrinsic<[llvm_v16f32_ty],
4451-
[llvm_v4f32_ty, llvm_v16f32_ty, llvm_i16_ty], [IntrNoMem]>;
4452-
4453-
def int_x86_avx512_mask_broadcastf32x8_512 :
4454-
GCCBuiltin<"__builtin_ia32_broadcastf32x8_512_mask">,
4455-
Intrinsic<[llvm_v16f32_ty],
4456-
[llvm_v8f32_ty, llvm_v16f32_ty, llvm_i16_ty], [IntrNoMem]>;
4457-
4458-
def int_x86_avx512_mask_broadcastf64x2_256 :
4459-
GCCBuiltin<"__builtin_ia32_broadcastf64x2_256_mask">,
4460-
Intrinsic<[llvm_v4f64_ty],
4461-
[llvm_v2f64_ty, llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>;
4462-
4463-
def int_x86_avx512_mask_broadcastf64x2_512 :
4464-
GCCBuiltin<"__builtin_ia32_broadcastf64x2_512_mask">,
4465-
Intrinsic<[llvm_v8f64_ty],
4466-
[llvm_v2f64_ty, llvm_v8f64_ty, llvm_i8_ty], [IntrNoMem]>;
4467-
4468-
def int_x86_avx512_mask_broadcastf64x4_512 :
4469-
GCCBuiltin<"__builtin_ia32_broadcastf64x4_512">,
4470-
Intrinsic<[llvm_v8f64_ty],
4471-
[llvm_v4f64_ty, llvm_v8f64_ty, llvm_i8_ty], [IntrNoMem]>;
4472-
4473-
def int_x86_avx512_mask_broadcasti32x4_256 :
4474-
GCCBuiltin<"__builtin_ia32_broadcasti32x4_256_mask">,
4475-
Intrinsic<[llvm_v8i32_ty],
4476-
[llvm_v4i32_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
4477-
4478-
def int_x86_avx512_mask_broadcasti32x4_512 :
4479-
GCCBuiltin<"__builtin_ia32_broadcasti32x4_512">,
4480-
Intrinsic<[llvm_v16i32_ty],
4481-
[llvm_v4i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
4482-
4483-
def int_x86_avx512_mask_broadcasti32x8_512 :
4484-
GCCBuiltin<"__builtin_ia32_broadcasti32x8_512_mask">,
4485-
Intrinsic<[llvm_v16i32_ty],
4486-
[llvm_v8i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
4487-
4488-
def int_x86_avx512_mask_broadcasti64x2_256 :
4489-
GCCBuiltin<"__builtin_ia32_broadcasti64x2_256_mask">,
4490-
Intrinsic<[llvm_v4i64_ty],
4491-
[llvm_v2i64_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
4492-
4493-
def int_x86_avx512_mask_broadcasti64x2_512 :
4494-
GCCBuiltin<"__builtin_ia32_broadcasti64x2_512_mask">,
4495-
Intrinsic<[llvm_v8i64_ty],
4496-
[llvm_v2i64_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
4497-
4498-
def int_x86_avx512_mask_broadcasti64x4_512 :
4499-
GCCBuiltin<"__builtin_ia32_broadcasti64x4_512">,
4500-
Intrinsic<[llvm_v8i64_ty],
4501-
[llvm_v4i64_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
4502-
45034443
def int_x86_avx512_broadcastmw_512 :
45044444
GCCBuiltin<"__builtin_ia32_broadcastmw512">,
45054445
Intrinsic<[llvm_v16i32_ty], [llvm_i16_ty], [IntrNoMem]>;

‎llvm/lib/IR/AutoUpgrade.cpp

+23
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,14 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
239239
Name.startswith("avx2.pblendd.") || // Added in 3.7
240240
Name.startswith("avx.vbroadcastf128") || // Added in 4.0
241241
Name == "avx2.vbroadcasti128" || // Added in 3.7
242+
Name.startswith("avx512.mask.broadcastf32x4.") || // Added in 6.0
243+
Name.startswith("avx512.mask.broadcastf64x2.") || // Added in 6.0
244+
Name.startswith("avx512.mask.broadcasti32x4.") || // Added in 6.0
245+
Name.startswith("avx512.mask.broadcasti64x2.") || // Added in 6.0
246+
Name == "avx512.mask.broadcastf32x8.512" || // Added in 6.0
247+
Name == "avx512.mask.broadcasti32x8.512" || // Added in 6.0
248+
Name == "avx512.mask.broadcastf64x4.512" || // Added in 6.0
249+
Name == "avx512.mask.broadcasti64x4.512" || // Added in 6.0
242250
Name == "xop.vpcmov" || // Added in 3.8
243251
Name == "xop.vpcmov.256" || // Added in 5.0
244252
Name.startswith("avx512.mask.move.s") || // Added in 4.0
@@ -1221,6 +1229,21 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
12211229
else
12221230
Rep = Builder.CreateShuffleVector(Load, UndefValue::get(Load->getType()),
12231231
{ 0, 1, 2, 3, 0, 1, 2, 3 });
1232+
} else if (IsX86 && (Name.startswith("avx512.mask.broadcastf") ||
1233+
Name.startswith("avx512.mask.broadcasti"))) {
1234+
unsigned NumSrcElts =
1235+
CI->getArgOperand(0)->getType()->getVectorNumElements();
1236+
unsigned NumDstElts = CI->getType()->getVectorNumElements();
1237+
1238+
SmallVector<uint32_t, 8> ShuffleMask(NumDstElts);
1239+
for (unsigned i = 0; i != NumDstElts; ++i)
1240+
ShuffleMask[i] = i % NumSrcElts;
1241+
1242+
Rep = Builder.CreateShuffleVector(CI->getArgOperand(0),
1243+
CI->getArgOperand(0),
1244+
ShuffleMask);
1245+
Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
1246+
CI->getArgOperand(1));
12241247
} else if (IsX86 && (Name.startswith("avx2.pbroadcast") ||
12251248
Name.startswith("avx2.vbroadcast") ||
12261249
Name.startswith("avx512.pbroadcast") ||

‎llvm/lib/Target/X86/X86ISelLowering.cpp

-17
Original file line numberDiff line numberDiff line change
@@ -19908,23 +19908,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
1990819908
DAG.getIntPtrConstant(0, dl));
1990919909
return DAG.getBitcast(Op.getValueType(), Res);
1991019910
}
19911-
case BRCST_SUBVEC_TO_VEC: {
19912-
SDValue Src = Op.getOperand(1);
19913-
SDValue Passthru = Op.getOperand(2);
19914-
SDValue Mask = Op.getOperand(3);
19915-
EVT resVT = Passthru.getValueType();
19916-
SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT,
19917-
DAG.getUNDEF(resVT), Src,
19918-
DAG.getIntPtrConstant(0, dl));
19919-
SDValue immVal;
19920-
if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector())
19921-
immVal = DAG.getConstant(0x44, dl, MVT::i8);
19922-
else
19923-
immVal = DAG.getConstant(0, dl, MVT::i8);
19924-
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19925-
subVec, subVec, immVal),
19926-
Mask, Passthru, Subtarget, DAG);
19927-
}
1992819911
case BRCST32x2_TO_VEC: {
1992919912
SDValue Src = Op.getOperand(1);
1993019913
SDValue PassThru = Op.getOperand(2);

‎llvm/lib/Target/X86/X86IntrinsicsInfo.h

+1-25
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ enum IntrinsicType : uint16_t {
3232
FMA_OP_SCALAR_MASK, FMA_OP_SCALAR_MASKZ, FMA_OP_SCALAR_MASK3,
3333
VPERM_2OP_MASK, VPERM_3OP_MASK, VPERM_3OP_MASKZ, INTR_TYPE_SCALAR_MASK,
3434
INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK_RM,
35-
COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, BRCST_SUBVEC_TO_VEC, BRCST32x2_TO_VEC,
35+
COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, BRCST32x2_TO_VEC,
3636
TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
3737
EXPAND_FROM_MEM,
3838
TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
@@ -482,36 +482,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
482482
X86ISD::VBROADCAST, 0),
483483
X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_512, BRCST32x2_TO_VEC,
484484
X86ISD::VBROADCAST, 0),
485-
X86_INTRINSIC_DATA(avx512_mask_broadcastf32x4_256, BRCST_SUBVEC_TO_VEC,
486-
X86ISD::SHUF128, 0),
487-
X86_INTRINSIC_DATA(avx512_mask_broadcastf32x4_512, BRCST_SUBVEC_TO_VEC,
488-
X86ISD::SHUF128, 0),
489-
X86_INTRINSIC_DATA(avx512_mask_broadcastf32x8_512, BRCST_SUBVEC_TO_VEC,
490-
X86ISD::SHUF128, 0),
491-
X86_INTRINSIC_DATA(avx512_mask_broadcastf64x2_256, BRCST_SUBVEC_TO_VEC,
492-
X86ISD::SHUF128, 0),
493-
X86_INTRINSIC_DATA(avx512_mask_broadcastf64x2_512, BRCST_SUBVEC_TO_VEC,
494-
X86ISD::SHUF128, 0),
495-
X86_INTRINSIC_DATA(avx512_mask_broadcastf64x4_512, BRCST_SUBVEC_TO_VEC,
496-
X86ISD::SHUF128, 0),
497485
X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_128, BRCST32x2_TO_VEC,
498486
X86ISD::VBROADCAST, 0),
499487
X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_256, BRCST32x2_TO_VEC,
500488
X86ISD::VBROADCAST, 0),
501489
X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_512, BRCST32x2_TO_VEC,
502490
X86ISD::VBROADCAST, 0),
503-
X86_INTRINSIC_DATA(avx512_mask_broadcasti32x4_256, BRCST_SUBVEC_TO_VEC,
504-
X86ISD::SHUF128, 0),
505-
X86_INTRINSIC_DATA(avx512_mask_broadcasti32x4_512, BRCST_SUBVEC_TO_VEC,
506-
X86ISD::SHUF128, 0),
507-
X86_INTRINSIC_DATA(avx512_mask_broadcasti32x8_512, BRCST_SUBVEC_TO_VEC,
508-
X86ISD::SHUF128, 0),
509-
X86_INTRINSIC_DATA(avx512_mask_broadcasti64x2_256, BRCST_SUBVEC_TO_VEC,
510-
X86ISD::SHUF128, 0),
511-
X86_INTRINSIC_DATA(avx512_mask_broadcasti64x2_512, BRCST_SUBVEC_TO_VEC,
512-
X86ISD::SHUF128, 0),
513-
X86_INTRINSIC_DATA(avx512_mask_broadcasti64x4_512, BRCST_SUBVEC_TO_VEC,
514-
X86ISD::SHUF128, 0),
515491
X86_INTRINSIC_DATA(avx512_mask_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPM, 0),
516492
X86_INTRINSIC_DATA(avx512_mask_cmp_pd_256, CMP_MASK_CC, X86ISD::CMPM, 0),
517493
X86_INTRINSIC_DATA(avx512_mask_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPM,

‎llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll

+137
Original file line numberDiff line numberDiff line change
@@ -3444,3 +3444,140 @@ define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
34443444
}
34453445

34463446
declare i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone
3447+
3448+
declare <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float>, <16 x float>, i16)
3449+
3450+
define <16 x float>@test_int_x86_avx512_mask_broadcastf32x4_512(<4 x float> %x0, <16 x float> %x2, i16 %mask) {
3451+
; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x4_512:
3452+
; CHECK: ## BB#0:
3453+
; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
3454+
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
3455+
; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
3456+
; CHECK-NEXT: kmovw %edi, %k1
3457+
; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1}
3458+
; CHECK-NEXT: vmovaps %zmm0, %zmm2 {%k1} {z}
3459+
; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
3460+
; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0
3461+
; CHECK-NEXT: retq
3462+
3463+
%res1 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 -1)
3464+
%res2 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 %mask)
3465+
%res3 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> zeroinitializer, i16 %mask)
3466+
%res4 = fadd <16 x float> %res1, %res2
3467+
%res5 = fadd <16 x float> %res3, %res4
3468+
ret <16 x float> %res5
3469+
}
3470+
3471+
define <16 x float>@test_int_x86_avx512_mask_broadcastf32x4_512_load(<4 x float>* %x0ptr, <16 x float> %x2, i16 %mask) {
3472+
; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x4_512_load:
3473+
; CHECK: ## BB#0:
3474+
; CHECK-NEXT: kmovw %esi, %k1
3475+
; CHECK-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3476+
; CHECK-NEXT: retq
3477+
%x0 = load <4 x float>, <4 x float>* %x0ptr
3478+
%res = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 %mask)
3479+
ret <16 x float> %res
3480+
}
3481+
3482+
declare <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double>, <8 x double>, i8)
3483+
3484+
define <8 x double>@test_int_x86_avx512_mask_broadcastf64x4_512(<4 x double> %x0, <8 x double> %x2, i8 %mask) {
3485+
; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x4_512:
3486+
; CHECK: ## BB#0:
3487+
; CHECK-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
3488+
; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm2
3489+
; CHECK-NEXT: kmovw %edi, %k1
3490+
; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm1 {%k1}
3491+
; CHECK-NEXT: vaddpd %zmm1, %zmm2, %zmm1
3492+
; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z}
3493+
; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
3494+
; CHECK-NEXT: retq
3495+
3496+
%res1 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 -1)
3497+
%res2 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 %mask)
3498+
%res3 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> zeroinitializer, i8 %mask)
3499+
%res4 = fadd <8 x double> %res1, %res2
3500+
%res5 = fadd <8 x double> %res3, %res4
3501+
ret <8 x double> %res5
3502+
}
3503+
3504+
define <8 x double>@test_int_x86_avx512_mask_broadcastf64x4_512_load(<4 x double>* %x0ptr, <8 x double> %x2, i8 %mask) {
3505+
; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x4_512_load:
3506+
; CHECK: ## BB#0:
3507+
; CHECK-NEXT: kmovw %esi, %k1
3508+
; CHECK-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
3509+
; CHECK-NEXT: retq
3510+
3511+
%x0 = load <4 x double>, <4 x double>* %x0ptr
3512+
%res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 %mask)
3513+
ret <8 x double> %res
3514+
}
3515+
3516+
declare <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32>, <16 x i32>, i16)
3517+
3518+
define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x4_512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask) {
3519+
; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x4_512:
3520+
; CHECK: ## BB#0:
3521+
; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
3522+
; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
3523+
; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
3524+
; CHECK-NEXT: kmovw %edi, %k1
3525+
; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
3526+
; CHECK-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} {z}
3527+
; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
3528+
; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
3529+
; CHECK-NEXT: retq
3530+
3531+
%res1 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 -1)
3532+
%res2 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask)
3533+
%res3 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask)
3534+
%res4 = add <16 x i32> %res1, %res2
3535+
%res5 = add <16 x i32> %res3, %res4
3536+
ret <16 x i32> %res5
3537+
}
3538+
3539+
define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x4_512_load(<4 x i32>* %x0ptr, <16 x i32> %x2, i16 %mask) {
3540+
; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x4_512_load:
3541+
; CHECK: ## BB#0:
3542+
; CHECK-NEXT: kmovw %esi, %k1
3543+
; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3544+
; CHECK-NEXT: retq
3545+
3546+
%x0 = load <4 x i32>, <4 x i32>* %x0ptr
3547+
%res = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask)
3548+
ret <16 x i32> %res
3549+
}
3550+
3551+
declare <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64>, <8 x i64>, i8)
3552+
3553+
define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask) {
3554+
; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x4_512:
3555+
; CHECK: ## BB#0:
3556+
; CHECK-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
3557+
; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2
3558+
; CHECK-NEXT: kmovw %edi, %k1
3559+
; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1}
3560+
; CHECK-NEXT: vpaddq %zmm1, %zmm2, %zmm1
3561+
; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z}
3562+
; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
3563+
; CHECK-NEXT: retq
3564+
3565+
%res1 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 -1)
3566+
%res2 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask)
3567+
%res3 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> zeroinitializer, i8 %mask)
3568+
%res4 = add <8 x i64> %res1, %res2
3569+
%res5 = add <8 x i64> %res3, %res4
3570+
ret <8 x i64> %res5
3571+
}
3572+
3573+
define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512_load(<4 x i64>* %x0ptr, <8 x i64> %x2, i8 %mask) {
3574+
; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x4_512_load:
3575+
; CHECK: ## BB#0:
3576+
; CHECK-NEXT: kmovw %esi, %k1
3577+
; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
3578+
; CHECK-NEXT: retq
3579+
3580+
%x0 = load <4 x i64>, <4 x i64>* %x0ptr
3581+
%res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask)
3582+
ret <8 x i64> %res
3583+
}

0 commit comments

Comments
 (0)
Please sign in to comment.