Index: include/llvm/IR/IntrinsicsX86.td =================================================================== --- include/llvm/IR/IntrinsicsX86.td +++ include/llvm/IR/IntrinsicsX86.td @@ -1971,22 +1971,59 @@ GCCBuiltin<"__builtin_ia32_maskstoreps256">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i32_ty, llvm_v8f32_ty], [IntrReadWriteArgMem]>; + + def int_x86_avx512_mask_storeu_ps_128 : + GCCBuiltin<"__builtin_ia32_storeups128_mask">, + Intrinsic<[], [llvm_ptr_ty, llvm_v4f32_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_storeu_ps_256 : + GCCBuiltin<"__builtin_ia32_storeups256_mask">, + Intrinsic<[], [llvm_ptr_ty, llvm_v8f32_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; def int_x86_avx512_mask_storeu_ps_512 : GCCBuiltin<"__builtin_ia32_storeups512_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v16f32_ty, llvm_i16_ty], [IntrReadWriteArgMem]>; + + def int_x86_avx512_mask_storeu_pd_128 : + GCCBuiltin<"__builtin_ia32_storeupd128_mask">, + Intrinsic<[], [llvm_ptr_ty, llvm_v2f64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_storeu_pd_256 : + GCCBuiltin<"__builtin_ia32_storeupd256_mask">, + Intrinsic<[], [llvm_ptr_ty, llvm_v4f64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; def int_x86_avx512_mask_storeu_pd_512 : GCCBuiltin<"__builtin_ia32_storeupd512_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v8f64_ty, llvm_i8_ty], [IntrReadWriteArgMem]>; + + def int_x86_avx512_mask_store_ps_128 : + GCCBuiltin<"__builtin_ia32_storeaps128_mask">, + Intrinsic<[], [llvm_ptr_ty, llvm_v4f32_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_store_ps_256 : + GCCBuiltin<"__builtin_ia32_storeaps256_mask">, + Intrinsic<[], [llvm_ptr_ty, llvm_v8f32_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; def int_x86_avx512_mask_store_ps_512 : GCCBuiltin<"__builtin_ia32_storeaps512_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v16f32_ty, llvm_i16_ty], [IntrReadWriteArgMem]>; + + def int_x86_avx512_mask_store_pd_128 : + GCCBuiltin<"__builtin_ia32_storeapd128_mask">, + Intrinsic<[], 
[llvm_ptr_ty, llvm_v2f64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_store_pd_256 : + GCCBuiltin<"__builtin_ia32_storeapd256_mask">, + Intrinsic<[], [llvm_ptr_ty, llvm_v4f64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; def int_x86_avx512_mask_store_pd_512 : GCCBuiltin<"__builtin_ia32_storeapd512_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v8f64_ty, llvm_i8_ty], - [IntrReadWriteArgMem]>; + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_store_ss : GCCBuiltin<"__builtin_ia32_storess_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v4f32_ty, llvm_i8_ty], @@ -2894,14 +2931,84 @@ GCCBuiltin<"__builtin_ia32_maskstoreq256">, Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty, llvm_v4i64_ty], [IntrReadWriteArgMem]>; + + def int_x86_avx512_mask_storeu_b_128 : + GCCBuiltin<"__builtin_ia32_storedquqi128_mask">, + Intrinsic<[], [llvm_ptr_ty, llvm_v16i8_ty, llvm_i16_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_storeu_b_256 : + GCCBuiltin<"__builtin_ia32_storedquqi256_mask">, + Intrinsic<[], [llvm_ptr_ty, llvm_v32i8_ty, llvm_i32_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_storeu_b_512 : + GCCBuiltin<"__builtin_ia32_storedquqi512_mask">, + Intrinsic<[], [llvm_ptr_ty, llvm_v64i8_ty, llvm_i64_ty], + [IntrReadWriteArgMem]>; + + def int_x86_avx512_mask_storeu_w_128 : + GCCBuiltin<"__builtin_ia32_storedquhi128_mask">, + Intrinsic<[], [llvm_ptr_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_storeu_w_256 : + GCCBuiltin<"__builtin_ia32_storedquhi256_mask">, + Intrinsic<[], [llvm_ptr_ty, llvm_v16i16_ty, llvm_i16_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_storeu_w_512 : + GCCBuiltin<"__builtin_ia32_storedquhi512_mask">, + Intrinsic<[], [llvm_ptr_ty, llvm_v32i16_ty, llvm_i32_ty], + [IntrReadWriteArgMem]>; + + def int_x86_avx512_mask_storeu_d_128 : + GCCBuiltin<"__builtin_ia32_storedqusi128_mask">, + Intrinsic<[], [llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def 
int_x86_avx512_mask_storeu_d_256 : + GCCBuiltin<"__builtin_ia32_storedqusi256_mask">, + Intrinsic<[], [llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; def int_x86_avx512_mask_storeu_d_512 : GCCBuiltin<"__builtin_ia32_storedqusi512_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrReadWriteArgMem]>; + + def int_x86_avx512_mask_storeu_q_128 : + GCCBuiltin<"__builtin_ia32_storedqudi128_mask">, + Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_storeu_q_256 : + GCCBuiltin<"__builtin_ia32_storedqudi256_mask">, + Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; def int_x86_avx512_mask_storeu_q_512 : GCCBuiltin<"__builtin_ia32_storedqudi512_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrReadWriteArgMem]>; + + def int_x86_avx512_mask_store_d_128 : + GCCBuiltin<"__builtin_ia32_movdqa32store128_mask">, + Intrinsic<[], [llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_store_d_256 : + GCCBuiltin<"__builtin_ia32_movdqa32store256_mask">, + Intrinsic<[], [llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_store_d_512 : + GCCBuiltin<"__builtin_ia32_movdqa32store512_mask">, + Intrinsic<[], [llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty], + [IntrReadWriteArgMem]>; + + def int_x86_avx512_mask_store_q_128 : + GCCBuiltin<"__builtin_ia32_movdqa64store128_mask">, + Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_store_q_256 : + GCCBuiltin<"__builtin_ia32_movdqa64store256_mask">, + Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_store_q_512 : + GCCBuiltin<"__builtin_ia32_movdqa64store512_mask">, + Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; } // Variable bit shift ops Index: 
lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -4227,6 +4227,14 @@ Info.writeMem = true; break; } + case STOREA: + case STOREU: { + Info.ptrVal = I.getArgOperand(0); + Info.memVT = MVT::getVT(I.getArgOperand(1)->getType()); + Info.align = (IntrData->Type == STOREA ? Info.memVT.getSizeInBits()/8 : 1); + Info.writeMem = true; + break; + } default: return false; } @@ -17659,6 +17667,26 @@ return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT, MemIntr->getMemOperand(), ISD::NON_EXTLOAD); } + case STOREU: + case STOREA: { + SDValue Mask = Op.getOperand(4); + SDValue Data = Op.getOperand(3); + SDValue Addr = Op.getOperand(2); + SDValue Chain = Op.getOperand(0); + + MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op); + assert(MemIntr && "Expected MemIntrinsicSDNode!"); + + if (isAllOnesConstant(Mask)) // return just a store + return DAG.getStore(Chain, dl, Data, Addr, MemIntr->getMemOperand()); + + EVT VT = MemIntr->getMemoryVT(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); + + return DAG.getMaskedStore(Chain, dl, Data, Addr, VMask, VT, + MemIntr->getMemOperand(), false); + } } } Index: lib/Target/X86/X86InstrAVX512.td =================================================================== --- lib/Target/X86/X86InstrAVX512.td +++ lib/Target/X86/X86InstrAVX512.td @@ -2707,24 +2707,6 @@ avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>; -def: Pat<(int_x86_avx512_mask_storeu_ps_512 addr:$ptr, (v16f32 VR512:$src), - GR16:$mask), - (VMOVUPSZmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), - VR512:$src)>; -def: Pat<(int_x86_avx512_mask_storeu_pd_512 addr:$ptr, (v8f64 VR512:$src), - GR8:$mask), - (VMOVUPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), - VR512:$src)>; - -def: 
Pat<(int_x86_avx512_mask_store_ps_512 addr:$ptr, (v16f32 VR512:$src), - GR16:$mask), - (VMOVAPSZmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), - VR512:$src)>; -def: Pat<(int_x86_avx512_mask_store_pd_512 addr:$ptr, (v8f64 VR512:$src), - GR8:$mask), - (VMOVAPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), - VR512:$src)>; - defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info, HasAVX512>, avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info, @@ -2759,15 +2741,6 @@ (bc_v8i64 (v16i32 immAllZerosV)), GR8:$mask)), (VMOVDQU64Zrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>; -def: Pat<(int_x86_avx512_mask_storeu_d_512 addr:$ptr, (v16i32 VR512:$src), - GR16:$mask), - (VMOVDQU32Zmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), - VR512:$src)>; -def: Pat<(int_x86_avx512_mask_storeu_q_512 addr:$ptr, (v8i64 VR512:$src), - GR8:$mask), - (VMOVDQU64Zmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), - VR512:$src)>; - let AddedComplexity = 20 in { def : Pat<(v8i64 (vselect VK8WM:$mask, (v8i64 VR512:$src), (bc_v8i64 (v16i32 immAllZerosV)))), Index: lib/Target/X86/X86IntrinsicsInfo.h =================================================================== --- lib/Target/X86/X86IntrinsicsInfo.h +++ lib/Target/X86/X86IntrinsicsInfo.h @@ -29,8 +29,9 @@ INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK_RM, COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, BRCST_SUBVEC_TO_VEC, TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32, - EXPAND_FROM_MEM, LOADA, LOADU, BLEND, INSERT_SUBVEC, - TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK + EXPAND_FROM_MEM, LOADA, LOADU, STOREA, STOREU, BLEND, INSERT_SUBVEC, + TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, + CONVERT_MASK_TO_VEC, CONVERT_TO_MASK }; struct IntrinsicData { @@ -197,6 +198,36 @@ X86ISD::VTRUNC, 0), X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_512, TRUNCATE_TO_MEM_VI8, X86ISD::VTRUNC, 0), + 
X86_INTRINSIC_DATA(avx512_mask_store_d_128, STOREA, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_store_d_256, STOREA, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_store_d_512, STOREA, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_store_pd_128, STOREA, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_store_pd_256, STOREA, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_store_pd_512, STOREA, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_store_ps_128, STOREA, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_store_ps_256, STOREA, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_store_ps_512, STOREA, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_store_q_128, STOREA, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_store_q_256, STOREA, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_store_q_512, STOREA, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_storeu_b_128, STOREU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_storeu_b_256, STOREU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_storeu_b_512, STOREU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_storeu_d_128, STOREU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_storeu_d_256, STOREU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_storeu_d_512, STOREU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_storeu_pd_128, STOREU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_storeu_pd_256, STOREU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_storeu_pd_512, STOREU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_storeu_ps_128, STOREU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_storeu_ps_256, STOREU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_storeu_ps_512, STOREU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_storeu_q_128, STOREU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_storeu_q_256, 
STOREU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_storeu_q_512, STOREU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_storeu_w_128, STOREU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_storeu_w_256, STOREU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_storeu_w_512, STOREU, ISD::DELETED_NODE, 0), X86_INTRINSIC_DATA(avx512_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0), X86_INTRINSIC_DATA(avx512_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0), X86_INTRINSIC_DATA(avx512_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0), Index: test/CodeGen/X86/avx512-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512-intrinsics.ll +++ test/CodeGen/X86/avx512-intrinsics.ll @@ -858,49 +858,57 @@ } declare i16 @llvm.x86.avx512.mask.ptestm.d.512(<16 x i32>, <16 x i32>, i16) -define void @test_store1(<16 x float> %data, i8* %ptr, i16 %mask) { +define void @test_store1(<16 x float> %data, i8* %ptr, i8* %ptr2, i16 %mask) { ; CHECK-LABEL: test_store1: ; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: kmovw %edx, %k1 ; CHECK-NEXT: vmovups %zmm0, (%rdi) {%k1} +; CHECK-NEXT: vmovups %zmm0, (%rsi) ; CHECK-NEXT: retq call void @llvm.x86.avx512.mask.storeu.ps.512(i8* %ptr, <16 x float> %data, i16 %mask) + call void @llvm.x86.avx512.mask.storeu.ps.512(i8* %ptr2, <16 x float> %data, i16 -1) ret void } declare void @llvm.x86.avx512.mask.storeu.ps.512(i8*, <16 x float>, i16 ) -define void @test_store2(<8 x double> %data, i8* %ptr, i8 %mask) { +define void @test_store2(<8 x double> %data, i8* %ptr, i8* %ptr2, i8 %mask) { ; CHECK-LABEL: test_store2: ; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: kmovw %edx, %k1 ; CHECK-NEXT: vmovupd %zmm0, (%rdi) {%k1} +; CHECK-NEXT: vmovupd %zmm0, (%rsi) ; CHECK-NEXT: retq call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr, <8 x double> %data, i8 %mask) + call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr2, <8 x double> 
%data, i8 -1) ret void } declare void @llvm.x86.avx512.mask.storeu.pd.512(i8*, <8 x double>, i8) -define void @test_mask_store_aligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) { +define void @test_mask_store_aligned_ps(<16 x float> %data, i8* %ptr, i8* %ptr2, i16 %mask) { ; CHECK-LABEL: test_mask_store_aligned_ps: ; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: kmovw %edx, %k1 ; CHECK-NEXT: vmovaps %zmm0, (%rdi) {%k1} +; CHECK-NEXT: vmovaps %zmm0, (%rsi) ; CHECK-NEXT: retq call void @llvm.x86.avx512.mask.store.ps.512(i8* %ptr, <16 x float> %data, i16 %mask) + call void @llvm.x86.avx512.mask.store.ps.512(i8* %ptr2, <16 x float> %data, i16 -1) ret void } declare void @llvm.x86.avx512.mask.store.ps.512(i8*, <16 x float>, i16 ) -define void @test_mask_store_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) { +define void @test_mask_store_aligned_pd(<8 x double> %data, i8* %ptr, i8* %ptr2, i8 %mask) { ; CHECK-LABEL: test_mask_store_aligned_pd: ; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: kmovw %edx, %k1 ; CHECK-NEXT: vmovapd %zmm0, (%rdi) {%k1} +; CHECK-NEXT: vmovapd %zmm0, (%rsi) ; CHECK-NEXT: retq call void @llvm.x86.avx512.mask.store.pd.512(i8* %ptr, <8 x double> %data, i8 %mask) + call void @llvm.x86.avx512.mask.store.pd.512(i8* %ptr2, <8 x double> %data, i8 -1) ret void } @@ -922,6 +930,62 @@ ret <16 x float> %res4 } +declare void @llvm.x86.avx512.mask.storeu.q.512(i8*, <8 x i64>, i8) + +define void@test_int_x86_avx512_mask_storeu_q_512(i8* %ptr1, i8* %ptr2, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_storeu_q_512: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edx, %k1 +; CHECK-NEXT: vmovdqu64 %zmm0, (%rdi) {%k1} +; CHECK-NEXT: vmovdqu64 %zmm0, (%rsi) +; CHECK-NEXT: retq + call void @llvm.x86.avx512.mask.storeu.q.512(i8* %ptr1, <8 x i64> %x1, i8 %x2) + call void @llvm.x86.avx512.mask.storeu.q.512(i8* %ptr2, <8 x i64> %x1, i8 -1) + ret void +} + +declare void @llvm.x86.avx512.mask.storeu.d.512(i8*, <16 x i32>, 
i16) + +define void@test_int_x86_avx512_mask_storeu_d_512(i8* %ptr1, i8* %ptr2, <16 x i32> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_storeu_d_512: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edx, %k1 +; CHECK-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} +; CHECK-NEXT: vmovdqu32 %zmm0, (%rsi) +; CHECK-NEXT: retq + call void @llvm.x86.avx512.mask.storeu.d.512(i8* %ptr1, <16 x i32> %x1, i16 %x2) + call void @llvm.x86.avx512.mask.storeu.d.512(i8* %ptr2, <16 x i32> %x1, i16 -1) + ret void +} + +declare void @llvm.x86.avx512.mask.store.q.512(i8*, <8 x i64>, i8) + +define void@test_int_x86_avx512_mask_store_q_512(i8* %ptr1, i8* %ptr2, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_store_q_512: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edx, %k1 +; CHECK-NEXT: vmovdqa64 %zmm0, (%rdi) {%k1} +; CHECK-NEXT: vmovdqa64 %zmm0, (%rsi) +; CHECK-NEXT: retq + call void @llvm.x86.avx512.mask.store.q.512(i8* %ptr1, <8 x i64> %x1, i8 %x2) + call void @llvm.x86.avx512.mask.store.q.512(i8* %ptr2, <8 x i64> %x1, i8 -1) + ret void +} + +declare void @llvm.x86.avx512.mask.store.d.512(i8*, <16 x i32>, i16) + +define void@test_int_x86_avx512_mask_store_d_512(i8* %ptr1, i8* %ptr2, <16 x i32> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_store_d_512: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edx, %k1 +; CHECK-NEXT: vmovdqa32 %zmm0, (%rdi) {%k1} +; CHECK-NEXT: vmovdqa32 %zmm0, (%rsi) +; CHECK-NEXT: retq + call void @llvm.x86.avx512.mask.store.d.512(i8* %ptr1, <16 x i32> %x1, i16 %x2) + call void @llvm.x86.avx512.mask.store.d.512(i8* %ptr2, <16 x i32> %x1, i16 -1) + ret void +} + declare <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8*, <16 x float>, i16) define <16 x float> @test_mask_load_unaligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) { @@ -6897,8 +6961,6 @@ ret <8 x i64> %res4 } - - declare <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float>, <16 x i32>, <16 x float>, i16) define <16 x float>@test_int_x86_avx512_mask_permvar_sf_512(<16 x float> 
%x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) { @@ -6938,4 +7000,3 @@ %res4 = add <16 x i32> %res3, %res2 ret <16 x i32> %res4 } - Index: test/CodeGen/X86/avx512bw-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512bw-intrinsics.ll +++ test/CodeGen/X86/avx512bw-intrinsics.ll @@ -3107,7 +3107,6 @@ ret <32 x i16> %res4 } - declare <32 x i16> @llvm.x86.avx512.mask.pmovsxb.w.512(<32 x i8>, <32 x i16>, i32) define <32 x i16>@test_int_x86_avx512_mask_pmovsxb_w_512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2) { @@ -3147,3 +3146,51 @@ %res4 = add <32 x i16> %res3, %res2 ret <32 x i16> %res4 } + +declare void @llvm.x86.avx512.mask.storeu.b.512(i8*, <64 x i8>, i64) + +define void@test_int_x86_avx512_mask_storeu_b_512(i8* %ptr1, i8* %ptr2, <64 x i8> %x1, i64 %x2) { +; AVX512BW-LABEL: test_int_x86_avx512_mask_storeu_b_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovq %rdx, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_storeu_b_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1 +; AVX512F-32-NEXT: vmovdqu8 %zmm0, (%ecx) {%k1} +; AVX512F-32-NEXT: vmovdqu8 %zmm0, (%eax) +; AVX512F-32-NEXT: retl + call void @llvm.x86.avx512.mask.storeu.b.512(i8* %ptr1, <64 x i8> %x1, i64 %x2) + call void @llvm.x86.avx512.mask.storeu.b.512(i8* %ptr2, <64 x i8> %x1, i64 -1) + ret void +} + +declare void @llvm.x86.avx512.mask.storeu.w.512(i8*, <32 x i16>, i32) + +define void@test_int_x86_avx512_mask_storeu_w_512(i8* %ptr1, i8* %ptr2, <32 x i16> %x1, i32 %x2) { +; AVX512BW-LABEL: test_int_x86_avx512_mask_storeu_w_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %edx, %k1 +; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1} +; 
AVX512BW-NEXT: vmovdqu16 %zmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_storeu_w_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX512F-32-NEXT: vmovdqu16 %zmm0, (%ecx) {%k1} +; AVX512F-32-NEXT: vmovdqu16 %zmm0, (%eax) +; AVX512F-32-NEXT: retl + call void @llvm.x86.avx512.mask.storeu.w.512(i8* %ptr1, <32 x i16> %x1, i32 %x2) + call void @llvm.x86.avx512.mask.storeu.w.512(i8* %ptr2, <32 x i16> %x1, i32 -1) + ret void +} Index: test/CodeGen/X86/avx512bwvl-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512bwvl-intrinsics.ll +++ test/CodeGen/X86/avx512bwvl-intrinsics.ll @@ -5107,3 +5107,59 @@ %res4 = add <16 x i16> %res3, %res2 ret <16 x i16> %res4 } + +declare void @llvm.x86.avx512.mask.storeu.b.128(i8*, <16 x i8>, i16) + +define void@test_int_x86_avx512_mask_storeu_b_128(i8* %ptr1, i8* %ptr2, <16 x i8> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_storeu_b_128: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edx, %k1 +; CHECK-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} +; CHECK-NEXT: vmovdqu8 %xmm0, (%rsi) +; CHECK-NEXT: retq + call void @llvm.x86.avx512.mask.storeu.b.128(i8* %ptr1, <16 x i8> %x1, i16 %x2) + call void @llvm.x86.avx512.mask.storeu.b.128(i8* %ptr2, <16 x i8> %x1, i16 -1) + ret void +} + +declare void @llvm.x86.avx512.mask.storeu.b.256(i8*, <32 x i8>, i32) + +define void@test_int_x86_avx512_mask_storeu_b_256(i8* %ptr1, i8* %ptr2, <32 x i8> %x1, i32 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_storeu_b_256: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovd %edx, %k1 +; CHECK-NEXT: vmovdqu8 %ymm0, (%rdi) {%k1} +; CHECK-NEXT: vmovdqu8 %ymm0, (%rsi) +; CHECK-NEXT: retq + call void @llvm.x86.avx512.mask.storeu.b.256(i8* %ptr1, <32 x i8> %x1, i32 %x2) + call void @llvm.x86.avx512.mask.storeu.b.256(i8* %ptr2, <32 x i8> %x1, i32 -1) + ret void +} + 
+declare void @llvm.x86.avx512.mask.storeu.w.128(i8*, <8 x i16>, i8) + +define void@test_int_x86_avx512_mask_storeu_w_128(i8* %ptr1, i8* %ptr2, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_storeu_w_128: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edx, %k1 +; CHECK-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} +; CHECK-NEXT: vmovdqu16 %xmm0, (%rsi) +; CHECK-NEXT: retq + call void @llvm.x86.avx512.mask.storeu.w.128(i8* %ptr1, <8 x i16> %x1, i8 %x2) + call void @llvm.x86.avx512.mask.storeu.w.128(i8* %ptr2, <8 x i16> %x1, i8 -1) + ret void +} + +declare void @llvm.x86.avx512.mask.storeu.w.256(i8*, <16 x i16>, i16) + +define void@test_int_x86_avx512_mask_storeu_w_256(i8* %ptr1, i8* %ptr2, <16 x i16> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_storeu_w_256: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edx, %k1 +; CHECK-NEXT: vmovdqu16 %ymm0, (%rdi) {%k1} +; CHECK-NEXT: vmovdqu16 %ymm0, (%rsi) +; CHECK-NEXT: retq + call void @llvm.x86.avx512.mask.storeu.w.256(i8* %ptr1, <16 x i16> %x1, i16 %x2) + call void @llvm.x86.avx512.mask.storeu.w.256(i8* %ptr2, <16 x i16> %x1, i16 -1) + ret void +} Index: test/CodeGen/X86/avx512vl-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512vl-intrinsics.ll +++ test/CodeGen/X86/avx512vl-intrinsics.ll @@ -7466,3 +7466,227 @@ %res4 = add <8 x i32> %res3, %res2 ret <8 x i32> %res4 } + +declare void @llvm.x86.avx512.mask.store.pd.128(i8*, <2 x double>, i8) + +define void@test_int_x86_avx512_mask_store_pd_128(i8* %ptr1, i8* %ptr2, <2 x double> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_store_pd_128: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edx, %k1 +; CHECK-NEXT: vmovapd %xmm0, (%rdi) {%k1} +; CHECK-NEXT: vmovapd %xmm0, (%rsi) +; CHECK-NEXT: retq + call void @llvm.x86.avx512.mask.store.pd.128(i8* %ptr1, <2 x double> %x1, i8 %x2) + call void @llvm.x86.avx512.mask.store.pd.128(i8* %ptr2, <2 x double> %x1, i8 -1) + ret void +} + +declare void 
@llvm.x86.avx512.mask.store.pd.256(i8*, <4 x double>, i8) + +define void@test_int_x86_avx512_mask_store_pd_256(i8* %ptr1, i8* %ptr2, <4 x double> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_store_pd_256: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edx, %k1 +; CHECK-NEXT: vmovapd %ymm0, (%rdi) {%k1} +; CHECK-NEXT: vmovapd %ymm0, (%rsi) +; CHECK-NEXT: retq + call void @llvm.x86.avx512.mask.store.pd.256(i8* %ptr1, <4 x double> %x1, i8 %x2) + call void @llvm.x86.avx512.mask.store.pd.256(i8* %ptr2, <4 x double> %x1, i8 -1) + ret void +} + +declare void @llvm.x86.avx512.mask.storeu.pd.128(i8*, <2 x double>, i8) + +define void@test_int_x86_avx512_mask_storeu_pd_128(i8* %ptr1, i8* %ptr2, <2 x double> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_storeu_pd_128: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edx, %k1 +; CHECK-NEXT: vmovupd %xmm0, (%rdi) {%k1} +; CHECK-NEXT: vmovupd %xmm0, (%rsi) +; CHECK-NEXT: retq + call void @llvm.x86.avx512.mask.storeu.pd.128(i8* %ptr1, <2 x double> %x1, i8 %x2) + call void @llvm.x86.avx512.mask.storeu.pd.128(i8* %ptr2, <2 x double> %x1, i8 -1) + ret void +} + +declare void @llvm.x86.avx512.mask.storeu.pd.256(i8*, <4 x double>, i8) + +define void@test_int_x86_avx512_mask_storeu_pd_256(i8* %ptr1, i8* %ptr2, <4 x double> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_storeu_pd_256: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edx, %k1 +; CHECK-NEXT: vmovupd %ymm0, (%rdi) {%k1} +; CHECK-NEXT: vmovupd %ymm0, (%rsi) +; CHECK-NEXT: retq + call void @llvm.x86.avx512.mask.storeu.pd.256(i8* %ptr1, <4 x double> %x1, i8 %x2) + call void @llvm.x86.avx512.mask.storeu.pd.256(i8* %ptr2, <4 x double> %x1, i8 -1) + ret void +} + +declare void @llvm.x86.avx512.mask.store.ps.128(i8*, <4 x float>, i8) + +define void@test_int_x86_avx512_mask_store_ps_128(i8* %ptr1, i8* %ptr2, <4 x float> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_store_ps_128: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edx, %k1 +; CHECK-NEXT: vmovaps %xmm0, (%rdi) 
{%k1} +; CHECK-NEXT: vmovaps %xmm0, (%rsi) +; CHECK-NEXT: retq + call void @llvm.x86.avx512.mask.store.ps.128(i8* %ptr1, <4 x float> %x1, i8 %x2) + call void @llvm.x86.avx512.mask.store.ps.128(i8* %ptr2, <4 x float> %x1, i8 -1) + ret void +} + +declare void @llvm.x86.avx512.mask.store.ps.256(i8*, <8 x float>, i8) + +define void@test_int_x86_avx512_mask_store_ps_256(i8* %ptr1, i8* %ptr2, <8 x float> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_store_ps_256: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edx, %k1 +; CHECK-NEXT: vmovaps %ymm0, (%rdi) {%k1} +; CHECK-NEXT: vmovaps %ymm0, (%rsi) +; CHECK-NEXT: retq + call void @llvm.x86.avx512.mask.store.ps.256(i8* %ptr1, <8 x float> %x1, i8 %x2) + call void @llvm.x86.avx512.mask.store.ps.256(i8* %ptr2, <8 x float> %x1, i8 -1) + ret void +} + +declare void @llvm.x86.avx512.mask.storeu.ps.128(i8*, <4 x float>, i8) + +define void@test_int_x86_avx512_mask_storeu_ps_128(i8* %ptr1, i8* %ptr2, <4 x float> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_storeu_ps_128: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edx, %k1 +; CHECK-NEXT: vmovups %xmm0, (%rdi) {%k1} +; CHECK-NEXT: vmovups %xmm0, (%rsi) +; CHECK-NEXT: retq + call void @llvm.x86.avx512.mask.storeu.ps.128(i8* %ptr1, <4 x float> %x1, i8 %x2) + call void @llvm.x86.avx512.mask.storeu.ps.128(i8* %ptr2, <4 x float> %x1, i8 -1) + ret void +} + +declare void @llvm.x86.avx512.mask.storeu.ps.256(i8*, <8 x float>, i8) + +define void@test_int_x86_avx512_mask_storeu_ps_256(i8* %ptr1, i8* %ptr2, <8 x float> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_storeu_ps_256: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edx, %k1 +; CHECK-NEXT: vmovups %ymm0, (%rdi) {%k1} +; CHECK-NEXT: vmovups %ymm0, (%rsi) +; CHECK-NEXT: retq + call void @llvm.x86.avx512.mask.storeu.ps.256(i8* %ptr1, <8 x float> %x1, i8 %x2) + call void @llvm.x86.avx512.mask.storeu.ps.256(i8* %ptr2, <8 x float> %x1, i8 -1) + ret void +} + +declare void @llvm.x86.avx512.mask.storeu.q.128(i8*, <2 x i64>, i8) 
+ +define void@test_int_x86_avx512_mask_storeu_q_128(i8* %ptr1, i8* %ptr2, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_storeu_q_128: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edx, %k1 +; CHECK-NEXT: vmovdqu64 %xmm0, (%rdi) {%k1} +; CHECK-NEXT: vmovdqu64 %xmm0, (%rsi) +; CHECK-NEXT: retq + call void @llvm.x86.avx512.mask.storeu.q.128(i8* %ptr1, <2 x i64> %x1, i8 %x2) + call void @llvm.x86.avx512.mask.storeu.q.128(i8* %ptr2, <2 x i64> %x1, i8 -1) + ret void +} + +declare void @llvm.x86.avx512.mask.storeu.q.256(i8*, <4 x i64>, i8) + +define void@test_int_x86_avx512_mask_storeu_q_256(i8* %ptr1, i8* %ptr2, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_storeu_q_256: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edx, %k1 +; CHECK-NEXT: vmovdqu64 %ymm0, (%rdi) {%k1} +; CHECK-NEXT: vmovdqu64 %ymm0, (%rsi) +; CHECK-NEXT: retq + call void @llvm.x86.avx512.mask.storeu.q.256(i8* %ptr1, <4 x i64> %x1, i8 %x2) + call void @llvm.x86.avx512.mask.storeu.q.256(i8* %ptr2, <4 x i64> %x1, i8 -1) + ret void +} + +declare void @llvm.x86.avx512.mask.storeu.d.128(i8*, <4 x i32>, i8) + +define void@test_int_x86_avx512_mask_storeu_d_128(i8* %ptr1, i8* %ptr2, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_storeu_d_128: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edx, %k1 +; CHECK-NEXT: vmovdqu32 %xmm0, (%rdi) {%k1} +; CHECK-NEXT: vmovdqu32 %xmm0, (%rsi) +; CHECK-NEXT: retq + call void @llvm.x86.avx512.mask.storeu.d.128(i8* %ptr1, <4 x i32> %x1, i8 %x2) + call void @llvm.x86.avx512.mask.storeu.d.128(i8* %ptr2, <4 x i32> %x1, i8 -1) + ret void +} + +declare void @llvm.x86.avx512.mask.storeu.d.256(i8*, <8 x i32>, i8) + +define void@test_int_x86_avx512_mask_storeu_d_256(i8* %ptr1, i8* %ptr2, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_storeu_d_256: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edx, %k1 +; CHECK-NEXT: vmovdqu32 %ymm0, (%rdi) {%k1} +; CHECK-NEXT: vmovdqu32 %ymm0, (%rsi) +; CHECK-NEXT: retq + call void 
@llvm.x86.avx512.mask.storeu.d.256(i8* %ptr1, <8 x i32> %x1, i8 %x2) + call void @llvm.x86.avx512.mask.storeu.d.256(i8* %ptr2, <8 x i32> %x1, i8 -1) + ret void +} + +declare void @llvm.x86.avx512.mask.store.q.128(i8*, <2 x i64>, i8) + +define void@test_int_x86_avx512_mask_store_q_128(i8* %ptr1, i8* %ptr2, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_store_q_128: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edx, %k1 +; CHECK-NEXT: vmovdqa64 %xmm0, (%rdi) {%k1} +; CHECK-NEXT: vmovdqa64 %xmm0, (%rsi) +; CHECK-NEXT: retq + call void @llvm.x86.avx512.mask.store.q.128(i8* %ptr1, <2 x i64> %x1, i8 %x2) + call void @llvm.x86.avx512.mask.store.q.128(i8* %ptr2, <2 x i64> %x1, i8 -1) + ret void +} + +declare void @llvm.x86.avx512.mask.store.q.256(i8*, <4 x i64>, i8) + +define void@test_int_x86_avx512_mask_store_q_256(i8* %ptr1, i8* %ptr2, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_store_q_256: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edx, %k1 +; CHECK-NEXT: vmovdqa64 %ymm0, (%rdi) {%k1} +; CHECK-NEXT: vmovdqa64 %ymm0, (%rsi) +; CHECK-NEXT: retq + call void @llvm.x86.avx512.mask.store.q.256(i8* %ptr1, <4 x i64> %x1, i8 %x2) + call void @llvm.x86.avx512.mask.store.q.256(i8* %ptr2, <4 x i64> %x1, i8 -1) + ret void +} + +declare void @llvm.x86.avx512.mask.store.d.128(i8*, <4 x i32>, i8) + +define void@test_int_x86_avx512_mask_store_d_128(i8* %ptr1, i8* %ptr2, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_store_d_128: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edx, %k1 +; CHECK-NEXT: vmovdqa32 %xmm0, (%rdi) {%k1} +; CHECK-NEXT: vmovdqa32 %xmm0, (%rsi) +; CHECK-NEXT: retq + call void @llvm.x86.avx512.mask.store.d.128(i8* %ptr1, <4 x i32> %x1, i8 %x2) + call void @llvm.x86.avx512.mask.store.d.128(i8* %ptr2, <4 x i32> %x1, i8 -1) + ret void +} + +declare void @llvm.x86.avx512.mask.store.d.256(i8*, <8 x i32>, i8) + +define void@test_int_x86_avx512_mask_store_d_256(i8* %ptr1, i8* %ptr2, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: 
test_int_x86_avx512_mask_store_d_256: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edx, %k1 +; CHECK-NEXT: vmovdqa32 %ymm0, (%rdi) {%k1} +; CHECK-NEXT: vmovdqa32 %ymm0, (%rsi) +; CHECK-NEXT: retq + call void @llvm.x86.avx512.mask.store.d.256(i8* %ptr1, <8 x i32> %x1, i8 %x2) + call void @llvm.x86.avx512.mask.store.d.256(i8* %ptr2, <8 x i32> %x1, i8 -1) + ret void +}