Index: include/llvm/IR/IntrinsicsX86.td
===================================================================
--- include/llvm/IR/IntrinsicsX86.td
+++ include/llvm/IR/IntrinsicsX86.td
@@ -1890,25 +1890,69 @@
   def int_x86_avx_maskload_ps_256 : GCCBuiltin<"__builtin_ia32_maskloadps256">,
         Intrinsic<[llvm_v8f32_ty], [llvm_ptr_ty, llvm_v8i32_ty],
                   [IntrReadArgMem]>;
 
-  def int_x86_avx512_mask_loadu_ps_512 : GCCBuiltin<"__builtin_ia32_loadups512_mask">,
-        Intrinsic<[llvm_v16f32_ty], [llvm_ptr_ty, llvm_v16f32_ty, llvm_i16_ty],
-                  [IntrReadArgMem]>;
-  def int_x86_avx512_mask_loadu_pd_512 : GCCBuiltin<"__builtin_ia32_loadupd512_mask">,
-        Intrinsic<[llvm_v8f64_ty], [llvm_ptr_ty, llvm_v8f64_ty, llvm_i8_ty],
-                  [IntrReadArgMem]>;
-  def int_x86_avx512_mask_load_ps_512 : GCCBuiltin<"__builtin_ia32_loadaps512_mask">,
-        Intrinsic<[llvm_v16f32_ty], [llvm_ptr_ty, llvm_v16f32_ty, llvm_i16_ty],
-                  [IntrReadArgMem]>;
-  def int_x86_avx512_mask_load_pd_512 : GCCBuiltin<"__builtin_ia32_loadapd512_mask">,
-        Intrinsic<[llvm_v8f64_ty], [llvm_ptr_ty, llvm_v8f64_ty, llvm_i8_ty],
-                  [IntrReadArgMem]>;
-  def int_x86_avx512_mask_move_ss : GCCBuiltin<"__builtin_ia32_movss_mask">,
-        Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
-                  [IntrNoMem]>;
-  def int_x86_avx512_mask_move_sd : GCCBuiltin<"__builtin_ia32_movsd_mask">,
-        Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
-                  [IntrNoMem]>;
+  def int_x86_avx512_mask_loadu_ps_128 :
+          GCCBuiltin<"__builtin_ia32_loadups128_mask">,
+          Intrinsic<[llvm_v4f32_ty],
+                    [llvm_ptr_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrReadArgMem]>;
+  def int_x86_avx512_mask_loadu_ps_256 :
+          GCCBuiltin<"__builtin_ia32_loadups256_mask">,
+          Intrinsic<[llvm_v8f32_ty],
+                    [llvm_ptr_ty, llvm_v8f32_ty, llvm_i8_ty], [IntrReadArgMem]>;
+  def int_x86_avx512_mask_loadu_ps_512 :
+          GCCBuiltin<"__builtin_ia32_loadups512_mask">,
+          Intrinsic<[llvm_v16f32_ty],
+                    [llvm_ptr_ty, llvm_v16f32_ty, llvm_i16_ty], [IntrReadArgMem]>;
+
+  def int_x86_avx512_mask_loadu_pd_128 :
+          GCCBuiltin<"__builtin_ia32_loadupd128_mask">,
+          Intrinsic<[llvm_v2f64_ty],
+                    [llvm_ptr_ty, llvm_v2f64_ty, llvm_i8_ty], [IntrReadArgMem]>;
+  def int_x86_avx512_mask_loadu_pd_256 :
+          GCCBuiltin<"__builtin_ia32_loadupd256_mask">,
+          Intrinsic<[llvm_v4f64_ty],
+                    [llvm_ptr_ty, llvm_v4f64_ty, llvm_i8_ty], [IntrReadArgMem]>;
+  def int_x86_avx512_mask_loadu_pd_512 :
+          GCCBuiltin<"__builtin_ia32_loadupd512_mask">,
+          Intrinsic<[llvm_v8f64_ty],
+                    [llvm_ptr_ty, llvm_v8f64_ty, llvm_i8_ty], [IntrReadArgMem]>;
+
+  def int_x86_avx512_mask_load_ps_128 :
+          GCCBuiltin<"__builtin_ia32_loadaps128_mask">,
+          Intrinsic<[llvm_v4f32_ty],
+                    [llvm_ptr_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrReadArgMem]>;
+  def int_x86_avx512_mask_load_ps_256 :
+          GCCBuiltin<"__builtin_ia32_loadaps256_mask">,
+          Intrinsic<[llvm_v8f32_ty],
+                    [llvm_ptr_ty, llvm_v8f32_ty, llvm_i8_ty], [IntrReadArgMem]>;
+  def int_x86_avx512_mask_load_ps_512 :
+          GCCBuiltin<"__builtin_ia32_loadaps512_mask">,
+          Intrinsic<[llvm_v16f32_ty],
+                    [llvm_ptr_ty, llvm_v16f32_ty, llvm_i16_ty], [IntrReadArgMem]>;
+
+  def int_x86_avx512_mask_load_pd_128 :
+          GCCBuiltin<"__builtin_ia32_loadapd128_mask">,
+          Intrinsic<[llvm_v2f64_ty],
+                    [llvm_ptr_ty, llvm_v2f64_ty, llvm_i8_ty], [IntrReadArgMem]>;
+  def int_x86_avx512_mask_load_pd_256 :
+          GCCBuiltin<"__builtin_ia32_loadapd256_mask">,
+          Intrinsic<[llvm_v4f64_ty],
+                    [llvm_ptr_ty, llvm_v4f64_ty, llvm_i8_ty], [IntrReadArgMem]>;
+  def int_x86_avx512_mask_load_pd_512 :
+          GCCBuiltin<"__builtin_ia32_loadapd512_mask">,
+          Intrinsic<[llvm_v8f64_ty],
+                    [llvm_ptr_ty, llvm_v8f64_ty, llvm_i8_ty], [IntrReadArgMem]>;
+
+  def int_x86_avx512_mask_move_ss :
+          GCCBuiltin<"__builtin_ia32_movss_mask">,
+          Intrinsic<[llvm_v4f32_ty],
+                    [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
+                    [IntrNoMem]>;
+  def int_x86_avx512_mask_move_sd :
+          GCCBuiltin<"__builtin_ia32_movsd_mask">,
+          Intrinsic<[llvm_v2f64_ty],
+                    [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
+                    [IntrNoMem]>;
 }
 
 // Conditional store ops
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -837,6 +837,13 @@
     /// from i32 to i8 but not from i32 to i16.
     bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
 
+    /// Given an intrinsic, checks if on the target the intrinsic will need to map
+    /// to a MemIntrinsicNode (touches memory). If this is the case, it returns
+    /// true and stores the intrinsic information into the IntrinsicInfo that was
+    /// passed to the function.
+    bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
+                            unsigned Intrinsic) const override;
+
     /// Returns true if the target can instruction select the
     /// specified FP immediate natively. If false, the legalizer will
     /// materialize the FP immediate as a load from a constant pool.
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -4159,6 +4159,35 @@
   }
 }
 
+
+bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
+                                           const CallInst &I,
+                                           unsigned Intrinsic) const {
+
+  const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
+  if (!IntrData)
+    return false;
+
+  switch (IntrData->Type) {
+  case LOADA:
+  case LOADU: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::getVT(I.getType());
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.align = (IntrData->Type == LOADA ? Info.memVT.getSizeInBits()/8 : 1);
+    Info.vol = false;
+    Info.readMem = true;
+    Info.writeMem = false;
+    return true;
+  }
+  default:
+    break;
+  }
+
+  return false;
+}
+
 /// Returns true if the target can instruction select the
 /// specified FP immediate natively. If false, the legalizer will
 /// materialize the FP immediate as a load from a constant pool.
@@ -17526,7 +17555,6 @@
     return DAG.getMergeValues(Results, dl);
   }
   case COMPRESS_TO_MEM: {
-    SDLoc dl(Op);
     SDValue Mask = Op.getOperand(4);
     SDValue DataToCompress = Op.getOperand(3);
     SDValue Addr = Op.getOperand(2);
@@ -17552,7 +17580,6 @@
   case TRUNCATE_TO_MEM_VI32:
     return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i32);
   case EXPAND_FROM_MEM: {
-    SDLoc dl(Op);
     SDValue Mask = Op.getOperand(4);
     SDValue PassThru = Op.getOperand(3);
     SDValue Addr = Op.getOperand(2);
@@ -17572,6 +17599,23 @@
                                       Mask, PassThru, Subtarget, DAG), Chain};
     return DAG.getMergeValues(Results, dl);
   }
+  case LOADU:
+  case LOADA: {
+    SDValue Mask = Op.getOperand(4);
+    SDValue PassThru = Op.getOperand(3);
+    SDValue Addr = Op.getOperand(2);
+    SDValue Chain = Op.getOperand(0);
+    MVT VT = Op.getSimpleValueType();
+    MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
+
+    if (isAllOnesConstant(Mask)) // return just a load
+      return DAG.getLoad(VT, dl, Chain, Addr, MMO);
+
+    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+    SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+    return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT, MMO,
+                             ISD::NON_EXTLOAD);
+  }
   }
 }
 
Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td
+++ lib/Target/X86/X86InstrAVX512.td
@@ -2707,30 +2707,6 @@
                avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512>,
                PD, VEX_W, EVEX_CD8<64, CD8VF>;
 
-def: Pat<(v8f64 (int_x86_avx512_mask_loadu_pd_512 addr:$ptr,
-                 (bc_v8f64 (v16i32 immAllZerosV)), GR8:$mask)),
-       (VMOVUPDZrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>;
-
-def: Pat<(v16f32 (int_x86_avx512_mask_loadu_ps_512 addr:$ptr,
-                 (bc_v16f32 (v16i32 immAllZerosV)), GR16:$mask)),
-       (VMOVUPSZrmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>;
-
-def: Pat<(v8f64 (int_x86_avx512_mask_load_pd_512 addr:$ptr,
-                 (bc_v8f64 (v16i32 immAllZerosV)), GR8:$mask)),
-       (VMOVAPDZrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>;
-
-def: Pat<(v16f32 (int_x86_avx512_mask_load_ps_512 addr:$ptr,
-                 (bc_v16f32 (v16i32 immAllZerosV)), GR16:$mask)),
-       (VMOVAPSZrmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>;
-
-def: Pat<(v8f64 (int_x86_avx512_mask_load_pd_512 addr:$ptr,
-                 (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))),
-       (VMOVAPDZrm addr:$ptr)>;
-
-def: Pat<(v16f32 (int_x86_avx512_mask_load_ps_512 addr:$ptr,
-                 (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1))),
-       (VMOVAPSZrm addr:$ptr)>;
-
 def: Pat<(int_x86_avx512_mask_storeu_ps_512 addr:$ptr, (v16f32 VR512:$src),
          GR16:$mask),
          (VMOVUPSZmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)),
Index: lib/Target/X86/X86IntrinsicsInfo.h
===================================================================
--- lib/Target/X86/X86IntrinsicsInfo.h
+++ lib/Target/X86/X86IntrinsicsInfo.h
@@ -29,7 +29,7 @@
   INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK_RM,
   COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, BRCST_SUBVEC_TO_VEC,
   TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
-  EXPAND_FROM_MEM, BLEND, INSERT_SUBVEC,
+  EXPAND_FROM_MEM, LOADA, LOADU, BLEND, INSERT_SUBVEC,
   TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK,
   CONVERT_MASK_TO_VEC, CONVERT_TO_MASK
 };
@@ -143,6 +143,18 @@
                      EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
   X86_INTRINSIC_DATA(avx512_mask_expand_load_q_512,
                      EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_load_pd_128, LOADA, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_load_pd_256, LOADA, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_load_pd_512, LOADA, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_load_ps_128, LOADA, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_load_ps_256, LOADA, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_load_ps_512, LOADA, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_loadu_pd_128, LOADU, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_loadu_pd_256, LOADU, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_loadu_pd_512, LOADU, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_loadu_ps_128, LOADU, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_loadu_ps_256, LOADU, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_loadu_ps_512, LOADU, ISD::DELETED_NODE, 0),
   X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8,
                      X86ISD::VTRUNC, 0),
   X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_256, TRUNCATE_TO_MEM_VI8,
Index: test/CodeGen/X86/avx512-intrinsics.ll
===================================================================
--- test/CodeGen/X86/avx512-intrinsics.ll
+++ test/CodeGen/X86/avx512-intrinsics.ll
@@ -907,49 +907,79 @@
 
 declare void @llvm.x86.avx512.mask.store.pd.512(i8*, <8 x double>, i8)
 
-define <16 x float> @test_maskz_load_aligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
-; CHECK-LABEL: test_maskz_load_aligned_ps:
+define <16 x float> @test_mask_load_aligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
+; CHECK-LABEL: test_mask_load_aligned_ps:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovaps (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: vmovaps (%rdi), %zmm0
+; CHECK-NEXT: vmovaps (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: vmovaps (%rdi), %zmm1 {%k1} {z}
+; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT: retq
-  %res = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 %mask)
-  ret <16 x float> %res
+  %res = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 -1)
+  %res1 = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> %res, i16 %mask)
+  %res2 = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 %mask)
+  %res4 = fadd <16 x float> %res2, %res1
+  ret <16 x float> %res4
 }
 
 declare <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8*, <16 x float>, i16)
 
-define <8 x double> @test_maskz_load_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
-; CHECK-LABEL: test_maskz_load_aligned_pd:
+define <16 x float> @test_mask_load_unaligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_ps:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovapd (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: vmovups (%rdi), %zmm0
+; CHECK-NEXT: vmovups (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: vmovups (%rdi), %zmm1 {%k1} {z}
+; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT: retq
-  %res = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 %mask)
-  ret <8 x double> %res
+  %res = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 -1)
+  %res1 = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> %res, i16 %mask)
+  %res2 = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 %mask)
+  %res4 = fadd <16 x float> %res2, %res1
+  ret <16 x float> %res4
 }
 
-declare <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8*, <8 x double>, i8)
+declare <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8*, <16 x float>, i16)
 
-define <16 x float> @test_load_aligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
-; CHECK-LABEL: test_load_aligned_ps:
+define <8 x double> @test_mask_load_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_aligned_pd:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: vmovaps (%rdi), %zmm0
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovapd (%rdi), %zmm0
+; CHECK-NEXT: vmovapd (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: vmovapd (%rdi), %zmm1 {%k1} {z}
+; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT: retq
-  %res = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 -1)
-  ret <16 x float> %res
+  %res = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 -1)
+  %res1 = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> %res, i8 %mask)
+  %res2 = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 %mask)
+  %res4 = fadd <8 x double> %res2, %res1
+  ret <8 x double> %res4
 }
 
-define <8 x double> @test_load_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
-; CHECK-LABEL: test_load_aligned_pd:
+declare <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8*, <8 x double>, i8)
+
+define <8 x double> @test_mask_load_unaligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_pd:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: vmovapd (%rdi), %zmm0
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovupd (%rdi), %zmm0
+; CHECK-NEXT: vmovupd (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: vmovupd (%rdi), %zmm1 {%k1} {z}
+; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT: retq
-  %res = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 -1)
-  ret <8 x double> %res
+  %res = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 -1)
+  %res1 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> %res, i8 %mask)
+  %res2 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 %mask)
+  %res4 = fadd <8 x double> %res2, %res1
+  ret <8 x double> %res4
 }
 
-declare <8 x i64> @llvm.x86.avx512.movntdqa(i8*)
+declare <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8*, <8 x double>, i8)
 
 define <8 x i64> @test_valign_q(<8 x i64> %a, <8 x i64> %b) {
 ; CHECK-LABEL: test_valign_q:
Index: test/CodeGen/X86/avx512vl-intrinsics.ll
===================================================================
--- test/CodeGen/X86/avx512vl-intrinsics.ll
+++ test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -6388,6 +6388,158 @@
   ret <4 x i64> %res4
 }
 
+define <8 x float> @test_mask_load_aligned_ps_256(<8 x float> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_aligned_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps (%rdi), %ymm0
+; CHECK-NEXT: vmovaps (%rdi), %ymm0 {%k1}
+; CHECK-NEXT: vmovaps (%rdi), %ymm1 {%k1} {z}
+; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+  %res = call <8 x float> @llvm.x86.avx512.mask.load.ps.256(i8* %ptr, <8 x float> zeroinitializer, i8 -1)
+  %res1 = call <8 x float> @llvm.x86.avx512.mask.load.ps.256(i8* %ptr, <8 x float> %res, i8 %mask)
+  %res2 = call <8 x float> @llvm.x86.avx512.mask.load.ps.256(i8* %ptr, <8 x float> zeroinitializer, i8 %mask)
+  %res4 = fadd <8 x float> %res2, %res1
+  ret <8 x float> %res4
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.load.ps.256(i8*, <8 x float>, i8)
+
+define <8 x float> @test_mask_load_unaligned_ps_256(<8 x float> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovups (%rdi), %ymm0
+; CHECK-NEXT: vmovups (%rdi), %ymm0 {%k1}
+; CHECK-NEXT: vmovups (%rdi), %ymm1 {%k1} {z}
+; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+  %res = call <8 x float> @llvm.x86.avx512.mask.loadu.ps.256(i8* %ptr, <8 x float> zeroinitializer, i8 -1)
+  %res1 = call <8 x float> @llvm.x86.avx512.mask.loadu.ps.256(i8* %ptr, <8 x float> %res, i8 %mask)
+  %res2 = call <8 x float> @llvm.x86.avx512.mask.loadu.ps.256(i8* %ptr, <8 x float> zeroinitializer, i8 %mask)
+  %res4 = fadd <8 x float> %res2, %res1
+  ret <8 x float> %res4
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.loadu.ps.256(i8*, <8 x float>, i8)
+
+define <4 x double> @test_mask_load_aligned_pd_256(<4 x double> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_aligned_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovapd (%rdi), %ymm0
+; CHECK-NEXT: vmovapd (%rdi), %ymm0 {%k1}
+; CHECK-NEXT: vmovapd (%rdi), %ymm1 {%k1} {z}
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+  %res = call <4 x double> @llvm.x86.avx512.mask.load.pd.256(i8* %ptr, <4 x double> zeroinitializer, i8 -1)
+  %res1 = call <4 x double> @llvm.x86.avx512.mask.load.pd.256(i8* %ptr, <4 x double> %res, i8 %mask)
+  %res2 = call <4 x double> @llvm.x86.avx512.mask.load.pd.256(i8* %ptr, <4 x double> zeroinitializer, i8 %mask)
+  %res4 = fadd <4 x double> %res2, %res1
+  ret <4 x double> %res4
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.load.pd.256(i8*, <4 x double>, i8)
+
+define <4 x double> @test_mask_load_unaligned_pd_256(<4 x double> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovupd (%rdi), %ymm0
+; CHECK-NEXT: vmovupd (%rdi), %ymm0 {%k1}
+; CHECK-NEXT: vmovupd (%rdi), %ymm1 {%k1} {z}
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+  %res = call <4 x double> @llvm.x86.avx512.mask.loadu.pd.256(i8* %ptr, <4 x double> zeroinitializer, i8 -1)
+  %res1 = call <4 x double> @llvm.x86.avx512.mask.loadu.pd.256(i8* %ptr, <4 x double> %res, i8 %mask)
+  %res2 = call <4 x double> @llvm.x86.avx512.mask.loadu.pd.256(i8* %ptr, <4 x double> zeroinitializer, i8 %mask)
+  %res4 = fadd <4 x double> %res2, %res1
+  ret <4 x double> %res4
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.loadu.pd.256(i8*, <4 x double>, i8)
+
+define <4 x float> @test_mask_load_aligned_ps_128(<4 x float> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_aligned_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps (%rdi), %xmm0
+; CHECK-NEXT: vmovaps (%rdi), %xmm0 {%k1}
+; CHECK-NEXT: vmovaps (%rdi), %xmm1 {%k1} {z}
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+  %res = call <4 x float> @llvm.x86.avx512.mask.load.ps.128(i8* %ptr, <4 x float> zeroinitializer, i8 -1)
+  %res1 = call <4 x float> @llvm.x86.avx512.mask.load.ps.128(i8* %ptr, <4 x float> %res, i8 %mask)
+  %res2 = call <4 x float> @llvm.x86.avx512.mask.load.ps.128(i8* %ptr, <4 x float> zeroinitializer, i8 %mask)
+  %res4 = fadd <4 x float> %res2, %res1
+  ret <4 x float> %res4
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.load.ps.128(i8*, <4 x float>, i8)
+
+define <4 x float> @test_mask_load_unaligned_ps_128(<4 x float> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovups (%rdi), %xmm0
+; CHECK-NEXT: vmovups (%rdi), %xmm0 {%k1}
+; CHECK-NEXT: vmovups (%rdi), %xmm1 {%k1} {z}
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+  %res = call <4 x float> @llvm.x86.avx512.mask.loadu.ps.128(i8* %ptr, <4 x float> zeroinitializer, i8 -1)
+  %res1 = call <4 x float> @llvm.x86.avx512.mask.loadu.ps.128(i8* %ptr, <4 x float> %res, i8 %mask)
+  %res2 = call <4 x float> @llvm.x86.avx512.mask.loadu.ps.128(i8* %ptr, <4 x float> zeroinitializer, i8 %mask)
+  %res4 = fadd <4 x float> %res2, %res1
+  ret <4 x float> %res4
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.loadu.ps.128(i8*, <4 x float>, i8)
+
+define <2 x double> @test_mask_load_aligned_pd_128(<2 x double> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_aligned_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovapd (%rdi), %xmm0
+; CHECK-NEXT: vmovapd (%rdi), %xmm0 {%k1}
+; CHECK-NEXT: vmovapd (%rdi), %xmm1 {%k1} {z}
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+  %res = call <2 x double> @llvm.x86.avx512.mask.load.pd.128(i8* %ptr, <2 x double> zeroinitializer, i8 -1)
+  %res1 = call <2 x double> @llvm.x86.avx512.mask.load.pd.128(i8* %ptr, <2 x double> %res, i8 %mask)
+  %res2 = call <2 x double> @llvm.x86.avx512.mask.load.pd.128(i8* %ptr, <2 x double> zeroinitializer, i8 %mask)
+  %res4 = fadd <2 x double> %res2, %res1
+  ret <2 x double> %res4
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.load.pd.128(i8*, <2 x double>, i8)
+
+define <2 x double> @test_mask_load_unaligned_pd_128(<2 x double> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovupd (%rdi), %xmm0
+; CHECK-NEXT: vmovupd (%rdi), %xmm0 {%k1}
+; CHECK-NEXT: vmovupd (%rdi), %xmm1 {%k1} {z}
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+  %res = call <2 x double> @llvm.x86.avx512.mask.loadu.pd.128(i8* %ptr, <2 x double> zeroinitializer, i8 -1)
+  %res1 = call <2 x double> @llvm.x86.avx512.mask.loadu.pd.128(i8* %ptr, <2 x double> %res, i8 %mask)
+  %res2 = call <2 x double> @llvm.x86.avx512.mask.loadu.pd.128(i8* %ptr, <2 x double> zeroinitializer, i8 %mask)
+  %res4 = fadd <2 x double> %res2, %res1
+  ret <2 x double> %res4
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.loadu.pd.128(i8*, <2 x double>, i8)
+
 declare <4 x i32> @llvm.x86.avx512.mask.psrav4.si(<4 x i32>, <4 x i32>, <4 x i32>, i8)
 
 define <4 x i32>@test_int_x86_avx512_mask_psrav4_si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
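
For context, a minimal usage sketch at the C level (not part of the diff): it assumes clang expands _mm512_mask_load_pd through the __builtin_ia32_loadapd512_mask builtin declared above, i.e. through llvm.x86.avx512.mask.load.pd.512; the function name below is illustrative only.

  // Illustrative only; assumes _mm512_mask_load_pd maps to
  // __builtin_ia32_loadapd512_mask (llvm.x86.avx512.mask.load.pd.512).
  #include <immintrin.h>

  __m512d load_low_half(const double *p, __m512d passthru) {
    // Mask 0x0F: load elements 0..3 from p, keep passthru in elements 4..7.
    // With this patch an all-ones mask selects a plain vmovapd, while a
    // partial mask selects the {%k} / {%k}{z} forms exercised by the tests.
    return _mm512_mask_load_pd(passthru, 0x0F, p);
  }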